Diffstat (limited to 'storage/xtradb')
-rw-r--r--storage/xtradb/CMakeLists.txt256
-rw-r--r--storage/xtradb/COPYING.Google30
-rw-r--r--storage/xtradb/COPYING.Percona30
-rw-r--r--storage/xtradb/COPYING.Sun_Microsystems31
-rw-r--r--storage/xtradb/ChangeLog1916
-rw-r--r--storage/xtradb/Doxyfile1419
-rw-r--r--storage/xtradb/Makefile.am345
-rw-r--r--storage/xtradb/btr/btr0btr.c3789
-rw-r--r--storage/xtradb/btr/btr0cur.c5256
-rw-r--r--storage/xtradb/btr/btr0pcur.c606
-rw-r--r--storage/xtradb/btr/btr0sea.c2032
-rw-r--r--storage/xtradb/buf/buf0buddy.c804
-rw-r--r--storage/xtradb/buf/buf0buf.c4901
-rw-r--r--storage/xtradb/buf/buf0flu.c1781
-rw-r--r--storage/xtradb/buf/buf0lru.c2580
-rw-r--r--storage/xtradb/buf/buf0rea.c764
-rw-r--r--storage/xtradb/build/debian/README.Maintainer116
-rw-r--r--storage/xtradb/build/debian/additions/Docs__Images__Makefile.in6
-rw-r--r--storage/xtradb/build/debian/additions/Docs__Makefile.in6
-rw-r--r--storage/xtradb/build/debian/additions/debian-start31
-rw-r--r--storage/xtradb/build/debian/additions/debian-start.inc.sh72
-rw-r--r--storage/xtradb/build/debian/additions/echo_stderr2
-rw-r--r--storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm1089
-rw-r--r--storage/xtradb/build/debian/additions/innotop/changelog.innotop318
-rw-r--r--storage/xtradb/build/debian/additions/innotop/innotop9485
-rw-r--r--storage/xtradb/build/debian/additions/innotop/innotop.12086
-rw-r--r--storage/xtradb/build/debian/additions/msql2mysql.116
-rw-r--r--storage/xtradb/build/debian/additions/my.cnf129
-rw-r--r--storage/xtradb/build/debian/additions/my_print_defaults.116
-rw-r--r--storage/xtradb/build/debian/additions/myisam_ftdump.116
-rw-r--r--storage/xtradb/build/debian/additions/myisamchk.117
-rw-r--r--storage/xtradb/build/debian/additions/myisamlog.116
-rw-r--r--storage/xtradb/build/debian/additions/myisampack.119
-rw-r--r--storage/xtradb/build/debian/additions/mysql-server.lintian-overrides2
-rw-r--r--storage/xtradb/build/debian/additions/mysql_config.117
-rw-r--r--storage/xtradb/build/debian/additions/mysql_convert_table_format.117
-rw-r--r--storage/xtradb/build/debian/additions/mysql_find_rows.118
-rw-r--r--storage/xtradb/build/debian/additions/mysql_fix_extensions.118
-rw-r--r--storage/xtradb/build/debian/additions/mysql_install_db.116
-rw-r--r--storage/xtradb/build/debian/additions/mysql_secure_installation.117
-rw-r--r--storage/xtradb/build/debian/additions/mysql_setpermission.123
-rw-r--r--storage/xtradb/build/debian/additions/mysql_tableinfo.1322
-rw-r--r--storage/xtradb/build/debian/additions/mysql_waitpid.120
-rw-r--r--storage/xtradb/build/debian/additions/mysqlbinlog.117
-rw-r--r--storage/xtradb/build/debian/additions/mysqlbug.114
-rw-r--r--storage/xtradb/build/debian/additions/mysqlcheck.128
-rw-r--r--storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf2
-rw-r--r--storage/xtradb/build/debian/additions/mysqldumpslow.150
-rw-r--r--storage/xtradb/build/debian/additions/mysqlimport.120
-rw-r--r--storage/xtradb/build/debian/additions/mysqlmanager.149
-rw-r--r--storage/xtradb/build/debian/additions/mysqlreport1298
-rw-r--r--storage/xtradb/build/debian/additions/mysqlreport.1180
-rw-r--r--storage/xtradb/build/debian/additions/mysqltest.116
-rw-r--r--storage/xtradb/build/debian/additions/pack_isam.119
-rw-r--r--storage/xtradb/build/debian/additions/resolve_stack_dump.116
-rw-r--r--storage/xtradb/build/debian/additions/resolveip.116
-rw-r--r--storage/xtradb/build/debian/changelog4186
-rw-r--r--storage/xtradb/build/debian/compat1
-rw-r--r--storage/xtradb/build/debian/control118
-rw-r--r--storage/xtradb/build/debian/copyright169
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer4
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs2
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs1
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples1
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client-dev.files7
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client-dev.links2
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client16.dirs1
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client16.docs1
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client16.files1
-rw-r--r--storage/xtradb/build/debian/libpercona-xtradb-client16.postinst12
-rw-r--r--storage/xtradb/build/debian/patches/00list6
-rw-r--r--storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch776
-rw-r--r--storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch776
-rw-r--r--storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch29
-rw-r--r--storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch43
-rw-r--r--storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch20
-rw-r--r--storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch24
-rw-r--r--storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch23
-rw-r--r--storage/xtradb/build/debian/patches/60_percona_support.dpatch16
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian4
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs3
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.docs3
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.files39
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.links3
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides3
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-client-5.1.menu3
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-common.dirs1
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-common.files2
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides2
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-common.postrm7
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS34
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian109
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.config46
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs9
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.docs1
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.files53
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.links2
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides4
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid9
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server32
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation32
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init182
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate27
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst277
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm83
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst186
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm8
-rw-r--r--storage/xtradb/build/debian/percona-xtradb-server-5.1.templates90
-rw-r--r--storage/xtradb/build/debian/po/POTFILES.in1
-rw-r--r--storage/xtradb/build/debian/po/ar.po267
-rw-r--r--storage/xtradb/build/debian/po/ca.po342
-rw-r--r--storage/xtradb/build/debian/po/cs.po361
-rw-r--r--storage/xtradb/build/debian/po/da.po397
-rw-r--r--storage/xtradb/build/debian/po/de.po277
-rw-r--r--storage/xtradb/build/debian/po/es.po405
-rw-r--r--storage/xtradb/build/debian/po/eu.po295
-rw-r--r--storage/xtradb/build/debian/po/fr.po274
-rw-r--r--storage/xtradb/build/debian/po/gl.po264
-rw-r--r--storage/xtradb/build/debian/po/it.po266
-rw-r--r--storage/xtradb/build/debian/po/ja.po394
-rw-r--r--storage/xtradb/build/debian/po/nb.po297
-rw-r--r--storage/xtradb/build/debian/po/nl.po302
-rw-r--r--storage/xtradb/build/debian/po/pt.po322
-rw-r--r--storage/xtradb/build/debian/po/pt_BR.po458
-rw-r--r--storage/xtradb/build/debian/po/ro.po319
-rw-r--r--storage/xtradb/build/debian/po/ru.po305
-rw-r--r--storage/xtradb/build/debian/po/sv.po400
-rw-r--r--storage/xtradb/build/debian/po/templates.pot187
-rw-r--r--storage/xtradb/build/debian/po/tr.po342
-rwxr-xr-xstorage/xtradb/build/debian/rules322
-rw-r--r--storage/xtradb/build/debian/source.lintian-overrides2
-rw-r--r--storage/xtradb/build/debian/watch3
-rw-r--r--storage/xtradb/build/percona-sql.spec1644
-rw-r--r--storage/xtradb/data/data0data.c779
-rw-r--r--storage/xtradb/data/data0type.c297
-rw-r--r--storage/xtradb/dict/dict0boot.c549
-rw-r--r--storage/xtradb/dict/dict0crea.c1733
-rw-r--r--storage/xtradb/dict/dict0dict.c5347
-rw-r--r--storage/xtradb/dict/dict0load.c1572
-rw-r--r--storage/xtradb/dict/dict0mem.c323
-rw-r--r--storage/xtradb/dyn/dyn0dyn.c65
-rw-r--r--storage/xtradb/eval/eval0eval.c852
-rw-r--r--storage/xtradb/eval/eval0proc.c295
-rw-r--r--storage/xtradb/fil/fil0fil.c5438
-rw-r--r--storage/xtradb/fsp/fsp0fsp.c4346
-rw-r--r--storage/xtradb/fut/fut0fut.c31
-rw-r--r--storage/xtradb/fut/fut0lst.c530
-rw-r--r--storage/xtradb/ha/ha0ha.c464
-rw-r--r--storage/xtradb/ha/ha0storage.c184
-rw-r--r--storage/xtradb/ha/hash0hash.c242
-rw-r--r--storage/xtradb/ha_innodb.def4
-rw-r--r--storage/xtradb/handler/ha_innodb.cc12109
-rw-r--r--storage/xtradb/handler/ha_innodb.h349
-rw-r--r--storage/xtradb/handler/handler0alter.cc1243
-rw-r--r--storage/xtradb/handler/i_s.cc4516
-rw-r--r--storage/xtradb/handler/i_s.h67
-rw-r--r--storage/xtradb/handler/innodb_patch_info.h52
-rw-r--r--storage/xtradb/handler/mysql_addons.cc42
-rw-r--r--storage/xtradb/ibuf/ibuf0ibuf.c3646
-rw-r--r--storage/xtradb/include/btr0btr.h517
-rw-r--r--storage/xtradb/include/btr0btr.ic316
-rw-r--r--storage/xtradb/include/btr0cur.h764
-rw-r--r--storage/xtradb/include/btr0cur.ic200
-rw-r--r--storage/xtradb/include/btr0pcur.h551
-rw-r--r--storage/xtradb/include/btr0pcur.ic642
-rw-r--r--storage/xtradb/include/btr0sea.h327
-rw-r--r--storage/xtradb/include/btr0sea.ic84
-rw-r--r--storage/xtradb/include/btr0types.h51
-rw-r--r--storage/xtradb/include/buf0buddy.h92
-rw-r--r--storage/xtradb/include/buf0buddy.ic143
-rw-r--r--storage/xtradb/include/buf0buf.h1574
-rw-r--r--storage/xtradb/include/buf0buf.ic1126
-rw-r--r--storage/xtradb/include/buf0flu.h218
-rw-r--r--storage/xtradb/include/buf0flu.ic155
-rw-r--r--storage/xtradb/include/buf0lru.h309
-rw-r--r--storage/xtradb/include/buf0lru.ic25
-rw-r--r--storage/xtradb/include/buf0rea.h170
-rw-r--r--storage/xtradb/include/buf0types.h83
-rw-r--r--storage/xtradb/include/data0data.h483
-rw-r--r--storage/xtradb/include/data0data.ic612
-rw-r--r--storage/xtradb/include/data0type.h486
-rw-r--r--storage/xtradb/include/data0type.ic603
-rw-r--r--storage/xtradb/include/data0types.h36
-rw-r--r--storage/xtradb/include/db0err.h111
-rw-r--r--storage/xtradb/include/dict0boot.h161
-rw-r--r--storage/xtradb/include/dict0boot.ic93
-rw-r--r--storage/xtradb/include/dict0crea.h215
-rw-r--r--storage/xtradb/include/dict0crea.ic25
-rw-r--r--storage/xtradb/include/dict0dict.h1217
-rw-r--r--storage/xtradb/include/dict0dict.ic861
-rw-r--r--storage/xtradb/include/dict0load.h117
-rw-r--r--storage/xtradb/include/dict0load.ic26
-rw-r--r--storage/xtradb/include/dict0mem.h577
-rw-r--r--storage/xtradb/include/dict0mem.ic26
-rw-r--r--storage/xtradb/include/dict0types.h48
-rw-r--r--storage/xtradb/include/dyn0dyn.h188
-rw-r--r--storage/xtradb/include/dyn0dyn.ic365
-rw-r--r--storage/xtradb/include/eval0eval.h114
-rw-r--r--storage/xtradb/include/eval0eval.ic251
-rw-r--r--storage/xtradb/include/eval0proc.h104
-rw-r--r--storage/xtradb/include/eval0proc.ic88
-rw-r--r--storage/xtradb/include/fil0fil.h766
-rw-r--r--storage/xtradb/include/fsp0fsp.h359
-rw-r--r--storage/xtradb/include/fsp0fsp.ic45
-rw-r--r--storage/xtradb/include/fsp0types.h110
-rw-r--r--storage/xtradb/include/fut0fut.h55
-rw-r--r--storage/xtradb/include/fut0fut.ic63
-rw-r--r--storage/xtradb/include/fut0lst.h217
-rw-r--r--storage/xtradb/include/fut0lst.ic167
-rw-r--r--storage/xtradb/include/ha0ha.h243
-rw-r--r--storage/xtradb/include/ha0ha.ic220
-rw-r--r--storage/xtradb/include/ha0storage.h140
-rw-r--r--storage/xtradb/include/ha0storage.ic148
-rw-r--r--storage/xtradb/include/ha_prototypes.h279
-rw-r--r--storage/xtradb/include/handler0alter.h42
-rw-r--r--storage/xtradb/include/hash0hash.h496
-rw-r--r--storage/xtradb/include/hash0hash.ic183
-rw-r--r--storage/xtradb/include/ibuf0ibuf.h383
-rw-r--r--storage/xtradb/include/ibuf0ibuf.ic327
-rw-r--r--storage/xtradb/include/ibuf0types.h31
-rw-r--r--storage/xtradb/include/lock0iter.h69
-rw-r--r--storage/xtradb/include/lock0lock.h829
-rw-r--r--storage/xtradb/include/lock0lock.ic121
-rw-r--r--storage/xtradb/include/lock0priv.h108
-rw-r--r--storage/xtradb/include/lock0priv.ic49
-rw-r--r--storage/xtradb/include/lock0types.h45
-rw-r--r--storage/xtradb/include/log0log.h969
-rw-r--r--storage/xtradb/include/log0log.ic446
-rw-r--r--storage/xtradb/include/log0recv.h530
-rw-r--r--storage/xtradb/include/log0recv.ic53
-rw-r--r--storage/xtradb/include/mach0data.h400
-rw-r--r--storage/xtradb/include/mach0data.ic783
-rw-r--r--storage/xtradb/include/mem0dbg.h150
-rw-r--r--storage/xtradb/include/mem0dbg.ic109
-rw-r--r--storage/xtradb/include/mem0mem.h402
-rw-r--r--storage/xtradb/include/mem0mem.ic640
-rw-r--r--storage/xtradb/include/mem0pool.h124
-rw-r--r--storage/xtradb/include/mem0pool.ic24
-rw-r--r--storage/xtradb/include/mtr0log.h250
-rw-r--r--storage/xtradb/include/mtr0log.ic275
-rw-r--r--storage/xtradb/include/mtr0mtr.h419
-rw-r--r--storage/xtradb/include/mtr0mtr.ic275
-rw-r--r--storage/xtradb/include/mtr0types.h31
-rw-r--r--storage/xtradb/include/mysql_addons.h33
-rw-r--r--storage/xtradb/include/os0file.h794
-rw-r--r--storage/xtradb/include/os0proc.h105
-rw-r--r--storage/xtradb/include/os0proc.ic27
-rw-r--r--storage/xtradb/include/os0sync.h445
-rw-r--r--storage/xtradb/include/os0sync.ic53
-rw-r--r--storage/xtradb/include/os0thread.h162
-rw-r--r--storage/xtradb/include/os0thread.ic25
-rw-r--r--storage/xtradb/include/page0cur.h362
-rw-r--r--storage/xtradb/include/page0cur.ic299
-rw-r--r--storage/xtradb/include/page0page.h1015
-rw-r--r--storage/xtradb/include/page0page.ic1076
-rw-r--r--storage/xtradb/include/page0types.h151
-rw-r--r--storage/xtradb/include/page0zip.h475
-rw-r--r--storage/xtradb/include/page0zip.ic397
-rw-r--r--storage/xtradb/include/pars0grm.h236
-rw-r--r--storage/xtradb/include/pars0opt.h75
-rw-r--r--storage/xtradb/include/pars0opt.ic24
-rw-r--r--storage/xtradb/include/pars0pars.h748
-rw-r--r--storage/xtradb/include/pars0pars.ic24
-rw-r--r--storage/xtradb/include/pars0sym.h244
-rw-r--r--storage/xtradb/include/pars0sym.ic24
-rw-r--r--storage/xtradb/include/pars0types.h50
-rw-r--r--storage/xtradb/include/que0que.h529
-rw-r--r--storage/xtradb/include/que0que.ic287
-rw-r--r--storage/xtradb/include/que0types.h60
-rw-r--r--storage/xtradb/include/read0read.h194
-rw-r--r--storage/xtradb/include/read0read.ic98
-rw-r--r--storage/xtradb/include/read0types.h32
-rw-r--r--storage/xtradb/include/rem0cmp.h197
-rw-r--r--storage/xtradb/include/rem0cmp.ic91
-rw-r--r--storage/xtradb/include/rem0rec.h824
-rw-r--r--storage/xtradb/include/rem0rec.ic1647
-rw-r--r--storage/xtradb/include/rem0types.h46
-rw-r--r--storage/xtradb/include/row0ext.h95
-rw-r--r--storage/xtradb/include/row0ext.ic84
-rw-r--r--storage/xtradb/include/row0ins.h156
-rw-r--r--storage/xtradb/include/row0ins.ic26
-rw-r--r--storage/xtradb/include/row0merge.h197
-rw-r--r--storage/xtradb/include/row0mysql.h807
-rw-r--r--storage/xtradb/include/row0mysql.ic24
-rw-r--r--storage/xtradb/include/row0purge.h96
-rw-r--r--storage/xtradb/include/row0purge.ic25
-rw-r--r--storage/xtradb/include/row0row.h310
-rw-r--r--storage/xtradb/include/row0row.ic120
-rw-r--r--storage/xtradb/include/row0sel.h402
-rw-r--r--storage/xtradb/include/row0sel.ic105
-rw-r--r--storage/xtradb/include/row0types.h59
-rw-r--r--storage/xtradb/include/row0uins.h54
-rw-r--r--storage/xtradb/include/row0uins.ic25
-rw-r--r--storage/xtradb/include/row0umod.h52
-rw-r--r--storage/xtradb/include/row0umod.ic24
-rw-r--r--storage/xtradb/include/row0undo.h142
-rw-r--r--storage/xtradb/include/row0undo.ic24
-rw-r--r--storage/xtradb/include/row0upd.h483
-rw-r--r--storage/xtradb/include/row0upd.ic184
-rw-r--r--storage/xtradb/include/row0vers.h142
-rw-r--r--storage/xtradb/include/row0vers.ic30
-rw-r--r--storage/xtradb/include/srv0que.h42
-rw-r--r--storage/xtradb/include/srv0srv.h733
-rw-r--r--storage/xtradb/include/srv0srv.ic24
-rw-r--r--storage/xtradb/include/srv0start.h134
-rw-r--r--storage/xtradb/include/sync0arr.h142
-rw-r--r--storage/xtradb/include/sync0arr.ic27
-rw-r--r--storage/xtradb/include/sync0rw.h588
-rw-r--r--storage/xtradb/include/sync0rw.ic624
-rw-r--r--storage/xtradb/include/sync0sync.h596
-rw-r--r--storage/xtradb/include/sync0sync.ic222
-rw-r--r--storage/xtradb/include/sync0types.h34
-rw-r--r--storage/xtradb/include/thr0loc.h101
-rw-r--r--storage/xtradb/include/thr0loc.ic24
-rw-r--r--storage/xtradb/include/trx0i_s.h247
-rw-r--r--storage/xtradb/include/trx0purge.h213
-rw-r--r--storage/xtradb/include/trx0purge.ic43
-rw-r--r--storage/xtradb/include/trx0rec.h338
-rw-r--r--storage/xtradb/include/trx0rec.ic112
-rw-r--r--storage/xtradb/include/trx0roll.h352
-rw-r--r--storage/xtradb/include/trx0roll.ic40
-rw-r--r--storage/xtradb/include/trx0rseg.h223
-rw-r--r--storage/xtradb/include/trx0rseg.ic145
-rw-r--r--storage/xtradb/include/trx0sys.h664
-rw-r--r--storage/xtradb/include/trx0sys.ic421
-rw-r--r--storage/xtradb/include/trx0trx.h849
-rw-r--r--storage/xtradb/include/trx0trx.ic164
-rw-r--r--storage/xtradb/include/trx0types.h115
-rw-r--r--storage/xtradb/include/trx0undo.h551
-rw-r--r--storage/xtradb/include/trx0undo.ic351
-rw-r--r--storage/xtradb/include/trx0xa.h70
-rw-r--r--storage/xtradb/include/univ.i501
-rw-r--r--storage/xtradb/include/usr0sess.h76
-rw-r--r--storage/xtradb/include/usr0sess.ic24
-rw-r--r--storage/xtradb/include/usr0types.h31
-rw-r--r--storage/xtradb/include/ut0auxconf.h14
-rw-r--r--storage/xtradb/include/ut0byte.h270
-rw-r--r--storage/xtradb/include/ut0byte.ic411
-rw-r--r--storage/xtradb/include/ut0dbg.h175
-rw-r--r--storage/xtradb/include/ut0list.h172
-rw-r--r--storage/xtradb/include/ut0list.ic48
-rw-r--r--storage/xtradb/include/ut0lst.h304
-rw-r--r--storage/xtradb/include/ut0mem.h307
-rw-r--r--storage/xtradb/include/ut0mem.ic338
-rw-r--r--storage/xtradb/include/ut0rbt.h309
-rw-r--r--storage/xtradb/include/ut0rnd.h150
-rw-r--r--storage/xtradb/include/ut0rnd.ic256
-rw-r--r--storage/xtradb/include/ut0sort.h106
-rw-r--r--storage/xtradb/include/ut0ut.h403
-rw-r--r--storage/xtradb/include/ut0ut.ic162
-rw-r--r--storage/xtradb/include/ut0vec.h125
-rw-r--r--storage/xtradb/include/ut0vec.ic96
-rw-r--r--storage/xtradb/include/ut0wqueue.h85
-rw-r--r--storage/xtradb/lock/lock0iter.c114
-rw-r--r--storage/xtradb/lock/lock0lock.c5773
-rw-r--r--storage/xtradb/log/log0log.c3507
-rw-r--r--storage/xtradb/log/log0recv.c3955
-rw-r--r--storage/xtradb/mach/mach0data.c134
-rw-r--r--storage/xtradb/mem/mem0dbg.c1041
-rw-r--r--storage/xtradb/mem/mem0mem.c573
-rw-r--r--storage/xtradb/mem/mem0pool.c728
-rw-r--r--storage/xtradb/mtr/mtr0log.c612
-rw-r--r--storage/xtradb/mtr/mtr0mtr.c401
-rw-r--r--storage/xtradb/os/os0file.c4604
-rw-r--r--storage/xtradb/os/os0proc.c401
-rw-r--r--storage/xtradb/os/os0sync.c762
-rw-r--r--storage/xtradb/os/os0thread.c375
-rw-r--r--storage/xtradb/page/page0cur.c2055
-rw-r--r--storage/xtradb/page/page0page.c2624
-rw-r--r--storage/xtradb/page/page0zip.c4677
-rw-r--r--storage/xtradb/pars/lexyy.c2795
-rwxr-xr-xstorage/xtradb/pars/make_bison.sh32
-rwxr-xr-xstorage/xtradb/pars/make_flex.sh48
-rw-r--r--storage/xtradb/pars/pars0grm.c2601
-rw-r--r--storage/xtradb/pars/pars0grm.y635
-rw-r--r--storage/xtradb/pars/pars0lex.l678
-rw-r--r--storage/xtradb/pars/pars0opt.c1216
-rw-r--r--storage/xtradb/pars/pars0pars.c2196
-rw-r--r--storage/xtradb/pars/pars0sym.c371
-rw-r--r--storage/xtradb/plug.in228
-rw-r--r--storage/xtradb/que/que0que.c1450
-rw-r--r--storage/xtradb/read/read0read.c540
-rw-r--r--storage/xtradb/rem/rem0cmp.c1204
-rw-r--r--storage/xtradb/rem/rem0rec.c1774
-rw-r--r--storage/xtradb/row/row0ext.c115
-rw-r--r--storage/xtradb/row/row0ins.c2533
-rw-r--r--storage/xtradb/row/row0merge.c2644
-rw-r--r--storage/xtradb/row/row0mysql.c4234
-rw-r--r--storage/xtradb/row/row0purge.c700
-rw-r--r--storage/xtradb/row/row0row.c1179
-rw-r--r--storage/xtradb/row/row0sel.c4964
-rw-r--r--storage/xtradb/row/row0uins.c361
-rw-r--r--storage/xtradb/row/row0umod.c866
-rw-r--r--storage/xtradb/row/row0undo.c393
-rw-r--r--storage/xtradb/row/row0upd.c2203
-rw-r--r--storage/xtradb/row/row0vers.c741
-rw-r--r--storage/xtradb/scripts/install_innodb_plugins.sql17
-rw-r--r--storage/xtradb/scripts/install_innodb_plugins_win.sql17
-rw-r--r--storage/xtradb/srv/srv0que.c49
-rw-r--r--storage/xtradb/srv/srv0srv.c3440
-rw-r--r--storage/xtradb/srv/srv0start.c2268
-rw-r--r--storage/xtradb/sync/sync0arr.c1023
-rw-r--r--storage/xtradb/sync/sync0rw.c1037
-rw-r--r--storage/xtradb/sync/sync0sync.c1525
-rw-r--r--storage/xtradb/thr/thr0loc.c308
-rw-r--r--storage/xtradb/trx/trx0i_s.c1481
-rw-r--r--storage/xtradb/trx/trx0purge.c1288
-rw-r--r--storage/xtradb/trx/trx0rec.c1611
-rw-r--r--storage/xtradb/trx/trx0roll.c1366
-rw-r--r--storage/xtradb/trx/trx0rseg.c324
-rw-r--r--storage/xtradb/trx/trx0sys.c1936
-rw-r--r--storage/xtradb/trx/trx0trx.c2156
-rw-r--r--storage/xtradb/trx/trx0undo.c2032
-rw-r--r--storage/xtradb/usr/usr0sess.c71
-rw-r--r--storage/xtradb/ut/ut0byte.c55
-rw-r--r--storage/xtradb/ut/ut0dbg.c187
-rw-r--r--storage/xtradb/ut/ut0list.c194
-rw-r--r--storage/xtradb/ut/ut0mem.c711
-rw-r--r--storage/xtradb/ut/ut0rbt.c1249
-rw-r--r--storage/xtradb/ut/ut0rnd.c97
-rw-r--r--storage/xtradb/ut/ut0ut.c625
-rw-r--r--storage/xtradb/ut/ut0vec.c79
-rw-r--r--storage/xtradb/ut/ut0wqueue.c118
423 files changed, 250066 insertions, 0 deletions
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
new file mode 100644
index 00000000000..789dbd5327e
--- /dev/null
+++ b/storage/xtradb/CMakeLists.txt
@@ -0,0 +1,256 @@
+# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+# This is the CMakeLists for XtraDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+
+# OS tests
+IF(UNIX)
+ IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
+ CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
+ ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
+ IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
+ ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
+ LINK_LIBRARIES(aio)
+ ENDIF()
+ ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
+ ADD_DEFINITIONS("-DUNIV_HPUX -DUNIV_MUST_NOT_INLINE")
+ ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
+ ADD_DEFINITIONS("-DUNIV_AIX -DUNIX_MUST_NOT_INLINE")
+ ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+ ADD_DEFINITIONS("-DUNIV_SOLARIS")
+ ELSE()
+ ADD_DEFINITIONS("-DUNIV_MUST_NOT_INLINE")
+ ENDIF()
+ENDIF()
+
+# Enable InnoDB's UNIV_DEBUG if MySQL's WITH_DEBUG[_FULL] is defined
+# enable when this bug is resolved:
+# Bug#54861 Additional connections not handled properly in mtr --embedded
+#IF(WITH_DEBUG)
+# ADD_DEFINITIONS("-DUNIV_DEBUG")
+#ENDIF()
+
+IF(NOT MSVC)
+# either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
+IF(NOT CMAKE_CROSSCOMPILING)
+ CHECK_C_SOURCE_RUNS(
+ "
+ int main()
+ {
+ long x;
+ long y;
+ long res;
+ char c;
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x, y);
+ if (!res || x != y) {
+ return(1);
+ }
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x + 1, y);
+ if (res || x != 10) {
+ return(1);
+ }
+ x = 10;
+ y = 123;
+ res = __sync_add_and_fetch(&x, y);
+ if (res != 123 + 10 || x != 123 + 10) {
+ return(1);
+ }
+
+ c = 10;
+ res = __sync_lock_test_and_set(&c, 123);
+ if (res != 10 || c != 123) {
+ return(1);
+ }
+ return(0);
+ }"
+ HAVE_IB_GCC_ATOMIC_BUILTINS
+ )
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
+ENDIF()
+
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
+IF(NOT CMAKE_CROSSCOMPILING)
+ CHECK_C_SOURCE_RUNS(
+ "
+ #include <pthread.h>
+ #include <string.h>
+
+ int main() {
+ pthread_t x1;
+ pthread_t x2;
+ pthread_t x3;
+
+ memset(&x1, 0x0, sizeof(x1));
+ memset(&x2, 0x0, sizeof(x2));
+ memset(&x3, 0x0, sizeof(x3));
+
+ __sync_bool_compare_and_swap(&x1, x2, x3);
+
+ return(0);
+ }"
+ HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+ENDIF()
+IF(HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+ ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_GCC=1)
+ENDIF()
+
+ENDIF(NOT MSVC)
+
+# Solaris atomics
+IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+ CHECK_FUNCTION_EXISTS(atomic_cas_ulong HAVE_ATOMIC_CAS_ULONG)
+ CHECK_FUNCTION_EXISTS(atomic_cas_32 HAVE_ATOMIC_CAS_32)
+ CHECK_FUNCTION_EXISTS(atomic_cas_64 HAVE_ATOMIC_CAS_64)
+ CHECK_FUNCTION_EXISTS(atomic_add_long HAVE_ATOMIC_ADD_LONG)
+ IF(HAVE_ATOMIC_CAS_ULONG AND HAVE_ATOMIC_CAS_32 AND
+ HAVE_ATOMIC_CAS_64 AND HAVE_ATOMIC_ADD_LONG)
+ SET(HAVE_IB_SOLARIS_ATOMICS 1)
+ ENDIF()
+
+ IF(HAVE_IB_SOLARIS_ATOMICS)
+ ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
+ ENDIF()
+
+ IF(NOT CMAKE_CROSSCOMPILING)
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
+ CHECK_C_SOURCE_COMPILES(
+ " #include <pthread.h>
+ #include <string.h>
+
+ int main(int argc, char** argv) {
+ pthread_t x1;
+ pthread_t x2;
+ pthread_t x3;
+
+ memset(&x1, 0x0, sizeof(x1));
+ memset(&x2, 0x0, sizeof(x2));
+ memset(&x3, 0x0, sizeof(x3));
+
+ if (sizeof(pthread_t) == 4) {
+
+ atomic_cas_32(&x1, x2, x3);
+
+ } else if (sizeof(pthread_t) == 8) {
+
+ atomic_cas_64(&x1, x2, x3);
+
+ } else {
+
+ return(1);
+ }
+
+ return(0);
+ }
+ " HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+ ENDIF()
+ IF(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+ ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_SOLARIS=1)
+ ENDIF()
+ENDIF()
+
+
+IF(UNIX)
+# this is needed to know which one of atomic_cas_32() or atomic_cas_64()
+# to use in the source
+SET(CMAKE_EXTRA_INCLUDE_FILES pthread.h)
+CHECK_TYPE_SIZE(pthread_t SIZEOF_PTHREAD_T)
+SET(CMAKE_EXTRA_INCLUDE_FILES)
+ENDIF()
+
+IF(SIZEOF_PTHREAD_T)
+ ADD_DEFINITIONS(-DSIZEOF_PTHREAD_T=${SIZEOF_PTHREAD_T})
+ENDIF()
+
+IF(MSVC)
+ ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
+ENDIF()
+
+
+# Include directories under innobase
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/innobase/include
+ ${CMAKE_SOURCE_DIR}/storage/innobase/handler)
+
+# Sun Studio bug with -xO2
+IF(CMAKE_C_COMPILER_ID MATCHES "SunPro"
+ AND CMAKE_C_FLAGS_RELEASE MATCHES "O2"
+ AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+ # Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
+ # -xO3
+ SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.c
+ PROPERTIES COMPILE_FLAGS -xO3)
+ENDIF()
+
+# Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
+# due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297
+IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+ SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.c mem/mem0pool.c
+ PROPERTIES COMPILE_FLAGS -Od)
+ENDIF()
+
+ADD_DEFINITIONS(-D_WIN32 -D_LIB -DMYSQL_SERVER)
+
+SET(XTRADB_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
+ buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c
+ data/data0data.c data/data0type.c
+ dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c
+ dyn/dyn0dyn.c
+ eval/eval0eval.c eval/eval0proc.c
+ fil/fil0fil.c
+ fsp/fsp0fsp.c
+ fut/fut0fut.c fut/fut0lst.c
+ ha/ha0ha.c ha/hash0hash.c ha/ha0storage.c
+ ibuf/ibuf0ibuf.c
+ pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c
+ lock/lock0lock.c lock/lock0iter.c
+ log/log0log.c log/log0recv.c
+ mach/mach0data.c
+ mem/mem0mem.c mem/mem0pool.c
+ mtr/mtr0log.c mtr/mtr0mtr.c
+ os/os0file.c os/os0proc.c os/os0sync.c os/os0thread.c
+ page/page0cur.c page/page0page.c page/page0zip.c
+ que/que0que.c
+ handler/ha_innodb.cc handler/handler0alter.cc handler/i_s.cc handler/mysql_addons.cc
+ read/read0read.c
+ rem/rem0cmp.c rem/rem0rec.c
+ row/row0ext.c row/row0ins.c row/row0merge.c row/row0mysql.c row/row0purge.c row/row0row.c
+ row/row0sel.c row/row0uins.c row/row0umod.c row/row0undo.c row/row0upd.c row/row0vers.c
+ srv/srv0que.c srv/srv0srv.c srv/srv0start.c
+ sync/sync0arr.c sync/sync0rw.c sync/sync0sync.c
+ thr/thr0loc.c
+ trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
+ trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
+ usr/usr0sess.c
+ ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c
+ ut/ut0list.c ut/ut0wqueue.c)
+
+# New plugin support, cross-platform; base name for the shared module is "ha_xtradb"
+MYSQL_ADD_PLUGIN(innobase ${XTRADB_SOURCES} STORAGE_ENGINE
+ DEFAULT
+ MODULE_OUTPUT_NAME ha_xtradb
+ LINK_LIBRARIES ${ZLIB_LIBRARY})
diff --git a/storage/xtradb/COPYING.Google b/storage/xtradb/COPYING.Google
new file mode 100644
index 00000000000..5ade2b0e381
--- /dev/null
+++ b/storage/xtradb/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of the Google Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/COPYING.Percona b/storage/xtradb/COPYING.Percona
new file mode 100644
index 00000000000..8c786811719
--- /dev/null
+++ b/storage/xtradb/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, 2009, Percona Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of the Percona Inc. nor the names of its
+ contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/COPYING.Sun_Microsystems b/storage/xtradb/COPYING.Sun_Microsystems
new file mode 100644
index 00000000000..5a77ef3ab73
--- /dev/null
+++ b/storage/xtradb/COPYING.Sun_Microsystems
@@ -0,0 +1,31 @@
+Portions of this software contain modifications contributed by
+Sun Microsystems, Inc. These contributions are used with the following
+license:
+
+Copyright (c) 2009, Sun Microsystems, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ * Neither the name of Sun Microsystems, Inc. nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/ChangeLog b/storage/xtradb/ChangeLog
new file mode 100644
index 00000000000..43f87a1baf5
--- /dev/null
+++ b/storage/xtradb/ChangeLog
@@ -0,0 +1,1916 @@
+2010-08-24 The InnoDB Team
+
+ * handler/ha_innodb.cc, dict/dict0dict.c:
+ Fix Bug #55832 selects crash too easily when innodb_force_recovery>3
+
+2010-08-03 The InnoDB Team
+
+ * include/dict0dict.h, include/dict0dict.ic, row/row0mysql.c:
+ Fix bug #54678, InnoDB, TRUNCATE, ALTER, I_S SELECT, crash or deadlock
+
+2010-08-03 The InnoDB Team
+
+ * dict/dict0load.c, handler/ha_innodb.cc, include/db0err.h,
+ include/dict0load.h, include/dict0mem.h, include/que0que.h,
+ row/row0merge.c, row/row0mysql.c:
+ Fix Bug#54582 stack overflow when opening many tables linked
+ with foreign keys at once
+
+2010-08-03 The InnoDB Team
+
+ * include/ut0mem.h, ut/ut0mem.c:
+ Fix Bug #55627 segv in ut_free pars_lexer_close innobase_shutdown
+ innodb-use-sys-malloc=0
+
+2010-08-01 The InnoDB Team
+
+ * handler/ha_innodb.cc
+ Fix Bug #55382 Assignment with SELECT expressions takes unexpected
+ S locks in READ COMMITTED
+
+2010-07-27 The InnoDB Team
+
+ * include/mem0pool.h, mem/mem0mem.c, mem/mem0pool.c, srv/srv0start.c:
+ Fix Bug#55581 shutdown with innodb-use-sys-malloc=0: assert
+ mutex->magic_n == MUTEX_MAGIC_N.
+
+2010-06-30 The InnoDB Team
+
+ * btr/btr0sea.c, ha/ha0ha.c, handler/ha_innodb.cc, include/btr0sea.h:
+ Fix Bug#54311 Crash on CHECK PARTITION after concurrent LOAD DATA
+ and adaptive_hash_index=OFF
+
+2010-06-29 The InnoDB Team
+
+ * row/row0row.c, row/row0undo.c, row/row0upd.c:
+ Fix Bug#54408 txn rollback after recovery: row0umod.c:673
+ dict_table_get_format(index->table)
+
+2010-06-29 The InnoDB Team
+
+ * btr/btr0cur.c, include/btr0cur.h,
+ include/row0mysql.h, row/row0merge.c, row/row0sel.c:
+ Fix Bug#54358 READ UNCOMMITTED access failure of off-page DYNAMIC
+ or COMPRESSED columns
+
+2010-06-24 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#54679 alter table causes compressed row_format to revert
+ to compact
+
+2010-06-22 The InnoDB Team
+
+ * dict/dict0dict.c, dict/dict0mem.c, include/dict0mem.h,
+ include/univ.i, page/page0zip.c, row/row0merge.c:
+ Fix Bug#47991 InnoDB Dictionary Cache memory usage increases
+ indefinitely when renaming tables
+
+2010-06-22 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#54686: "field->col->mtype == type" assertion error at
+ row/row0sel.c
+
+2010-06-22 The InnoDB Team
+
+ * handler/ha_innodb.cc, innodb_bug54044.result, innodb_bug54044.test:
+ Fix Bug#54044 Create temporary tables and using innodb crashes.
+
+2010-06-21 The InnoDB Team
+
+ * dict/dict0load.c, fil/fil0fil.c:
+ Fix Bug#54658: InnoDB: Warning: allocated tablespace %lu,
+ old maximum was 0 (introduced in Bug #53578 fix)
+
+2010-06-16 The InnoDB Team
+
+ * row/row0merge.c:
+ Fix Bug#54330 Broken fast index creation
+
+2010-06-10 The InnoDB Team
+
+ * include/log0log.ic, row/row0ins.c, row/row0purge.c,
+ row/row0uins.c, row/row0umod.c, row/row0upd.c:
+ Fix Bug#39168 ERROR: the age of the last checkpoint ... exceeds
+ the log group capacity
+
+2010-06-08 The InnoDB Team
+
+ * dict/dict0load.c:
+ Fix Bug#54009 Server crashes when data is selected from non backed
+ up table for InnoDB plugin
+
+2010-06-02 The InnoDB Team
+
+ * include/db0err.h, include/lock0lock.h, include/row0mysql.h,
+ lock/lock0lock.c, row/row0ins.c, row/row0mysql.c, row/row0sel.c:
+ Fix Bug#53674 InnoDB: Error: unlock row could not find a
+ 4 mode lock on the record
+
+2010-06-01 The InnoDB Team
+
+ * include/sync0rw.h, sync/sync0rw.c:
+ Fix Bug#48197 Concurrent rw_lock_free may cause assertion failure
+
+2010-06-01 The InnoDB Team
+
+ * row/row0umod.c:
+ Fix Bug#53812 assert row/row0umod.c line 660 in txn rollback
+ after crash recovery
+
+2010-05-25 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+ Fix Bug#53592: crash replacing duplicates into table after fast
+ alter table added unique key
+
+2010-05-24 The InnoDB Team
+
+ * dict/dict0boot.c, dict/dict0crea.c, fil/fil0fil.c,
+ include/dict0boot.h, include/fil0fil.h, row/row0mysql.c:
+ Fix Bug#53578: assert on invalid page access, in fil_io()
+
+2010-05-14 The InnoDB Team
+
+ * mysql-test/innodb_bug48024.test, mysql-test/innodb_bug48024.result,
+ dict/dict0dict.c, handler/ha_innodb.cc, handler/ha_innodb.h,
+ include/dict0dict.h, include/ha_prototypes.h, include/row0mysql.h,
+ include/trx0trx.h, row/row0mysql.c, trx/trx0i_s.c, trx/trx0trx.c:
+ Fix Bug#48024 Innodb doesn't work with multi-statements
+ Fix Bug#53644 InnoDB thinks that /*/ starts and ends a comment
+
+2010-05-12 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Fix Bug#53591 crash with fast alter table and text/blob prefix
+ primary key
+
+2010-05-12 The InnoDB Team
+
+ * row/row0merge.c:
+ Fix Bug#53471 row_merge_drop_temp_indexes() refers freed memory, SEGVs
+
+2010-05-11 The InnoDB Team
+
+ * mysql-test/innodb_bug53290.test, mysql-test/innodb_bug53290.result,
+ include/rem0cmp.h, rem/rem0cmp.c, row/row0merge.c:
+ Fix Bug#53290 wrong duplicate key error when adding a unique index
+ via fast alter table
+
+2010-05-11 The InnoDB Team
+
+ * buf/buf0lru.c, include/buf0buf.ic:
+ Fix Bug#53307 valgrind: warnings in main.partition_innodb_plugin
+
+2010-05-05 The InnoDB Team
+
+ * row/row0merge.c:
+ Fix Bug#53256 in a stress test, assert dict/dict0dict.c:815
+ table2 == NULL
+
+2010-05-05 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#53165 Setting innodb_change_buffering=DEFAULT produces
+ incorrect result
+
+2010-05-04 The InnoDB Team
+
+ * fsp/fsp0fsp.c:
+ Fix Bug#53306 valgrind: warnings in innodb.innodb
+
+2010-05-03 The InnoDB Team
+
+ * buf0buf.c:
+ Fix Bug#53248 compressed tables page checksum mismatch after
+ re-enabling innodb_checksums
+
+2010-04-28 The InnoDB Team
+
+ * log/log0recv.h, log/log0recv.c:
+ Fix Bug#53122 InnoDB recovery uses too big a hash table for redo
+ log records
+
+2010-04-27 The InnoDB Team
+
+ * handler/ha_innodb.cc, lock/lock0lock.c, row/row0mysql.c,
+ row/row0sel.c:
+ Fix Bug#48607 READ UNCOMMITTED uses more locks than READ COMMITTED
+ in InnoDB 5.1+
+
+2010-04-26 The InnoDB Team
+
+ * row/row0sel.c:
+ Fix Bug#52663 Lost update incrementing column value under
+ READ COMMITTED isolation level
+
+2010-04-22 The InnoDB Team
+
+ * include/dict0boot.h, dict/dict0boot.c:
+ Fix a bug that prevented the crash recovery of fast CREATE INDEX
+ from dropping partially created indexes.
+
+2010-04-21 The InnoDB Team
+
+ * btr/btr0btr.c:
+ Fix Bug#52964 Infinite loop in btr_page_split_and_insert()
+ in ROW_FORMAT=COMPRESSED
+
+2010-04-21 The InnoDB Team
+
+ * data/data0data.c:
+ Fix Bug#52745 Failing assertion: blob_no < page_zip->n_blobs
+
+2010-04-20 The InnoDB Team
+
+ * dict/dict0crea.c, handler/ha_innodb.cc, include/trx0trx.h:
+ Fix Bug#50495 'Row size too large' for plugin, but works for
+ built-in InnoDB
+ Only check the record size at index creation time when
+ innodb_strict_mode is set or when ROW_FORMAT is DYNAMIC or COMPRESSED.
+
+2010-04-15 The InnoDB Team
+
+ * trx/trx0rec.c:
+ Fix Bug#52746 InnoDB purge thread crashed with table containing
+ prefix indexed blobs
+
+2010-03-31 The InnoDB Team
+
+ * mysql-test/innodb_bug51920.test, mysql-test/innodb_bug51920.result,
+ srv/srv0srv.c:
+ Fix Bug#51920 InnoDB connections in row lock wait ignore KILL
+ until lock wait timeout
+
+2010-03-31 The InnoDB Team
+
+ * mysql-test/innodb_bug38231.test:
+ Remove non-determinism in the test case.
+
+2010-03-29 The InnoDB Team
+
+ InnoDB Plugin 1.0.7 released
+
+2010-03-18 The InnoDB Team
+
+ * CMakeLists.txt:
+ Fix Bug#52102 InnoDB Plugin shows performance drop compared to
+ InnoDB (Windows)
+
+2010-03-18 The InnoDB Team
+
+ * buf0buf.ic:
+ When comparing the time of the first access to a block against
+ innodb_old_blocks_time, use 32-bit arithmetics. The comparison was
+ incorrect on 64-bit systems.
+
+2010-03-11 The InnoDB Team
+
+ * buf0buf.h, buf0buf.ic:
+ Fix and clarify the latching of some buf_block_t members.
+ Note that check_index_page_at_flush is not protected by any mutex.
+ Note and assert that lock_hash_val is protected by the rw-latch.
+
+2010-03-10 The InnoDB Team
+
+ * trx/trx0sys.c:
+ Fix Bug#51653 outdated reference to set-variable
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug21704.result,
+ mysql-test/innodb_bug47621.result, mysql-test/innodb_bug47621.test:
+ Fix Bug#47621 MySQL and InnoDB data dictionaries will become out of
+ sync when renaming columns
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#51356 Many Valgrind errors in error messages
+ with concurrent DDL
+
+2010-03-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/handler0alter.cc,
+ mysql-test/innodb_bug51378.result, mysql-test/innodb_bug51378.test:
+ Fix Bug#51378 Init 'ref_length' to correct value, in case an out
+ of bound MySQL primary_key
+
+2010-03-10 The InnoDB Team
+
+ * log/log0recv.c:
+ Remove a bogus assertion about page numbers exceeding 0x90000000
+ in the redo log. Abort when encountering a corrupted redo log
+ record, unless innodb_force_recovery is set.
+
+2010-03-09 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Make SHOW ENGINE INNODB MUTEX STATUS display SUM(os_waits)
+ for the buffer pool block mutexes and locks.
+
+2010-03-08 The InnoDB Team
+
+ * fil/fil0fil.c:
+ Fix ALTER TABLE ... IMPORT TABLESPACE of compressed tables.
+
+2010-03-03 The InnoDB Team
+
+ * handler/handler0alter.cc, innodb-index.result, innodb-index.test,
+ innodb.result, innodb.test:
+ Disallow a duplicate index name when creating an index.
+
+2010-02-11 The InnoDB Team
+
+ * include/mem0mem.h, include/mem0mem.ic, mem/mem0mem.c:
+ Fix Bug#49535 Available memory check slows down crash
+ recovery tens of times
+
+2010-02-09 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Fix Bug#38901 InnoDB logs error repeatedly when trying to load
+ page into buffer pool
+
+2010-02-09 The InnoDB Team
+
+ * srv/srv0srv.c:
+ Let the master thread sleep if the amount of work to be done is
+ calibrated as taking less than a second.
+
+2010-02-04 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, buf/buf0buf.c,
+ include/btr0btr.h, include/btr0cur.h, include/btr0pcur.h,
+ include/btr0pcur.ic, include/buf0buf.h, row/row0ins.c, row/row0sel.c:
+ Pass the file name and line number of the caller of the
+ b-tree cursor functions to the buffer pool requests, in order
+ to make the latch diagnostics more accurate.
+
+2010-02-03 The InnoDB Team
+
+ * lock/lock0lock.c:
+ Fix Bug#49001 SHOW INNODB STATUS deadlock info incorrect
+ when deadlock detection aborts
+
+2010-02-03 The InnoDB Team
+
+ * buf/buf0lru.c:
+ Fix Bug#35077 Very slow DROP TABLE (ALTER TABLE, OPTIMIZE TABLE)
+ on compressed tables
+
+2010-02-03 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+ Clean up CHECK TABLE error handling.
+
+2010-02-01 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.test,
+ mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc-44030.test,
+ mysql-test/innodb-autoinc-44030.result:
+ Fix Bug#49497 Error 1467 (ER_AUTOINC_READ_FAILED) on inserting
+ a negative value
+
+2010-01-28 The InnoDB Team
+
+ * handler/ha_innodb.h, handler/ha_innodb.cc,
+ handler/handler0alter.cc,
+ mysql-test/innodb_bug47622.test,
+ mysql-test/innodb_bug47622.result:
+ Fix Bug#47622 the new index is added before the existing ones
+ in MySQL, but after one in SE
+
+2010-01-27 The InnoDB Team
+
+ * include/row0mysql.h, log/log0recv.c, row/row0mysql.c:
+ Drop temporary tables at startup.
+ This addresses the third aspect of
+ Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+
+2010-01-21 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Do not merge buffered inserts to compressed pages before
+ the redo log has been applied in crash recovery.
+
+2010-01-13 The InnoDB Team
+
+ * row/row0sel.c:
+ On the READ UNCOMMITTED isolation level, do not attempt to access
+ a clustered index record that has been marked for deletion. The
+ built-in InnoDB in MySQL 5.1 and earlier would attempt to retrieve
+ a previous version of the record in this case.
+
+2010-01-13 The InnoDB Team
+
+ * buf/buf0buf.c:
+ When disabling the adaptive hash index, check the block state
+ before checking block->is_hashed, because the latter may be
+ uninitialized right after server startup.
+
+2010-01-12 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h:
+ Fix Bug#46193 crash when accessing tables after enabling
+ innodb_force_recovery option
+
+2010-01-12 The InnoDB Team
+
+ * row/row0mysql.c:
+ Fix Bug#49238 Creating/Dropping a temporary table while at 1023
+ transactions will cause assert.
+
+2009-12-02 The InnoDB Team
+
+ * srv/srv0start.c:
+ Display the zlib version number at startup.
+ InnoDB compressed tables use zlib, and the implementation depends
+ on the zlib function compressBound(), whose definition was slightly
+ changed in zlib version 1.2.3.1 in 2006. MySQL bundles zlib 1.2.3
+ from 2005, but some installations use a more recent zlib.
+
+2009-11-30 The InnoDB Team
+
+ * dict/dict0crea.c, dict/dict0mem.c, dict/dict0load.c,
+ dict/dict0boot.c, fil/fil0fil.c, handler/ha_innodb.cc,
+ include/dict0mem.h, row/row0mysql.c:
+ Fix the bogus warning messages for non-existing temporary
+ tables that were reported in
+ Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+ The actual crash recovery bug was corrected on 2009-04-29.
+
+2009-11-27 The InnoDB Team
+
+ InnoDB Plugin 1.0.6 released
+
+2009-11-20 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Add a workaround to prevent a crash due to Bug#45961 DDL on
+ partitioned innodb tables leaves data dictionary in an inconsistent
+ state
+
+2009-11-19 The InnoDB Team
+
+ * btr/btr0btr.c:
+ Fix Bug#48469 when innodb tablespace is configured too small, crash
+ and corruption!
+
+2009-11-19 The InnoDB Team
+
+ * data/data0type.c:
+ Fix Bug#48526 Data type for float and double is incorrectly reported
+ in InnoDB table monitor
+
+2009-11-19 The InnoDB Team
+
+ * CMakeLists.txt:
+ Fix Bug#48317 cannot build innodb as static library
+
+2009-11-18 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Fix Bug#48782 On lock wait timeout, CREATE INDEX (creating primary key)
+ attempts DROP TABLE
+
+2009-11-17 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb.result,
+ mysql-test/innodb.test, mysql-test/innodb_bug44369.result,
+ mysql-test/innodb_bug44369.test, mysql-test/patches/innodb-index.diff,
+ row/row0mysql.c:
+ Report duplicate table names to the client connection, not to the
+ error log.
+
+2009-11-12 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/db0err.h, row/row0merge.c,
+ row/row0mysql.c:
+ Allow CREATE INDEX to be interrupted.
+ Also, when CHECK TABLE is interrupted, report ER_QUERY_INTERRUPTED.
+
+2009-11-11 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug47167.result,
+ mysql-test/innodb_bug47167.test, mysql-test/innodb_file_format.result:
+ Fix Bug#47167 "set global innodb_file_format_check" cannot set value
+ by User-Defined Variable
+
+2009-11-11 The InnoDB Team
+
+ * include/os0file.h, os/os0file.c:
+ Fix Bug#3139 Mysql crashes: 'windows error 995' after several selects
+ on a large DB
+
+2009-11-04 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#32430 'show innodb status' causes errors
+ Invalid (old?) table or database name in logs
+
+2009-11-02 The InnoDB Team
+
+ * btr/btr0sea.c, buf/buf0buf.c, dict/dict0dict.c, fil/fil0fil.c,
+ ibuf/ibuf0ibuf.c, include/btr0sea.h, include/dict0dict.h,
+ include/fil0fil.h, include/ibuf0ibuf.h, include/lock0lock.h,
+ include/log0log.h, include/log0recv.h, include/mem0mem.h,
+ include/mem0pool.h, include/os0file.h, include/pars0pars.h,
+ include/srv0srv.h, include/thr0loc.h, include/trx0i_s.h,
+ include/trx0purge.h, include/trx0rseg.h, include/trx0sys.h,
+ include/trx0undo.h, include/usr0sess.h, lock/lock0lock.c,
+ log/log0log.c, log/log0recv.c, mem/mem0dbg.c, mem/mem0pool.c,
+ os/os0file.c, os/os0sync.c, os/os0thread.c, pars/lexyy.c,
+ pars/pars0lex.l, que/que0que.c, srv/srv0srv.c, srv/srv0start.c,
+ sync/sync0arr.c, sync/sync0sync.c, thr/thr0loc.c, trx/trx0i_s.c,
+ trx/trx0purge.c, trx/trx0rseg.c, trx/trx0sys.c, trx/trx0undo.c,
+ usr/usr0sess.c, ut/ut0mem.c:
+ Fix Bug#45992 innodb memory not freed after shutdown
+ Fix Bug#46656 InnoDB plugin: memory leaks (Valgrind)
+
+2009-10-29 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#47125 auto_increment start value is ignored if an index is
+ created and engine=innodb
+
+2009-10-29 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug47777.result,
+ mysql-test/innodb_bug47777.test:
+ Fix Bug#47777 innodb dies with spatial pk: Failing assertion: buf <=
+ original_buf + buf_len
+
+2009-10-29 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#38996 Race condition in ANALYZE TABLE
+
+2009-10-29 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix bug#42383: Can't create table 'test.bug39438'
+
+2009-10-29 The InnoDB Team
+
+ * os/os0proc.c:
+ Fix Bug#48237 Error handling in os_mem_alloc_large appears to
+ be incorrect
+
+2009-10-29 The InnoDB Team
+
+ * buf/buf0buf.c, buf/buf0lru.c, include/buf0buf.h, include/buf0buf.ic:
+ Fix corruption of the buf_pool->LRU_old list and improve debug
+ assertions.
+
+2009-10-28 The InnoDB Team
+
+ * srv/srv0start.c:
+ Fix Bug#41490 After enlargement of InnoDB page size, the error message
+ become inaccurate
+
+2009-10-26 The InnoDB Team
+
+ * row/row0ins.c:
+ When allocating a data tuple, zero out the system fields in order
+ to avoid Valgrind warnings about uninitialized fields in
+ dtuple_validate().
+
+2009-10-22 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-zip.result,
+ mysql-test/innodb-zip.test, mysql-test/innodb_bug44369.result,
+ mysql-test/innodb_bug44369.test:
+ Fix Bug#47233 Innodb calls push_warning(MYSQL_ERROR::WARN_LEVEL_ERROR)
+
+2009-10-19 The InnoDB Team
+
+ * mysql-test/innodb_information_schema.test:
+ Fix Bug#47808 innodb_information_schema.test fails when run under
+ valgrind
+
+2009-10-15 The InnoDB Team
+
+ * include/page0page.ic:
+ Fix Bug#47058 Failure to compile innodb_plugin on solaris 10u7 + spro
+ cc/CC 5.10
+
+2009-10-13 The InnoDB Team
+
+ * buf/buf0flu.c:
+ Call fsync() on datafiles after a batch of pages is written to disk
+ even when skip_innodb_doublewrite is set.
+
+2009-10-05 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Do not invalidate buffer pool while an LRU batch is active. Added code
+ to buf_pool_invalidate() to wait for the running batches to finish.
+
+2009-10-01 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#47763 typo in error message: Failed to open table %s after %lu
+ attemtps.
+
+2009-10-01 The InnoDB Team
+
+ * fsp/fsp0fsp.c, row/row0merge.c:
+ Clean up after a crash during DROP INDEX. When InnoDB crashes
+ while dropping an index, ensure that the index will be completely
+ dropped during crash recovery. The MySQL .frm file may still
+ contain the dropped index, but there is little that we can do
+ about it.
+
+2009-09-28 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ When a secondary index exists in the MySQL .frm file but not in
+ the InnoDB data dictionary, return an error instead of letting an
+ assertion fail in index_read.
+
+2009-09-28 The InnoDB Team
+
+ * btr/btr0btr.c, buf/buf0buf.c, include/page0page.h,
+ include/page0zip.h, page/page0cur.c, page/page0page.c,
+ page/page0zip.c:
+ Do not write to PAGE_INDEX_ID when restoring an uncompressed page
+ after a compression failure. The field should only be written
+ when creating a B-tree page. This fix addresses a race condition
+ in a debug assertion.
+
+2009-09-28 The InnoDB Team
+
+ * fil/fil0fil.c:
+ Try to prevent the reuse of tablespace identifiers after InnoDB
+ has crashed during table creation. Also, refuse to start if files
+ with duplicate tablespace identifiers are encountered.
+
+2009-09-25 The InnoDB Team
+
+ * include/os0file.h, os/os0file.c:
+ Fix Bug#47055 unconditional exit(1) on ERROR_WORKING_SET_QUOTA
+ 1453 (0x5AD) for InnoDB backend
+
+2009-09-19 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-consistent-master.opt,
+ mysql-test/innodb-consistent.result,
+ mysql-test/innodb-consistent.test:
+ Fix Bug#37232 Innodb might get too many read locks for DML with
+ repeatable-read
+
+2009-09-19 The InnoDB Team
+
+ * fsp/fsp0fsp.c:
+ Fix Bug#31183 Tablespace full problems not reported in error log,
+ error message unclear
+
+2009-09-17 The InnoDB Team
+
+ * mysql-test/innodb-zip.result, mysql-test/innodb-zip.test:
+ Make the test pass with zlib 1.2.3.3. Apparently, the definition
+ of compressBound() has changed between zlib versions, and the
+ maximum record size of a table with 1K compressed page size has
+ been reduced by one byte. This is an arbitrary test. In practical
+ applications, for good write performance, the compressed page size
+ should be chosen to be bigger than the absolute minimum.
+
+2009-09-16 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#46256 drop table with unknown collation crashes innodb
+
+2009-09-16 The InnoDB Team
+
+ * dict/dict0dict.c, handler/ha_innodb.cc,
+ mysql-test/innodb_bug44369.result, mysql-test/innodb_bug44369.test,
+ row/row0mysql.c:
+ Fix Bug#44369 InnoDB: Does not uniformly disallow disallowed column
+ names
+
+2009-09-16 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/db0err.h,
+ mysql-test/innodb_bug46000.result, mysql-test/innodb_bug46000.test:
+ Fix Bug#46000 using index called GEN_CLUST_INDEX crashes server
+
+2009-09-02 The InnoDB Team
+
+ * include/lock0lock.h, include/row0mysql.h, lock/lock0lock.c,
+ row/row0mysql.c:
+ Fix a regression introduced by the fix for MySQL bug#26316. We check
+ whether a transaction holds any AUTOINC locks before we acquire
+ the kernel mutex and release those locks.
+
+2009-08-27 The InnoDB Team
+
+ * dict/dict0dict.c, include/dict0dict.h,
+ mysql-test/innodb_bug44571.result, mysql-test/innodb_bug44571.test:
+ Fix Bug#44571 InnoDB Plugin crashes on ADD INDEX
+
+2009-08-27 The InnoDB Team
+
+ * row/row0merge.c:
+ Fix a bug in the merge sort that can corrupt indexes in fast index
+ creation. Add some consistency checks. Check that the number of
+ records remains constant in every merge sort pass.
+
+2009-08-27 The InnoDB Team
+
+ * buf/buf0buf.c, buf/buf0lru.c, buf/buf0rea.c, handler/ha_innodb.cc,
+ include/buf0buf.h, include/buf0buf.ic, include/buf0lru.h,
+ include/ut0ut.h, ut/ut0ut.c:
+ Make it possible to tune the buffer pool LRU eviction policy to be
+ more resistant against index scans. Introduce the settable global
+ variables innodb_old_blocks_pct and innodb_old_blocks_time for
+ controlling the buffer pool eviction policy. The parameter
+ innodb_old_blocks_pct (5..95) controls the desired amount of "old"
+ blocks in the LRU list. The default is 37, corresponding to the
+ old fixed ratio of 3/8. Each time a block is accessed, it will be
+ moved to the "new" blocks if its first access was at least
+ innodb_old_blocks_time milliseconds ago (default 0, meaning every
+ block). The idea is that in index scans, blocks will be accessed
+ a few times within innodb_old_blocks_time, and they will remain in
+ the "old" section of the LRU list. Thus, when innodb_old_blocks_time
+ is nonzero, blocks retrieved for one-time index scans will be more
+ likely candidates for eviction than blocks that are accessed in
+ random patterns.
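+
+	For example, the policy could be tuned at runtime as follows
+	(illustrative values only):
+	SET GLOBAL innodb_old_blocks_pct = 37;
+	SET GLOBAL innodb_old_blocks_time = 1000; -- milliseconds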
+
+2009-08-26 The InnoDB Team
+
+ * handler/ha_innodb.cc, os/os0file.c:
+ Fix Bug#42885 buf_read_ahead_random, buf_read_ahead_linear counters,
+ thread wakeups
+
+2009-08-20 The InnoDB Team
+
+ * lock/lock0lock.c:
+ Fix Bug#46650 Innodb assertion autoinc_lock == lock in
+ lock_table_remove_low on INSERT SELECT
+
+2009-08-13 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Fix Bug#46657 InnoDB plugin: invalid read in index_merge_innodb test
+ (Valgrind)
+
+2009-08-11 The InnoDB Team
+
+ InnoDB Plugin 1.0.4 released
+
+2009-07-20 The InnoDB Team
+
+ * buf/buf0rea.c, handler/ha_innodb.cc, include/srv0srv.h,
+ srv/srv0srv.c:
+ Change the read ahead parameter name to innodb_read_ahead_threshold.
+ Change the meaning of this parameter to signify the number of pages
+ that must be sequentially accessed for InnoDB to trigger a readahead
+ request.
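+
+	For example, to require a longer sequential run before a read
+	ahead is triggered (illustrative value):
+	SET GLOBAL innodb_read_ahead_threshold = 56;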
+
+2009-07-20 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#39802 On Windows, 32-bit time_t should be enforced
+
+2009-07-16 The InnoDB Team
+
+ * include/univ.i:
+ Support inlining of functions and prefetch with Sun Studio.
+ These changes are based on contribution from Sun Microsystems Inc.
+ under a BSD license.
+
+2009-07-14 The InnoDB Team
+
+ * fil/fil0fil.c:
+ Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to
+ match documentation
+
+2009-07-14 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug21704.result,
+ mysql-test/innodb_bug21704.test:
+ Fix Bug#21704 Renaming column does not update FK definition
+
+2009-07-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, srv/srv0srv.c:
+ Change the defaults for
+ innodb_sync_spin_loops: 20 -> 30
+ innodb_spin_wait_delay: 5 -> 6
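+
+	Assuming both variables are dynamic, the new defaults correspond
+	to:
+	SET GLOBAL innodb_sync_spin_loops = 30;
+	SET GLOBAL innodb_spin_wait_delay = 6;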
+
+2009-07-08 The InnoDB Team
+
+ * buf/buf0flu.c, handler/ha_innodb.cc, include/buf0flu.h,
+ include/log0log.h, include/log0log.ic, include/srv0srv.h,
+ srv/srv0srv.c:
+	Implement the adaptive flushing of dirty pages, which uses a
+	heuristics-based flushing rate of dirty pages to avoid IO
+	bursts at checkpoints. Expose new configure knob
+ innodb_adaptive_flushing to control whether the new flushing
+ algorithm should be used.
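+
+	For example, the new behavior can be toggled with (assuming the
+	variable is dynamic):
+	SET GLOBAL innodb_adaptive_flushing = ON;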
+
+2009-07-07 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/srv0srv.h, log/log0log.c,
+ srv/srv0srv.c:
+ Implement IO capacity tuning. Expose new configure knob
+	innodb_io_capacity to control the master thread's IO rate. The
+ ibuf merge is also changed from synchronous to asynchronous.
+ These changes are based on contribution from Google Inc.
+ under a BSD license.
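+
+	For example (illustrative value):
+	SET GLOBAL innodb_io_capacity = 200;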
+
+2009-07-02 The InnoDB Team
+
+ * include/ut0ut.h, plug.in, ut/ut0ut.c:
+	Use the PAUSE instruction inside the spinloop if it is available.
+	Thanks to Mikael Ronstrom <mikael@mysql.com>.
+
+2009-06-29 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_file_format.test,
+ mysql-test/innodb_file_format.result:
+ Do not crash on SET GLOBAL innodb_file_format=DEFAULT
+ or SET GLOBAL innodb_file_format_check=DEFAULT.
+
+2009-06-29 The InnoDB Team
+
+ * buf/buf0buf.c, buf/buf0rea.c, lock/lock0lock.c:
+ Tolerate missing tablespaces during crash recovery and when
+ printing information on locks.
+
+2009-06-29 The InnoDB Team
+
+ * buf/buf0buf.c:
+ Fix a race condition when reading buf_fix_count.
+ Currently, it is not being protected by the buffer pool mutex,
+ but by the block mutex.
+
+2009-06-29 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Start the user transaction prebuilt->trx if it was not started
+ before adding or dropping an index. Without this fix, the
+ table could be locked outside an active transaction.
+
+2009-06-25 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug42101.test,
+ mysql-test/innodb_bug42101.result,
+ mysql-test/innodb_bug42101-nonzero.test,
+ mysql-test/innodb_bug42101-nonzero.result:
+ Fix Bug#45749 Race condition in SET GLOBAL
+ innodb_commit_concurrency=DEFAULT
+
+2009-06-25 The InnoDB Team
+
+ * dict/dict0dict.c:
+ When an index column cannot be found in the table during index
+	creation, display additional diagnostics before the assertion failure.
+	This does NOT fix Bug#44571 InnoDB Plugin crashes on ADD INDEX,
+	but it helps in understanding the reason for the crash.
+
+2009-06-17 The InnoDB Team
+
+ * row/row0merge.c:
+ Fix Bug#45426 UNIV_DEBUG build cause assertion error at CREATE INDEX
+
+2009-06-17 The InnoDB Team
+
+ * mysql-test/innodb_bug45357.result, mysql-test/innodb_bug45357.test,
+ row/row0mysql.c:
+ Fix Bug#45357 5.1.35 crashes with Failing assertion: index->type &
+ DICT_CLUSTERED
+
+2009-06-17 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value
+ from the index (PRIMARY)
+
+2009-06-11 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb.result, srv/srv0srv.c:
+ Change the following defaults:
+ max_dirty_pages_pct: from 90 to 75, max allowed from 100 to 99
+ additional_mem_pool_size: from 1 to 8 MB
+ buffer_pool_size: from 8 to 128 MB
+ log_buffer_size: from 1 to 8 MB
+ read_io_threads/write_io_threads: from 1 to 4
+
+2009-06-09 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/trx0trx.h, trx/trx0trx.c:
+ Enable Group Commit functionality that was broken in 5.0 when
+ distributed transactions were introduced.
+
+2009-06-05 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/os0file.h, include/srv0srv.h,
+ os/os0file.c, srv/srv0srv.c, srv/srv0start.c:
+ Enable functionality to have multiple background IO helper threads.
+ Expose new configure knobs innodb_read_io_threads and
+ innodb_write_io_threads and deprecate innodb_file_io_threads (this
+	parameter was relevant only on Windows). Internally, this allows
+ multiple segments for read and write IO request arrays where one
+ thread works on one segment.
+
+2009-06-05 The InnoDB Team
+
+ * buf/buf0lru.c, buf/buf0rea.c, handler/ha_innodb.cc,
+ include/srv0srv.h, srv/srv0srv.c:
+ Fix a bug in linear read ahead:
+ 1) Take into account access pattern when deciding whether or not to
+ do linear read ahead.
+	2) Expose a dynamic, global knob innodb_read_ahead_factor =
+	[0-64] (default 8) to control linear read ahead behavior. This
+	is the number of pages within a 64-page extent that InnoDB will
+	tolerate being accessed out of order, or not accessed at all,
+	when deciding whether to trigger a linear read ahead. It
+	expresses the slack allowed in that decision (see the example
+	after this list).
+ 3) Disable random read ahead. Keep the code for now.
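+
+	Example for the knob introduced in 2) above (illustrative value;
+	8 is the stated default):
+	SET GLOBAL innodb_read_ahead_factor = 8;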
+
+2009-06-03 The InnoDB Team
+
+ * dict/dict0dict.c, mysql-test/t/innodb_mysql.test,
+ mysql-test/r/innodb_mysql.result:
+ Fix Bug#39793 Foreign keys not constructed when column
+ has a '#' in a comment or default value
+
+2009-05-27 The InnoDB Team
+
+ * Doxyfile:
+ Allow the extraction of documentation from the code base with the
+ Doxygen tool. Convert and add many (but not yet all) comments to
+ Doxygen format.
+
+2009-05-19 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, lock/lock0lock.c,
+ include/page0page.ic, include/lock0lock.h, include/dict0dict.h,
+ include/page0page.h, include/dict0dict.ic, ibuf/ibuf0ibuf.c,
+ page/page0zip.c, page/page0page.c:
+ Write updates of PAGE_MAX_TRX_ID to the redo log and add debug
+ assertions for checking that PAGE_MAX_TRX_ID is valid on leaf
+ pages of secondary indexes and the insert buffer B-tree. This bug
+ could cause failures in secondary index lookups in consistent
+ reads right after crash recovery.
+
+2009-05-18 The InnoDB Team
+
+ * btr/btr0cur.c:
+ Correctly estimate the space needed on the compressed page when
+ performing an update by delete-and-insert.
+
+2009-05-14 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/srv0srv.h,
+ mysql-test/innodb_bug42101-nonzero-master.opt,
+ mysql-test/innodb_bug42101-nonzero.result,
+ mysql-test/innodb_bug42101-nonzero.test,
+ mysql-test/innodb_bug42101.result, mysql-test/innodb_bug42101.test,
+ srv/srv0srv.c:
+ Fix Bug#42101 Race condition in innodb_commit_concurrency
+
+2009-05-13 The InnoDB Team
+
+ * dict/dict0dict.c:
+ Fix Bug#44320 InnoDB: missing DB_ROLL_PTR in Table Monitor COLUMNS
+ output
+
+2009-04-29 The InnoDB Team
+
+ * fil/fil0fil.c, include/fil0fil.h, include/mtr0mtr.h,
+ log/log0recv.c:
+ Fix Bug#41609 Crash recovery does not work for InnoDB temporary tables
+
+2009-04-23 The InnoDB Team
+
+ * row/row0mysql.c:
+ When scanning indexes, report in the error log any error codes
+ returned by the search function. These error codes will still be
+ ignored in CHECK TABLE.
+
+2009-04-23 The InnoDB Team
+
+ * include/trx0types.h:
+ Define the logical type names trx_id_t, roll_ptr_t, and undo_no_t
+ and use them in place of dulint everywhere.
+
+2009-04-18 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/pars0pars.h:
+ Fix Bug#29125 Windows Server X64: so many compiler warnings
+
+2009-04-16 The InnoDB Team
+
+ * include/univ.i:
+ Define REFMAN as the base URL of the MySQL Reference Manual and
+ use the macro in all diagnostic output.
+
+2009-04-16 The InnoDB Team
+
+ * CMakeLists.txt, include/os0sync.h, include/sync0sync.h,
+ include/sync0sync.ic, include/univ.i, srv/srv0start.c,
+ sync/sync0sync.c:
+ Use the Windows Interlocked functions for atomic memory
+ access.
+
+2009-04-15 The InnoDB Team
+
+ * mysql-test/innodb.result, mysql-test/innodb.test:
+ Fix Bug#43309 Test main.innodb can't be run twice
+
+2009-04-14 The InnoDB Team
+
+ * CMakeLists.txt, handler/win_delay_loader.cc,
+ win-plugin/win-plugin.diff:
+ Remove statically linked libraries from MySQL (zlib and strings).
+
+2009-04-11 The InnoDB Team
+
+ * CMakeLists.txt, win-plugin/README, win-plugin/win-plugin.diff:
+ Rewrite CMakeLists.txt.
+
+2009-04-07 The InnoDB Team
+
+ * include/os0sync.h, include/sync0rw.ic, include/sync0sync.h,
+ include/sync0sync.ic, include/univ.i, plug.in, srv/srv0srv.c,
+ srv/srv0start.c, sync/sync0arr.c, sync/sync0sync.c:
+ Enable atomics on Solaris (using the libc functions as defined in
+ atomic.h) if GCC atomic builtins are not present.
+
+2009-04-07 The InnoDB Team
+
+ * btr/btr0btr.c, dict/dict0dict.c, ibuf/ibuf0ibuf.c,
+ include/data0data.h, include/data0data.ic, include/data0type.h,
+ include/data0type.ic, include/dict0dict.h, include/dict0dict.ic,
+ include/rem0rec.ic, mysql-test/innodb.result, mysql-test/innodb.test,
+ pars/pars0pars.c, rem/rem0rec.c, row/row0upd.c:
+ Fix Bug#44032 In ROW_FORMAT=REDUNDANT, update UTF-8 CHAR
+ to/from NULL is not in-place
+
+2009-04-07 The InnoDB Team
+
+ * page/page0cur.c:
+ Fix Bug#43660 SHOW INDEXES/ANALYZE does NOT update cardinality for
+ indexes of InnoDB table
+
+2009-04-06 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Make the parameter innodb_change_buffering settable by the
+ configuration file or mysqld command line options. Before this
+ fix, the initial value specified for this parameter was ignored.
+
+2009-04-06 The InnoDB Team
+
+ * sync/sync0rw.c:
+ Avoid a bogus failure in UNIV_SYNC_DEBUG diagnostics.
+
+2009-04-02 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/srv0srv.h, srv/srv0srv.c:
+ Add new parameter innodb_spin_wait_delay to set the maximum delay
+ between polling for a spin lock.
+
+2009-04-02 The InnoDB Team
+
+ * dict/dict0crea.c, handler/ha_innodb.cc, handler/ha_innodb.h,
+ include/dict0mem.h, include/row0merge.h, include/row0mysql.h,
+ mysql-test/innodb-index.result, mysql-test/innodb-index.test,
+ row/row0merge.c, row/row0sel.c:
+ In consistent reads, refuse to use newly created indexes that may
+ lack history.
+
+2009-03-25 The InnoDB Team
+
+ * buf/buf0buf.c, handler/ha_innodb.cc, include/buf0buf.h:
+	In SHOW ENGINE INNODB MUTEX, do not show the status of block->mutex,
+	block->lock, block->lock->mutex (if applicable), or any mutexes and
+	rw-locks for which the number of os-waits is zero, because this can
+	be overwhelming, particularly when the buffer pool is very large.
+
+2009-03-20 The InnoDB Team
+
+ * buf/buf0buf.c, include/log0recv.h, log/log0recv.c:
+ Remove the compile-time constant parameters of
+ recv_recover_page(), recv_scan_log_recs(), and recv_sys_init().
+
+2009-03-20 The InnoDB Team
+
+ * data/data0type.c, handler/ha_innodb.cc, include/ha_prototypes.h:
+ Declare innobase_get_at_most_n_mbchars() in ha_prototypes.h.
+
+2009-03-20 The InnoDB Team
+
+ * fil/fil0fil.h, fil/fil0fil.c, srv/srv0start.c:
+ Add the parameter hash_size to fil_init().
+
+2009-03-20 The InnoDB Team
+
+ * fil/fil0fil.c:
+ Refer to fil_system directly, not via local variables.
+
+2009-03-20 The InnoDB Team
+
+ * page/page0page.c:
+ In page_validate(), always report the space id, page number and
+ the name of the index when corruption is noticed.
+
+2009-03-20 The InnoDB Team
+
+ * include/log0log.h, include/log0log.ic, log/log0log.c:
+ Add in/out comments or const qualifiers to some function
+ parameters as appropriate.
+
+2009-03-20 The InnoDB Team
+
+ * dict/dict0boot.c, dict/dict0dict.c, fsp/fsp0fsp.c,
+ include/dict0dict.h, include/srv0srv.h, srv/srv0srv.c,
+ page/page0page.c:
+ Replace srv_sys->dummy_ind1 and srv_sys->dummy_ind2 with
+ dict_ind_redundant and dict_ind_compact, which are
+ initialized by dict_init().
+
+2009-03-11 The InnoDB Team
+
+ InnoDB Plugin 1.0.3 released
+
+2009-03-05 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#43203 Overflow from auto incrementing causes server segv
+
+2009-02-25 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#42714 AUTO_INCREMENT errors in 5.1.31
+
+2009-02-23 The InnoDB Team
+
+ * btr/btr0cur.c:
+ Fix Bug#43043 Crash on BLOB delete operation
+
+2009-02-20 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Make innodb_use_sys_malloc=ON the default.
+
+2009-02-20 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#42400 InnoDB autoinc code can't handle floating-point columns
+
+2009-02-18 The InnoDB Team
+
+ * include/ut0mem.h, os/os0proc.c, ut/ut0mem.c:
+ Protect ut_total_allocated_memory with ut_list_mutex in
+ os_mem_alloc_large() and os_mem_free_large(). The lack of this mutex
+ protection could cause an assertion failure during fast index
+ creation. Also, add UNIV_MEM_ALLOC and UNIV_MEM_FREE instrumentation
+ to os_mem_alloc_large() and os_mem_free_large(), so that Valgrind can
+ detect more errors.
+
+2009-02-11 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Make innodb_thread_concurrency=0 the default. The old default value
+ was 8. A non-zero setting may be useful when InnoDB is showing severe
+ scalability problems under multiple concurrent connections.
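+
+	For example, a concurrency limit can still be requested at
+	runtime (illustrative value; 0 means no limit):
+	SET GLOBAL innodb_thread_concurrency = 8;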
+
+2009-02-10 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h:
+ Fix Bug#41676 Table names are case insensitive in locking
+
+2009-02-10 The InnoDB Team
+
+ * mem/mem0dbg.c, mem/mem0mem.c, mem/mem0pool.c:
+ When innodb_use_sys_malloc is set, ignore
+ innodb_additional_mem_pool_size, because nothing will be allocated
+ from mem_comm_pool.
+
+2009-02-10 The InnoDB Team
+
+ * ut/ut0mem.c:
+ Map ut_malloc_low(), ut_realloc(), and ut_free() directly to malloc(),
+ realloc(), and free() when innodb_use_sys_malloc is set. As a side
+ effect, ut_total_allocated_memory ("Total memory allocated" in the
+ "BUFFER POOL AND MEMORY" section of SHOW ENGINE INNODB STATUS) will
+ exclude any memory allocated by these functions when
+ innodb_use_sys_malloc is set.
+
+2009-02-10 The InnoDB Team
+
+ * btr/btr0cur.c, btr/btr0sea.c, buf/buf0buf.c, handler/ha_innodb.cc,
+ include/buf0buf.ic, include/os0sync.h, include/srv0srv.h,
+ include/sync0rw.h, include/sync0rw.ic, include/sync0sync.h,
+ include/sync0sync.ic, include/univ.i, row/row0sel.c, srv/srv0srv.c,
+ srv/srv0start.c, sync/sync0arr.c, sync/sync0rw.c, sync/sync0sync.c:
+ On those platforms that support it, implement the synchronization
+ primitives of InnoDB mutexes and read/write locks with GCC atomic
+ builtins instead of Pthreads mutexes and InnoDB mutexes. These changes
+ are based on a patch supplied by Mark Callaghan of Google under a BSD
+ license.
+
+2009-01-30 The InnoDB Team
+
+ * btr/btr0cur.c, btr/btr0sea.c, buf/buf0buf.c, handler/ha_innodb.cc,
+ include/btr0sea.h, include/buf0buf.h, include/sync0sync.h,
+ sync/sync0sync.c:
+ Make the configuration parameter innodb_adaptive_hash_index dynamic,
+ so that it can be changed at runtime.
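+
+	For example, after this change the adaptive hash index can be
+	switched off at runtime:
+	SET GLOBAL innodb_adaptive_hash_index = OFF;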
+
+2009-01-29 The InnoDB Team
+
+ * handler/ha_innodb.cc, ibuf/ibuf0ibuf.c, include/ibuf0ibuf.h,
+ include/ibuf0ibuf.ic:
+ Implement the settable global variable innodb_change_buffering,
+ with the allowed values 'none' and 'inserts'. The default value
+ 'inserts' enables the buffering of inserts to non-unique secondary
+ index trees when the B-tree leaf page is not in the buffer pool.
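+
+	For example (values as listed above):
+	SET GLOBAL innodb_change_buffering = 'none';  -- or 'inserts'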
+
+2009-01-27 The InnoDB Team
+
+ * buf/buf0lru.c:
+ Fix a race condition in buf_LRU_invalidate_tablespace(): The
+ compressed page size (zip_size) was read while the block descriptor
+ was no longer protected by a mutex. This could lead to corruption
+ when a table is dropped on a busy system that contains compressed
+ tables.
+
+2009-01-26 The InnoDB Team
+
+ * btr/btr0sea.c, buf/buf0buf.c, include/buf0buf.h, include/buf0buf.ic,
+ include/mtr0log.ic, include/row0upd.ic, mtr/mtr0mtr.c:
+ Implement buf_block_align() with pointer arithmetics, as it is in the
+ built-in InnoDB distributed with MySQL. Do not acquire the buffer pool
+ mutex before buf_block_align(). This removes a scalability bottleneck
+ in the adaptive hash index lookup. In CHECK TABLE, check that
+ buf_pool->page_hash is consistent with buf_block_align().
+
+2009-01-23 The InnoDB Team
+
+ * btr/btr0sea.c:
+ Fix Bug#42279 Race condition in btr_search_drop_page_hash_when_freed()
+
+2009-01-23 The InnoDB Team
+
+ * buf/buf0buf.c, include/buf0buf.h:
+ Remove the unused mode BUF_GET_NOWAIT of buf_page_get_gen()
+
+2009-01-20 The InnoDB Team
+
+ * include/rem0rec.h, include/rem0rec.ic:
+ Fix Bug#41571 MySQL segfaults after innodb recovery
+
+2009-01-20 The InnoDB Team
+
+ * lock/lock0lock.c:
+ Fix Bug#42152 Race condition in lock_is_table_exclusive()
+
+2009-01-14 The InnoDB Team
+
+ * include/trx0roll.h, trx/trx0roll.c, trx/trx0trx.c:
+ Fix Bug#38187 Error 153 when creating savepoints
+
+2009-01-14 The InnoDB Team
+
+ * dict/dict0load.c:
+ Fix Bug#42075 dict_load_indexes failure in dict_load_table will
+ corrupt the dictionary cache
+
+2009-01-13 The InnoDB Team
+
+ * buf/buf0buddy.c, dict/dict0dict.c, dict/dict0mem.c, fil/fil0fil.c,
+ ha/ha0storage.c, handler/ha_innodb.cc, handler/win_delay_loader.cc,
+ include/buf0buf.ic, include/dict0dict.ic, include/hash0hash.h,
+ thr/thr0loc.c, trx/trx0i_s.c:
+	Add the parameter ASSERTION to the HASH_SEARCH() macro, and use it for
+ light validation of the traversed items in hash table lookups when
+ UNIV_DEBUG is enabled.
+
+2009-01-09 The InnoDB Team
+
+ * buf/buf0flu.c, include/buf0flu.h, include/buf0flu.ic:
+ Remove unused code from the functions
+ buf_flush_insert_into_flush_list() and
+ buf_flush_insert_sorted_into_flush_list().
+
+2009-01-09 The InnoDB Team
+
+ * buf/buf0flu.c:
+ Simplify the functions buf_flush_try_page() and buf_flush_batch(). Add
+ debug assertions and an explanation to buf_flush_write_block_low().
+
+2009-01-07 The InnoDB Team
+
+ * row/row0merge.c:
+ Fix a bug in recovery when dropping temporary indexes.
+
+2009-01-07 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h, handler/handler0alter.cc:
+ Fix Bug#41680 calls to trx_allocate_for_mysql are not consistent
+
+2009-01-07 The InnoDB Team
+
+ * mysql-test/innodb_bug41904.result, mysql-test/innodb_bug41904.test,
+ row/row0merge.c:
+ Fix Bug#41904 create unique index problem
+
+2009-01-02 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/srv0srv.h, mem/mem0pool.c,
+ mysql-test/innodb-use-sys-malloc-master.opt,
+ mysql-test/innodb-use-sys-malloc.result,
+ mysql-test/innodb-use-sys-malloc.test, srv/srv0srv.c, srv/srv0start.c:
+ Implement the configuration parameter innodb_use_sys_malloc (false by
+ default), for disabling InnoDB's internal memory allocator and using
+ system malloc/free instead. The "BUFFER POOL AND MEMORY" section of
+	SHOW ENGINE INNODB STATUS will report "in additional pool
+	allocated 0" when innodb_use_sys_malloc is set.
+
+2008-12-30 The InnoDB Team
+
+ * btr/btr0btr.c:
+ When setting the PAGE_LEVEL of a compressed B-tree page from or to 0,
+ compress the page at the same time. This is necessary, because the
+ column information stored on the compressed page will differ between
+ leaf and non-leaf pages. Leaf pages are identified by PAGE_LEVEL=0.
+ This bug can make InnoDB crash when all rows of a compressed table are
+ deleted.
+
+2008-12-17 The InnoDB Team
+
+ * include/row0sel.h, include/row0upd.h, pars/pars0pars.c,
+ row/row0mysql.c, row/row0sel.c, row/row0upd.c:
+ Remove update-in-place select from the internal SQL interpreter. It
+ was only used for updating the InnoDB internal data dictionary when
+ renaming or dropping tables. It could have caused deadlocks when
+ acquiring latches on insert buffer bitmap pages.
+
+2008-12-17 The InnoDB Team
+
+ * btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c, ha/ha0ha.c,
+ ha/hash0hash.c, include/buf0buf.h, include/ha0ha.h, include/ha0ha.ic,
+ include/hash0hash.h, include/univ.i:
+ Introduce the preprocessor symbol UNIV_AHI_DEBUG for enabling adaptive
+ hash index debugging independently of UNIV_DEBUG.
+
+2008-12-16 The InnoDB Team
+
+ * btr/btr0cur.c:
+ Do not update the free bits in the insert buffer bitmap when inserting
+ or deleting from the insert buffer B-tree. Assert that records in the
+ insert buffer B-tree are never updated.
+
+2008-12-12 The InnoDB Team
+
+ * buf/buf0buf.c, fil/fil0fil.c, fsp/fsp0fsp.c, ibuf/ibuf0ibuf.c,
+ include/fil0fil.h, include/ibuf0ibuf.h, include/ibuf0ibuf.ic,
+ include/ibuf0types.h:
+ Clean up the insert buffer subsystem so that only one insert
+ buffer B-tree exists.
+ Originally, there were provisions in InnoDB for multiple insert
+ buffer B-trees, apparently one for each tablespace.
+ When Heikki Tuuri implemented multiple InnoDB tablespaces in
+ MySQL/InnoDB 4.1, he made the insert buffer live only in the
+ system tablespace (space 0) but left the provisions in the code.
+
+2008-12-11 The InnoDB Team
+
+ * include/srv0srv.h, os/os0proc.c, srv/srv0srv.c:
+	Fix an issue where the InnoDB plugin fails if innodb_buffer_pool_size
+	is set larger than 4096M on 64-bit Windows. This bug should not
+	have affected other 64-bit systems.
+
+2008-12-09 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#40386 Not flushing query cache after truncate.
+
+2008-12-09 The InnoDB Team
+
+ * handler/ha_innodb.cc, srv/srv0srv.c, trx/trx0trx.c:
+ Fix Bug#40760 "set global innodb_thread_concurrency = 0;" is not safe
+
+2008-12-04 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/mysql_addons.cc,
+ include/mysql_addons.h, trx/trx0i_s.c, win-plugin/win-plugin.diff:
+ Remove dependencies to MySQL internals (defining MYSQL_SERVER).
+
+2008-12-02 The InnoDB Team
+
+ * page/page0cur.c:
+ When allocating space for a record from the free list of previously
+ purged records, zero out the DB_TRX_ID and DB_ROLL_PTR of the purged
+ record if the new record would not overwrite these fields. This fixes
+ a harmless content mismatch reported by page_zip_validate().
+
+2008-12-02 The InnoDB Team
+
+ * row/row0merge.c:
+ Replace the WHILE 1 with WHILE 1=1 in the SQL procedure, so that the
+ loop will actually be entered and temporary indexes be dropped during
+ crash recovery.
+
+2008-12-01 The InnoDB Team
+
+ InnoDB Plugin 1.0.2 released
+
+2008-10-31 The InnoDB Team
+
+ * dict/dict0mem.c, include/dict0mem.h, include/lock0lock.h,
+ include/row0mysql.h, include/trx0trx.h, include/univ.i,
+ include/ut0vec.h, include/ut0vec.ic, lock/lock0lock.c,
+ row/row0mysql.c, trx/trx0trx.c:
+ Fix Bug#26316 Triggers create duplicate entries on auto-increment
+ columns
+
+2008-10-30 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/handler0vars.h,
+ handler/win_delay_loader.cc, mysql-test/innodb_bug40360.result,
+ mysql-test/innodb_bug40360.test:
+ Fix Bug#40360 Binlog related errors with binlog off
+
+2008-10-29 The InnoDB Team
+
+ * include/data0type.ic:
+ Fix Bug#40369 dtype_get_sql_null_size() returns 0 or 1, not the size
+
+2008-10-29 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/srv0srv.h, srv/srv0srv.c:
+ Fix Bug#38189 innodb_stats_on_metadata missing
+
+2008-10-28 The InnoDB Team
+
+ * CMakeLists.txt, ha_innodb.def, handler/ha_innodb.cc,
+ handler/handler0alter.cc, handler/handler0vars.h, handler/i_s.cc,
+ handler/win_delay_loader.cc, win-plugin/*:
+ Implemented the delayloading of externals for the plugin on Windows.
+ This makes it possible to build a dynamic plugin (ha_innodb.dll) on
+ Windows.
+
+2008-10-27 The InnoDB Team
+
+ * CMakeLists.txt:
+ Fix Bug#19424 InnoDB: Possibly a memory overrun of the buffer being
+ freed (64-bit Visual C)
+
+2008-10-23 The InnoDB Team
+
+ * ibuf/ibuf0ibuf.c:
+ ibuf_delete_rec(): When the cursor to the insert buffer record
+ cannot be restored, do not complain if the tablespace does not
+ exist, because the insert buffer record may have been discarded by
+ some other thread. This bug has existed in MySQL/InnoDB since
+ version 4.1, when innodb_file_per_table was implemented.
+ This may fix Bug#27276 InnoDB Error: ibuf cursor restoration fails.
+
+2008-10-22 The InnoDB Team
+
+ * dict/dict0dict.c, dict/dict0mem.c, handler/ha_innodb.cc,
+ handler/ha_innodb.h, include/dict0dict.h, include/dict0mem.h,
+ row/row0mysql.c:
+ Fix Bug#39830 Table autoinc value not updated on first insert
+ Fix Bug#35498 Cannot get table test/table1 auto-inccounter value in
+ ::info
+ Fix Bug#36411 "Failed to read auto-increment value from storage
+ engine" in 5.1.24 auto-inc
+
+2008-10-22 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+ Fix Bug#40224 New AUTOINC changes mask reporting of deadlock/timeout
+ errors
+
+2008-10-16 The InnoDB Team
+
+ * dict/dict0dict.c, mysql-test/innodb-index.result,
+ mysql-test/innodb-index.test:
+ Skip the undo log size check when creating REDUNDANT and COMPACT
+ tables. In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED, column
+ prefix indexes require that prefixes of externally stored columns
+ be written to the undo log. This may make the undo log record
+ bigger than the record on the B-tree page. The maximum size of an
+ undo log record is the page size. That must be checked for, in
+ dict_index_add_to_cache(). However, this restriction must not
+ be enforced on REDUNDANT or COMPACT tables.
+
+2008-10-15 The InnoDB Team
+
+ * btr/btr0cur.c, include/btr0cur.h, row/row0ext.c, row/row0sel.c,
+ row/row0upd.c:
+ When the server crashes while freeing an externally stored column
+ of a compressed table, the BTR_EXTERN_LEN field in the BLOB
+ pointer will be written as 0. Tolerate this in the functions that
+ deal with externally stored columns. This fixes problems after
+ crash recovery, in the rollback of incomplete transactions, and in
+ the purge of delete-marked records.
+
+2008-10-15 The InnoDB Team
+
+ * btr/btr0btr.c, include/page0zip.h, page/page0zip.c, include/univ.i:
+ When a B-tree node of a compressed table is split or merged, the
+ compression may fail. In this case, the entire compressed page
+ will be copied and the excess records will be deleted. However,
+ page_zip_copy(), now renamed to page_zip_copy_recs(), copied too
+ many fields in the page header, overwriting PAGE_BTR_SEG_LEAF and
+ PAGE_BTR_SEG_TOP when splitting the B-tree root. This caused
+ corruption of compressed tables. Furthermore, the lock table and
+ the adaptive hash index would be corrupted, because we forgot to
+ update them when invoking page_zip_copy_recs().
+
+ Introduce the symbol UNIV_ZIP_DEBUG for triggering the copying of
+ compressed pages more often, for debugging purposes.
+
+2008-10-10 The InnoDB Team
+
+ * handler/handler0alter.cc, include/row0merge.h, row/row0merge.c,
+ row/row0mysql.c:
+ Fix some locking issues, mainly in fast index creation. The
+ InnoDB data dictionary cache should be latched whenever a
+ transaction is holding locks on any data dictionary tables.
+ Otherwise, lock waits or deadlocks could occur. Furthermore, the
+ data dictionary transaction must be committed (and the locks
+ released) before the data dictionary latch is released.
+
+ ha_innobase::add_index(): Lock the data dictionary before renaming
+ or dropping the created indexes, because neither operation will
+ commit the data dictionary transaction.
+
+ ha_innobase::final_drop_index(): Commit the transactions before
+ unlocking the data dictionary.
+
+2008-10-09 The InnoDB Team
+
+ * buf/buf0lru.c:
+ Fix Bug#39939 DROP TABLE/DISCARD TABLESPACE takes long time in
+ buf_LRU_invalidate_tablespace()
+
+2008-10-08 The InnoDB Team
+
+ * dict/dict0crea.c, trx/trx0roll.c, include/row0mysql.h,
+ row/row0merge.c, row/row0mysql.c:
+ When dropping a table, hold the data dictionary latch until the
+ transaction has been committed. The data dictionary latch is
+ supposed to prevent lock waits and deadlocks in the data
+ dictionary tables. Due to this bug, DROP TABLE could cause a
+ deadlock or hang. Note that because of Bug#33650 and Bug#39833,
+ MySQL may also drop a (temporary) table when executing CREATE INDEX
+ or ALTER TABLE ... ADD INDEX.
+
+2008-10-04 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb_bug39438-master.opt,
+ mysql-test/innodb_bug39438.result, mysql-test/innodb_bug39438.test:
+ Fix Bug#39438 Testcase for Bug#39436 crashes on 5.1 in
+ fil_space_get_latch
+
+2008-10-04 The InnoDB Team
+
+ * include/lock0lock.h, lock/lock0lock.c,
+ mysql-test/innodb_bug38231.result, mysql-test/innodb_bug38231.test,
+ row/row0mysql.c:
+ Fix Bug#38231 Innodb crash in lock_reset_all_on_table() on TRUNCATE +
+ LOCK / UNLOCK
+
+2008-10-04 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#35498 Cannot get table test/table1 auto-inccounter value in
+ ::info
+
+2008-10-04 The InnoDB Team
+
+ * handler/ha_innodb.cc, handler/ha_innodb.h:
+ Fix Bug#37788 InnoDB Plugin: AUTO_INCREMENT wrong for compressed
+ tables
+
+2008-10-04 The InnoDB Team
+
+ * dict/dict0dict.c, handler/ha_innodb.cc, handler/ha_innodb.h,
+ include/dict0dict.h, include/dict0mem.h, row/row0mysql.c:
+ Fix Bug#39830 Table autoinc value not updated on first insert
+
+2008-10-03 The InnoDB Team
+
+ * mysql-test/innodb-index.test, mysql-test/innodb-index.result,
+ mysql-test/innodb-timeout.test, mysql-test/innodb-timeout.result,
+ srv/srv0srv.c, include/srv0srv.h, handler/ha_innodb.cc,
+ include/ha_prototypes.h:
+ Fix Bug#36285 innodb_lock_wait_timeout is not dynamic, not per session
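+
+	For example, after this fix the timeout can be set per session
+	(illustrative value, in seconds):
+	SET SESSION innodb_lock_wait_timeout = 50;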
+
+2008-09-19 The InnoDB Team
+
+ * os/os0proc.c:
+	Fix a memory leak on Windows. The memory leak was due to wrong
+	parameters being passed to the VirtualFree() call. As a result,
+	the call failed with Windows error 87.
+
+2008-09-17 The InnoDB Team
+
+ * mysql-test/innodb.result, mysql-test/innodb-zip.result,
+ mysql-test/innodb-zip.test, mysql-test/innodb.test, ibuf/ibuf0ibuf.c,
+ dict/dict0crea.c, dict/dict0load.c, dict/dict0boot.c,
+ include/dict0dict.h, include/trx0trx.h, dict/dict0dict.c,
+ trx/trx0trx.c, include/ha_prototypes.h, handler/ha_innodb.cc:
+ When creating an index in innodb_strict_mode, check that the
+ maximum record size will never exceed the B-tree page size limit.
+ For uncompressed tables, there should always be enough space for
+ two records in an empty B-tree page. For compressed tables, there
+ should be enough space for storing two node pointer records or one
+ data record in an empty page in uncompressed format.
+ The purpose of this check is to guarantee that INSERT or UPDATE
+ will never fail due to too big record size.
+
+2008-09-17 The InnoDB Team
+
+ * btr/btr0cur.c, data/data0data.c, include/page0zip.h,
+ include/page0zip.ic, page/page0zip.c, mysql-test/innodb_bug36172.test:
+ Prevent infinite B-tree page splits in compressed tables by
+ ensuring that there will always be enough space for two node
+ pointer records in an empty B-tree page. Also, require that at
+ least one data record will fit in an empty compressed page. This
+ will reduce the maximum size of records in compressed tables.
+
+2008-09-09 The InnoDB Team
+
+ * mysql-test/innodb.result:
+ Fix the failing innodb test by merging changes that MySQL made to
+ that file (r2646.12.1 in MySQL BZR repository)
+
+2008-09-09 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#38839 auto increment does not work properly with InnoDB after
+ update
+
+2008-09-09 The InnoDB Team
+
+ * dict/dict0dict.c, handler/handler0alter.cc, include/dict0dict.h,
+ mysql-test/innodb-index.result, mysql-test/innodb-index.test:
+ Fix Bug#38786 InnoDB plugin crashes on drop table/create table with FK
+
+2008-08-21 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/ha_prototypes.h, row/row0sel.c:
+ Fix Bug#37885 row_search_for_mysql may gap lock unnecessarily with SQL
+ comments in query
+
+2008-08-21 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+ Fix Bug#38185 ha_innobase::info can hold locks even when called with
+ HA_STATUS_NO_LOCK
+
+2008-08-18 The InnoDB Team
+
+ * buf/buf0buf.c, buf/buf0lru.c, include/buf0buf.ic, include/univ.i:
+ Introduce UNIV_LRU_DEBUG for debugging the LRU buffer pool cache
+
+2008-08-08 The InnoDB Team
+
+ * buf/buf0lru.c, include/buf0buf.h:
+ Fix two recovery bugs that could lead to a crash in debug builds with
+ small buffer size
+
+2008-08-07 The InnoDB Team
+
+ * btr/btr0cur.c, handler/ha_innodb.cc, include/srv0srv.h,
+ srv/srv0srv.c:
+ Add a parameter innodb_stats_sample_pages to allow users to control
+ the number of index dives when InnoDB estimates the cardinality of
+ an index (ANALYZE TABLE, SHOW TABLE STATUS etc)
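+
+	For example (illustrative value):
+	SET GLOBAL innodb_stats_sample_pages = 8;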
+
+2008-08-07 The InnoDB Team
+
+ * trx/trx0i_s.c:
+	Fix a bug that would lead to a crash if a SELECT was issued against
+	the INFORMATION_SCHEMA tables while there were transactions rolling
+	back at the same time
+
+2008-08-06 The InnoDB Team
+
+ * btr/btr0btr.c, btr/btr0cur.c, ibuf/ibuf0ibuf.c, include/btr0cur.h,
+ include/trx0roll.h, include/trx0types.h, row/row0purge.c,
+ row/row0uins.c, row/row0umod.c, trx/trx0roll.c:
+ In the rollback of incomplete transactions after crash recovery,
+ tolerate clustered index records whose externally stored columns
+ have not been written.
+
+2008-07-30 The InnoDB Team
+
+ * trx/trx0trx.c:
+	Fix a race in recovery where the recovery thread recovering a
+	PREPARED trx and the background rollback thread could both try
+	to free the trx after its status was set to COMMITTED_IN_MEMORY.
+
+2008-07-29 The InnoDB Team
+
+ * include/trx0rec.h, row/row0purge.c, row/row0vers.c, trx/trx0rec.c:
+ Fix a BLOB corruption bug
+
+2008-07-15 The InnoDB Team
+
+ * btr/btr0sea.c, dict/dict0dict.c, include/btr0sea.h:
+	Fix a timing hole where a thread dropping an index could free the
+	in-memory index struct while another thread was still using that
+	structure to remove adaptive hash index entries belonging to one
+	of the pages of the index being dropped.
+
+2008-07-04 The InnoDB Team
+
+ * mysql-test/innodb-index.result:
+	Fix the failing innodb-index test by adjusting the result to new
+	MySQL behavior (the change occurred in BZR-r2667)
+
+2008-07-03 The InnoDB Team
+
+ * mysql-test/innodb-zip.result, mysql-test/innodb-zip.test:
+ Remove the negative test cases that produce warnings
+
+2008-07-02 The InnoDB Team
+
+ * mysql-test/innodb-replace.result, mysql-test/innodb-index.test:
+	Disable part of the innodb-index test because MySQL changed its
+	behavior and no longer calls ::add_index() when adding a primary
+	index on a non-NULL column
+
+2008-07-01 The InnoDB Team
+
+ * mysql-test/innodb-replace.result, mysql-test/innodb-replace.test:
+ Fix the failing innodb-replace test by merging changes that MySQL
+ made to that file (r2659 in MySQL BZR repository)
+
+2008-07-01 The InnoDB Team
+
+ * lock/lock0lock.c:
+ Fix Bug#36942 Performance problem in lock_get_n_rec_locks (SHOW INNODB
+ STATUS)
+
+2008-07-01 The InnoDB Team
+
+ * ha/ha0ha.c:
+ Fix Bug#36941 Performance problem in ha_print_info (SHOW INNODB
+ STATUS)
+
+2008-07-01 The InnoDB Team
+
+ * handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+ mysql-test/innodb-autoinc.test:
+ Fix Bug#37531 After truncate, auto_increment behaves incorrectly for
+ InnoDB
+
+2008-06-19 The InnoDB Team
+
+ * handler/ha_innodb.cc:
+	Rewrite the function innodb_plugin_init() to support parameters in
+	a different order (in static and dynamic InnoDB) and to support more
+	parameters in the static InnoDB
+
+2008-06-19 The InnoDB Team
+
+ * handler/handler0alter.cc:
+ Fix a bug in ::add_index() which set the transaction state to "active"
+ but never restored it to the original value. This bug caused warnings
+ to be printed by the rpl.rpl_ddl mysql-test.
+
+2008-06-19 The InnoDB Team
+
+ * mysql-test/patches:
+ Add a directory which contains patches, which need to be applied to
+ MySQL source in order to get some mysql-tests to succeed. The patches
+ cannot be committed in MySQL repository because they are specific to
+ the InnoDB plugin.
+
+2008-06-19 The InnoDB Team
+
+ * mysql-test/innodb-zip.result, mysql-test/innodb-zip.test,
+ row/row0row.c:
+ Fix an anomaly when updating a record with BLOB prefix
+
+2008-06-18 The InnoDB Team
+
+ * include/trx0sys.h, srv/srv0start.c, trx/trx0sys.c:
+ Fix a bug in recovery which was a side effect of the file_format_check
+ changes
+
+2008-06-09 The InnoDB Team
+
+ * mysql-test/innodb.result:
+ Fix the failing innodb test by merging changes that MySQL made to that
+ file
+
+2008-06-06 The InnoDB Team
+
+ * buf/buf0buf.c, handler/ha_innodb.cc, include/buf0buf.h,
+ include/srv0srv.h, srv/srv0srv.c:
+ Fix Bug#36600 SHOW STATUS takes a lot of CPU in
+ buf_get_latched_pages_number
+
+ * handler/ha_innodb.cc, os/os0file.c:
+ Fix Bug#11894 innodb_file_per_table crashes w/ Windows .sym symbolic
+ link hack
+
+ * include/ut0ut.h, srv/srv0srv.c, ut/ut0ut.c:
+ Fix Bug#36819 ut_usectime does not handle errors from gettimeofday
+
+ * handler/ha_innodb.cc:
+ Fix Bug#35602 Failed to read auto-increment value from storage engine
+
+ * srv/srv0start.c:
+ Fix Bug#36149 Read buffer overflow in srv0start.c found during "make
+ test"
+
+2008-05-08 The InnoDB Team
+
+ * btr/btr0btr.c, mysql-test/innodb_bug36172.result,
+ mysql-test/innodb_bug36172.test:
+ Fix Bug#36172 insert into compressed innodb table crashes
+
+2008-05-08 The InnoDB Team
+
+ InnoDB Plugin 1.0.1 released
+
+2008-05-06 The InnoDB Team
+
+ * handler/ha_innodb.cc, include/srv0srv.h, include/sync0sync.h,
+ include/trx0sys.h, mysql-test/innodb-zip.result,
+ mysql-test/innodb-zip.test, srv/srv0srv.c, srv/srv0start.c,
+ sync/sync0sync.c, trx/trx0sys.c:
+ Implement the system tablespace tagging
+
+ * handler/ha_innodb.cc, handler/i_s.cc, include/univ.i,
+ srv/srv0start.c:
+ Add InnoDB version in INFORMATION_SCHEMA.PLUGINS.PLUGIN_VERSION,
+ in the startup message and in a server variable innodb_version.
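+
+	For example, the new server variable can be queried with:
+	SELECT @@innodb_version;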
+
+ * sync/sync0sync.c:
+ Fix a bug in the sync debug code where a lock with level
+ SYNC_LEVEL_VARYING would cause an assertion failure when a thread
+ tried to release it.
+
+2008-04-30 The InnoDB Team
+
+ * Makefile.am:
+ Fix Bug#36434 ha_innodb.so is installed in the wrong directory
+
+ * handler/ha_innodb.cc:
+ Merge change from MySQL (Fix Bug#35406 5.1-opt crashes on select from
+ I_S.REFERENTIAL_CONSTRAINTS):
+ ChangeSet@1.2563, 2008-03-18 19:42:04+04:00, gluh@mysql.com +1 -0
+
+ * scripts/install_innodb_plugins.sql:
+ Added
+
+ * mysql-test/innodb.result:
+ Merge change from MySQL (this fixes the failing innodb test):
+ ChangeSet@1.1810.3601.4, 2008-02-07 02:33:21+04:00
+
+ * row/row0sel.c:
+ Fix Bug#35226 RBR event crashes slave
+
+ * handler/ha_innodb.cc:
+ Change the fix for Bug#32440 to show bytes instead of kilobytes in
+ INFORMATION_SCHEMA.TABLES.DATA_FREE
+
+ * handler/ha_innodb.cc, mysql-test/innodb.result,
+ mysql-test/innodb.test:
+ Fix Bug#29507 TRUNCATE shows to many rows effected
+
+ * handler/ha_innodb.cc, mysql-test/innodb.result,
+ mysql-test/innodb.test:
+ Fix Bug#35537 Innodb doesn't increment handler_update and
+ handler_delete
+
+2008-04-29 The InnoDB Team
+
+ * handler/i_s.cc, include/srv0start.h, srv/srv0start.c:
+ Fix Bug#36310 InnoDB plugin crash
+
+2008-04-23 The InnoDB Team
+
+ * mysql-test/innodb_bug36169.result, mysql-test/innodb_bug36169.test,
+ row/row0mysql.c:
+ Fix Bug#36169 create innodb compressed table with too large row size
+ crashed
+
+ * (outside the source tree):
+ Fix Bug#36222 New InnoDB plugin 1.0 has wrong MKDIR_P defined in
+ Makefile.in
+
+2008-04-15 The InnoDB Team
+
+ InnoDB Plugin 1.0.0 released
diff --git a/storage/xtradb/Doxyfile b/storage/xtradb/Doxyfile
new file mode 100644
index 00000000000..62aa7dd8abc
--- /dev/null
+++ b/storage/xtradb/Doxyfile
@@ -0,0 +1,1419 @@
+# Doxyfile 1.5.6
+
+# Usage: SVNVERSION=-r$(svnversion) doxygen
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "InnoDB Plugin"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER = 1.0$(SVNVERSION)
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = dox
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek,
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish,
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish,
+# and Ukrainian.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names, as on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+
+DETAILS_AT_TOP = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string) vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods with a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespace are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page. This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = . include/univ.i
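+# (include/univ.i has to be listed by name here: the FILE_PATTERNS value
+# below only covers *.c, *.ic and *.h, so the recursive scan of "." would
+# not pick up a file with the .i suffix.)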
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS = *.c *.ic *.h
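+# (The *.ic pattern is added because the InnoDB/XtraDB headers keep their
+# inline function definitions in separate .ic files.)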
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE = ut0auxconf_*
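+# (The ut0auxconf_* sources are small stand-alone snippets that are only
+# compiled at configure time to probe platform features; they are not part
+# of the engine itself, so they are kept out of the documentation.)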
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen's
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls whether a separate .chi index file is generated (YES) or
+# included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to FRAME, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet Explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature. Other possible values
+# for this tag are: HIERARCHIES, which will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list;
+# ALL, which combines the behavior of FRAME and HIERARCHIES; and NONE, which
+# disables this behavior completely. For backwards compatibility with previous
+# releases of Doxygen, the values YES and NO are equivalent to FRAME and NONE
+# respectively.
+
+GENERATE_TREEVIEW = NONE
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED = DOXYGEN UNIV_DEBUG UNIV_SYNC_DEBUG __attribute__()=
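+# (With UNIV_DEBUG and UNIV_SYNC_DEBUG predefined, the code that is normally
+# compiled only in debug builds is parsed and documented as well, and the
+# __attribute__()= entry expands the GCC __attribute__ annotations to nothing
+# so that they cannot confuse the parser.)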
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED = UT_LIST_BASE_NODE_T UT_LIST_NODE_T
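+# (Expanding the UT_LIST_* macros makes the list base and list node members
+# that they generate show up as ordinary struct fields in the documentation.)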
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = YES
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = FreeSans
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class reference variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct group dependencies
+
+GROUP_GRAPHS = NO
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will show a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the number of
+# direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 3
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is enabled by default, which results in a transparent
+# background. Warning: Depending on the platform used, enabling this option
+# may lead to badly anti-aliased labels on the edges of a graph (i.e. they
+# become hard to read).
+
+DOT_TRANSPARENT = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
diff --git a/storage/xtradb/Makefile.am b/storage/xtradb/Makefile.am
new file mode 100644
index 00000000000..0a6d7ddefb6
--- /dev/null
+++ b/storage/xtradb/Makefile.am
@@ -0,0 +1,345 @@
+# Copyright (C) 2001, 2004, 2006 MySQL AB & Innobase Oy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+# Process this file with automake to create Makefile.in
+
+MYSQLDATAdir= $(localstatedir)
+MYSQLSHAREdir= $(pkgdatadir)
+MYSQLBASEdir= $(prefix)
+MYSQLLIBdir= $(pkglibdir)
+pkgplugindir= $(pkglibdir)/plugin
+INCLUDES= -I$(top_srcdir)/include -I$(top_builddir)/include \
+ -I$(top_srcdir)/regex \
+ -I$(srcdir)/include \
+ -I$(top_srcdir)/sql \
+ -I$(srcdir) @ZLIB_INCLUDES@
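+# @ZLIB_INCLUDES@ is needed because the compressed page code (page0zip)
+# uses zlib.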
+
+DEFS= @DEFS@
+
+
+noinst_HEADERS= \
+ handler/ha_innodb.h \
+ handler/i_s.h \
+ include/btr0btr.h \
+ include/btr0btr.ic \
+ include/btr0cur.h \
+ include/btr0cur.ic \
+ include/btr0pcur.h \
+ include/btr0pcur.ic \
+ include/btr0sea.h \
+ include/btr0sea.ic \
+ include/btr0types.h \
+ include/buf0buddy.h \
+ include/buf0buddy.ic \
+ include/buf0buf.h \
+ include/buf0buf.ic \
+ include/buf0flu.h \
+ include/buf0flu.ic \
+ include/buf0lru.h \
+ include/buf0lru.ic \
+ include/buf0rea.h \
+ include/buf0types.h \
+ include/data0data.h \
+ include/data0data.ic \
+ include/data0type.h \
+ include/data0type.ic \
+ include/data0types.h \
+ include/db0err.h \
+ include/dict0boot.h \
+ include/dict0boot.ic \
+ include/dict0crea.h \
+ include/dict0crea.ic \
+ include/dict0dict.h \
+ include/dict0dict.ic \
+ include/dict0load.h \
+ include/dict0load.ic \
+ include/dict0mem.h \
+ include/dict0mem.ic \
+ include/dict0types.h \
+ include/dyn0dyn.h \
+ include/dyn0dyn.ic \
+ include/eval0eval.h \
+ include/eval0eval.ic \
+ include/eval0proc.h \
+ include/eval0proc.ic \
+ include/fil0fil.h \
+ include/fsp0fsp.h \
+ include/fsp0fsp.ic \
+ include/fsp0types.h \
+ include/fut0fut.h \
+ include/fut0fut.ic \
+ include/fut0lst.h \
+ include/fut0lst.ic \
+ include/ha0ha.h \
+ include/ha0ha.ic \
+ include/ha0storage.h \
+ include/ha0storage.ic \
+ include/ha_prototypes.h \
+ include/handler0alter.h \
+ include/hash0hash.h \
+ include/hash0hash.ic \
+ include/ibuf0ibuf.h \
+ include/ibuf0ibuf.ic \
+ include/ibuf0types.h \
+ include/lock0iter.h \
+ include/lock0lock.h \
+ include/lock0lock.ic \
+ include/lock0priv.h \
+ include/lock0priv.ic \
+ include/lock0types.h \
+ include/log0log.h \
+ include/log0log.ic \
+ include/log0recv.h \
+ include/log0recv.ic \
+ include/mach0data.h \
+ include/mach0data.ic \
+ include/mem0dbg.h \
+ include/mem0dbg.ic \
+ include/mem0mem.h \
+ include/mem0mem.ic \
+ include/mem0pool.h \
+ include/mem0pool.ic \
+ include/mtr0log.h \
+ include/mtr0log.ic \
+ include/mtr0mtr.h \
+ include/mtr0mtr.ic \
+ include/mtr0types.h \
+ include/mysql_addons.h \
+ include/os0file.h \
+ include/os0proc.h \
+ include/os0proc.ic \
+ include/os0sync.h \
+ include/os0sync.ic \
+ include/os0thread.h \
+ include/os0thread.ic \
+ include/page0cur.h \
+ include/page0cur.ic \
+ include/page0page.h \
+ include/page0page.ic \
+ include/page0types.h \
+ include/page0zip.h \
+ include/page0zip.ic \
+ include/pars0grm.h \
+ include/pars0opt.h \
+ include/pars0opt.ic \
+ include/pars0pars.h \
+ include/pars0pars.ic \
+ include/pars0sym.h \
+ include/pars0sym.ic \
+ include/pars0types.h \
+ include/que0que.h \
+ include/que0que.ic \
+ include/que0types.h \
+ include/read0read.h \
+ include/read0read.ic \
+ include/read0types.h \
+ include/rem0cmp.h \
+ include/rem0cmp.ic \
+ include/rem0rec.h \
+ include/rem0rec.ic \
+ include/rem0types.h \
+ include/row0ext.h \
+ include/row0ext.ic \
+ include/row0ins.h \
+ include/row0ins.ic \
+ include/row0merge.h \
+ include/row0mysql.h \
+ include/row0mysql.ic \
+ include/row0purge.h \
+ include/row0purge.ic \
+ include/row0row.h \
+ include/row0row.ic \
+ include/row0sel.h \
+ include/row0sel.ic \
+ include/row0types.h \
+ include/row0uins.h \
+ include/row0uins.ic \
+ include/row0umod.h \
+ include/row0umod.ic \
+ include/row0undo.h \
+ include/row0undo.ic \
+ include/row0upd.h \
+ include/row0upd.ic \
+ include/row0vers.h \
+ include/row0vers.ic \
+ include/srv0que.h \
+ include/srv0srv.h \
+ include/srv0srv.ic \
+ include/srv0start.h \
+ include/sync0arr.h \
+ include/sync0arr.ic \
+ include/sync0rw.h \
+ include/sync0rw.ic \
+ include/sync0sync.h \
+ include/sync0sync.ic \
+ include/sync0types.h \
+ include/thr0loc.h \
+ include/thr0loc.ic \
+ include/trx0i_s.h \
+ include/trx0purge.h \
+ include/trx0purge.ic \
+ include/trx0rec.h \
+ include/trx0rec.ic \
+ include/trx0roll.h \
+ include/trx0roll.ic \
+ include/trx0rseg.h \
+ include/trx0rseg.ic \
+ include/trx0sys.h \
+ include/trx0sys.ic \
+ include/trx0trx.h \
+ include/trx0trx.ic \
+ include/trx0types.h \
+ include/trx0undo.h \
+ include/trx0undo.ic \
+ include/trx0xa.h \
+ include/univ.i \
+ include/usr0sess.h \
+ include/usr0sess.ic \
+ include/usr0types.h \
+ include/ut0auxconf.h \
+ include/ut0byte.h \
+ include/ut0byte.ic \
+ include/ut0dbg.h \
+ include/ut0list.h \
+ include/ut0list.ic \
+ include/ut0lst.h \
+ include/ut0mem.h \
+ include/ut0mem.ic \
+ include/ut0rbt.h \
+ include/ut0rnd.h \
+ include/ut0rnd.ic \
+ include/ut0sort.h \
+ include/ut0ut.h \
+ include/ut0ut.ic \
+ include/ut0vec.h \
+ include/ut0vec.ic \
+ include/ut0wqueue.h \
+ handler/innodb_patch_info.h \
+ mem/mem0dbg.c
+
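+# The storage engine is built either as the static convenience library
+# libxtradb.a (when the plugin is configured to be linked into mysqld) or as
+# the loadable module ha_xtradb.la defined further below; configure selects
+# one of the two through the plugin_xtradb_static_target and
+# plugin_xtradb_shared_target substitutions.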
+EXTRA_LIBRARIES= libxtradb.a
+noinst_LIBRARIES= @plugin_xtradb_static_target@
+libxtradb_a_SOURCES= \
+ btr/btr0btr.c \
+ btr/btr0cur.c \
+ btr/btr0pcur.c \
+ btr/btr0sea.c \
+ buf/buf0buddy.c \
+ buf/buf0buf.c \
+ buf/buf0flu.c \
+ buf/buf0lru.c \
+ buf/buf0rea.c \
+ data/data0data.c \
+ data/data0type.c \
+ dict/dict0boot.c \
+ dict/dict0crea.c \
+ dict/dict0dict.c \
+ dict/dict0load.c \
+ dict/dict0mem.c \
+ dyn/dyn0dyn.c \
+ eval/eval0eval.c \
+ eval/eval0proc.c \
+ fil/fil0fil.c \
+ fsp/fsp0fsp.c \
+ fut/fut0fut.c \
+ fut/fut0lst.c \
+ ha/ha0ha.c \
+ ha/ha0storage.c \
+ ha/hash0hash.c \
+ handler/ha_innodb.cc \
+ handler/handler0alter.cc \
+ handler/i_s.cc \
+ handler/mysql_addons.cc \
+ ibuf/ibuf0ibuf.c \
+ lock/lock0iter.c \
+ lock/lock0lock.c \
+ log/log0log.c \
+ log/log0recv.c \
+ mach/mach0data.c \
+ mem/mem0mem.c \
+ mem/mem0pool.c \
+ mtr/mtr0log.c \
+ mtr/mtr0mtr.c \
+ os/os0file.c \
+ os/os0proc.c \
+ os/os0sync.c \
+ os/os0thread.c \
+ page/page0cur.c \
+ page/page0page.c \
+ page/page0zip.c \
+ pars/lexyy.c \
+ pars/pars0grm.c \
+ pars/pars0opt.c \
+ pars/pars0pars.c \
+ pars/pars0sym.c \
+ que/que0que.c \
+ read/read0read.c \
+ rem/rem0cmp.c \
+ rem/rem0rec.c \
+ row/row0ext.c \
+ row/row0ins.c \
+ row/row0merge.c \
+ row/row0mysql.c \
+ row/row0purge.c \
+ row/row0row.c \
+ row/row0sel.c \
+ row/row0uins.c \
+ row/row0umod.c \
+ row/row0undo.c \
+ row/row0upd.c \
+ row/row0vers.c \
+ srv/srv0que.c \
+ srv/srv0srv.c \
+ srv/srv0start.c \
+ sync/sync0arr.c \
+ sync/sync0rw.c \
+ sync/sync0sync.c \
+ thr/thr0loc.c \
+ trx/trx0i_s.c \
+ trx/trx0purge.c \
+ trx/trx0rec.c \
+ trx/trx0roll.c \
+ trx/trx0rseg.c \
+ trx/trx0sys.c \
+ trx/trx0trx.c \
+ trx/trx0undo.c \
+ usr/usr0sess.c \
+ ut/ut0byte.c \
+ ut/ut0dbg.c \
+ ut/ut0list.c \
+ ut/ut0mem.c \
+ ut/ut0rbt.c \
+ ut/ut0rnd.c \
+ ut/ut0ut.c \
+ ut/ut0vec.c \
+ ut/ut0wqueue.c
+
+libxtradb_a_CXXFLAGS= $(AM_CXXFLAGS)
+libxtradb_a_CFLAGS= $(AM_CFLAGS)
+
+EXTRA_LTLIBRARIES= ha_xtradb.la
+pkgplugin_LTLIBRARIES= @plugin_xtradb_shared_target@
+
+ha_xtradb_la_LDFLAGS= -module -rpath $(pkgplugindir) -L$(top_builddir)/libservices -lmysqlservices
+ha_xtradb_la_CXXFLAGS= -shared $(AM_CXXFLAGS) $(INNODB_DYNAMIC_CFLAGS)
+ha_xtradb_la_CFLAGS= -shared $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS)
+ha_xtradb_la_SOURCES= $(libxtradb_a_SOURCES)
+
+EXTRA_DIST= CMakeLists.txt plug.in \
+ pars/make_bison.sh pars/make_flex.sh \
+ pars/pars0grm.y pars/pars0lex.l
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c
new file mode 100644
index 00000000000..ff047095aa4
--- /dev/null
+++ b/storage/xtradb/btr/btr0btr.c
@@ -0,0 +1,3789 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0btr.c
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#ifdef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0zip.h"
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+A tree latch protects all non-leaf nodes of the tree. Each node of a tree
+also has a latch of its own.
+
+A B-tree operation normally first acquires an S-latch on the tree. It
+searches down the tree and releases the tree latch when it has the
+leaf node latch. To save CPU time we do not acquire any latch on
+non-leaf nodes of the tree during a search; those pages are only bufferfixed.
+
+If an operation needs to restructure the tree, it acquires an X-latch on
+the tree before searching to a leaf node. If it needs, for example, to
+split a leaf,
+(1) InnoDB decides the split point in the leaf,
+(2) allocates a new page,
+(3) inserts the appropriate node pointer to the first non-leaf level,
+(4) releases the tree X-latch,
+(5) and then moves records from the leaf to the newly allocated page.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
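+For example, if two consecutive node pointers on a non-leaf level have the
+prefixes P = 'karl' and P1 = 'peter', then the child page of the first
+pointer holds those records of the next lower level which are >= 'karl'
+but < 'peter' in the alphabetical order.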
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
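+
+/* Illustrative example (not from the original description): suppose level 1
+of an index on an integer key holds the node pointers (1, page 5),
+(10, page 7) and (20, page 9). A search for the key 15 follows the last
+node pointer whose prefix is <= 15, that is (10, page 7), and continues
+on page 7, which holds the records with keys in [10, 20). */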
+
+#ifdef UNIV_BTR_DEBUG
+/**************************************************************//**
+Checks a file segment header within a B-tree root page.
+@return TRUE if valid */
+static
+ibool
+btr_root_fseg_validate(
+/*===================*/
+ const fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space) /*!< in: tablespace identifier */
+{
+ ulint offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+ ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
+ ut_a(offset >= FIL_PAGE_DATA);
+ ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
+ return(TRUE);
+}
+#endif /* UNIV_BTR_DEBUG */
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return root page, x-latched */
+static
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint root_page_no;
+ buf_block_t* block;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ root_page_no = dict_index_get_page(index);
+
+ block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(0);
+ }
+ ut_a(block);
+
+ ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
+ == dict_table_is_comp(index->table));
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ const page_t* root = buf_block_get_frame(block);
+
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ return(block);
+}
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return root page, x-latched */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ return(buf_block_get_frame(btr_root_block_get(index, mtr)));
+}
+
+/*************************************************************//**
+Gets pointer to the previous user record in the tree. It is assumed that
+the caller has appropriate latches on the page and its neighbor.
+@return previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
+ needed, also to the previous page */
+{
+ page_t* page;
+ page_t* prev_page;
+ ulint prev_page_no;
+
+ if (!page_rec_is_infimum(rec)) {
+
+ rec_t* prev_rec = page_rec_get_prev(rec);
+
+ if (!page_rec_is_infimum(prev_rec)) {
+
+ return(prev_rec);
+ }
+ }
+
+ page = page_align(rec);
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ if (prev_page_no != FIL_NULL) {
+
+ ulint space;
+ ulint zip_size;
+ buf_block_t* prev_block;
+
+ space = page_get_space_id(page);
+ zip_size = fil_space_get_zip_size(space);
+
+ prev_block = buf_page_get_with_no_latch(space, zip_size,
+ prev_page_no, mtr);
+ prev_page = buf_block_get_frame(prev_block);
+ /* The caller must already have a latch to the brother */
+ ut_ad(mtr_memo_contains(mtr, prev_block,
+ MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, prev_block,
+ MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_page) == page_is_comp(page));
+ ut_a(btr_page_get_next(prev_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Gets pointer to the next user record in the tree. It is assumed that the
+caller has appropriate latches on the page and its neighbor.
+@return next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
+ needed, also to the next page */
+{
+ page_t* page;
+ page_t* next_page;
+ ulint next_page_no;
+
+ if (!page_rec_is_supremum(rec)) {
+
+ rec_t* next_rec = page_rec_get_next(rec);
+
+ if (!page_rec_is_supremum(next_rec)) {
+
+ return(next_rec);
+ }
+ }
+
+ page = page_align(rec);
+ next_page_no = btr_page_get_next(page, mtr);
+
+ if (next_page_no != FIL_NULL) {
+ ulint space;
+ ulint zip_size;
+ buf_block_t* next_block;
+
+ space = page_get_space_id(page);
+ zip_size = fil_space_get_zip_size(space);
+
+ next_block = buf_page_get_with_no_latch(space, zip_size,
+ next_page_no, mtr);
+ next_page = buf_block_get_frame(next_block);
+ /* The caller must already have a latch to the brother */
+ ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, next_block,
+ MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ return(page_rec_get_next(page_get_infimum_rec(next_page)));
+ }
+
+ return(NULL);
+}
+
+/**************************************************************//**
+Creates a new index page (not the root, and also not
+used in page reorganization). @see btr_page_empty(). */
+static
+void
+btr_page_create(
+/*============*/
+ buf_block_t* block, /*!< in/out: page to be created */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_create_zip(block, index, level, mtr);
+ } else {
+ page_create(block, mtr, dict_table_is_comp(index->table));
+ /* Set the level of the new index page */
+ btr_page_set_level(page, NULL, level, mtr);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+
+ btr_page_set_index_id(page, page_zip, index->id, mtr);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages!
+@return new allocated block, x-latched */
+static
+buf_block_t*
+btr_page_alloc_for_ibuf(
+/*====================*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fil_addr_t node_addr;
+ page_t* root;
+ page_t* new_page;
+ buf_block_t* new_block;
+
+ root = btr_root_get(index, mtr);
+
+ node_addr = flst_get_first(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST, mtr);
+ ut_a(node_addr.page != FIL_NULL);
+
+ new_block = buf_page_get(dict_index_get_space(index),
+ dict_table_zip_size(index->table),
+ node_addr.page, RW_X_LATCH, mtr);
+ new_page = buf_block_get_frame(new_block);
+ buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
+
+ flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+ mtr);
+ ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ mtr));
+
+ return(new_block);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index */
+ ulint hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fseg_header_t* seg_header;
+ page_t* root;
+ buf_block_t* new_block;
+ ulint new_page_no;
+
+ if (dict_index_is_ibuf(index)) {
+
+ return(btr_page_alloc_for_ibuf(index, mtr));
+ }
+
+ root = btr_root_get(index, mtr);
+
+ if (level == 0) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ } else {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+ }
+
+ /* Parameter TRUE below states that the caller has made the
+ reservation for free extents, and thus we know that a page can
+ be allocated: */
+
+ new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
+ file_direction, TRUE, mtr);
+ if (new_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ new_block = buf_page_get(dict_index_get_space(index),
+ dict_table_zip_size(index->table),
+ new_page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
+
+ return(new_block);
+}
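+
+/* Note on usage (a sketch of the expected calling pattern, not a rule stated
+here): callers that may split pages, such as the pessimistic insert path,
+are expected to reserve free extents in the tablespace first, typically via
+fsp_reserve_free_extents() in fsp0fsp; the TRUE flag passed to
+fseg_alloc_free_page_general() above relies on that reservation. */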
+
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag) /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+{
+ fseg_header_t* seg_header;
+ page_t* root;
+ ulint n;
+ ulint dummy;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+ root = btr_root_get(index, &mtr);
+
+ if (srv_pass_corrupt_table && !root) {
+ mtr_commit(&mtr);
+ return(0);
+ }
+ ut_a(root);
+
+ if (flag == BTR_N_LEAF_PAGES) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ fseg_n_reserved_pages(seg_header, &n, &mtr);
+
+ } else if (flag == BTR_TOTAL_SIZE) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+ n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);
+
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
+ } else {
+ ut_error;
+ }
+
+ mtr_commit(&mtr);
+
+ return(n);
+}
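+
+/* Note (descriptive only): with BTR_N_LEAF_PAGES the out-parameter of
+fseg_n_reserved_pages() is used, i.e. the number of pages actually used in
+the leaf segment, whereas BTR_TOTAL_SIZE sums the return values, i.e. the
+pages reserved by both segments, which may include pages that are reserved
+but not yet used. */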
+
+/**************************************************************//**
+Frees a page used in an ibuf tree. Puts the page to the free list of the
+ibuf tree. */
+static
+void
+btr_page_free_for_ibuf(
+/*===================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* root;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ root = btr_root_get(index, mtr);
+
+ flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ buf_block_get_frame(block)
+ + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+
+ ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ mtr));
+}
+
+/**************************************************************//**
+Frees a file page used in an index tree. Can also be used for (BLOB)
+external storage pages, because the page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ ulint level, /*!< in: page level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fseg_header_t* seg_header;
+ page_t* root;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* The page gets invalid for optimistic searches: increment the frame
+ modify clock */
+
+ buf_block_modify_clock_inc(block);
+
+ if (dict_index_is_ibuf(index)) {
+
+ btr_page_free_for_ibuf(index, block, mtr);
+
+ return;
+ }
+
+ root = btr_root_get(index, mtr);
+
+ if (level == 0) {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+ } else {
+ seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+ }
+
+ fseg_free_page(seg_header,
+ buf_block_get_space(block),
+ buf_block_get_page_no(block), mtr);
+}
+
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free externally
+stored field (BLOB) pages, because the page must contain info on its
+level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint level;
+
+ level = btr_page_get_level(buf_block_get_frame(block), mtr);
+
+ btr_page_free_low(index, block, level, mtr);
+}
+
+/**************************************************************//**
+Sets the child node file address in a node pointer. */
+UNIV_INLINE
+void
+btr_node_ptr_set_child_page_no(
+/*===========================*/
+ rec_t* rec, /*!< in: node pointer record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint page_no,/*!< in: child node address */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* field;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!page_is_leaf(page_align(rec)));
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == REC_NODE_PTR_SIZE);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_node_ptr(page_zip, rec,
+ rec_offs_data_size(offsets),
+ page_no, mtr);
+ } else {
+ mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
+ }
+}
+
+/************************************************************//**
+Returns the child page of a node pointer and x-latches it.
+@return child page, x-latched */
+static
+buf_block_t*
+btr_node_ptr_get_child(
+/*===================*/
+ const rec_t* node_ptr,/*!< in: node pointer */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_no;
+ ulint space;
+
+ ut_ad(rec_offs_validate(node_ptr, index, offsets));
+ space = page_get_space_id(page_align(node_ptr));
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+
+ return(btr_block_get(space, dict_table_zip_size(index->table),
+ page_no, RW_X_LATCH, mtr));
+}
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+ulint*
+btr_page_get_father_node_ptr_func(
+/*==============================*/
+ ulint* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ btr_cur_t* cursor, /*!< in: cursor pointing to user record,
+ out: cursor on node pointer record,
+ its page x-latched */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dtuple_t* tuple;
+ rec_t* user_rec;
+ rec_t* node_ptr;
+ ulint level;
+ ulint page_no;
+ dict_index_t* index;
+
+ page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+
+ ut_ad(dict_index_get_page(index) != page_no);
+
+ level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
+ user_rec = btr_cur_get_rec(cursor);
+ ut_a(page_rec_is_user_rec(user_rec));
+ tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
+
+ btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
+ BTR_CONT_MODIFY_TREE, cursor, 0,
+ file, line, mtr);
+
+ node_ptr = btr_cur_get_rec(cursor);
+ ut_ad(!page_rec_is_comp(node_ptr)
+ || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
+ offsets = rec_get_offsets(node_ptr, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
+ != page_no)) {
+ rec_t* print_rec;
+ fputs("InnoDB: Dump of the child page:\n", stderr);
+ buf_page_print(page_align(user_rec), 0);
+ fputs("InnoDB: Dump of the parent page:\n", stderr);
+ buf_page_print(page_align(node_ptr), 0);
+
+ fputs("InnoDB: Corruption of an index tree: table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, index->table_name);
+ fputs(", index ", stderr);
+ ut_print_name(stderr, NULL, FALSE, index->name);
+ fprintf(stderr, ",\n"
+ "InnoDB: father ptr page no %lu, child page no %lu\n",
+ (ulong)
+ btr_node_ptr_get_child_page_no(node_ptr, offsets),
+ (ulong) page_no);
+ print_rec = page_rec_get_next(
+ page_get_infimum_rec(page_align(user_rec)));
+ offsets = rec_get_offsets(print_rec, index,
+ offsets, ULINT_UNDEFINED, &heap);
+ page_rec_print(print_rec, offsets);
+ offsets = rec_get_offsets(node_ptr, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(node_ptr, offsets);
+
+ fputs("InnoDB: You should dump + drop + reimport the table"
+ " to fix the\n"
+ "InnoDB: corruption. If the crash happens at "
+ "the database startup, see\n"
+ "InnoDB: " REFMAN "forcing-recovery.html about\n"
+ "InnoDB: forcing recovery. "
+ "Then dump + drop + reimport.\n", stderr);
+
+ ut_error;
+ }
+
+ return(offsets);
+}
+
+#define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
+ btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
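+
+/* The macro above follows the usual _func wrapper pattern: it passes the
+caller's __FILE__ and __LINE__ through to btr_cur_search_to_nth_level()
+inside the function, so that diagnostics can refer to the original call
+site rather than to this file. */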
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree.
+@return rec_get_offsets() of the node pointer record */
+static
+ulint*
+btr_page_get_father_block(
+/*======================*/
+ ulint* offsets,/*!< in: work area for the return value */
+ mem_heap_t* heap, /*!< in: memory heap to use */
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ its page x-latched */
+{
+ rec_t* rec
+ = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
+ block)));
+ btr_cur_position(index, rec, block, cursor);
+ return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
+}
+
+/************************************************************//**
+Seeks to the upper level node pointer to a page.
+It is assumed that mtr holds an x-latch on the tree. */
+static
+void
+btr_page_get_father(
+/*================*/
+ dict_index_t* index, /*!< in: b-tree index */
+ buf_block_t* block, /*!< in: child page in the index */
+ mtr_t* mtr, /*!< in: mtr */
+ btr_cur_t* cursor) /*!< out: cursor on node pointer record,
+ its page x-latched */
+{
+ mem_heap_t* heap;
+ rec_t* rec
+ = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
+ block)));
+ btr_cur_position(index, rec, block, cursor);
+
+ heap = mem_heap_create(100);
+ btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
+ mem_heap_free(heap);
+}
+
+/************************************************************//**
+Creates the root node for a new index tree.
+@return page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+ ulint type, /*!< in: type of the index */
+ ulint space, /*!< in: space where created */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ dulint index_id,/*!< in: index id */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint page_no;
+ buf_block_t* block;
+ buf_frame_t* frame;
+ page_t* page;
+ page_zip_des_t* page_zip;
+
+ /* Create the two new segments (one, in the case of an ibuf tree) for
+ the index tree; the segment headers are put on the allocated root page
+ (for an ibuf tree, not in the root, but on a separate ibuf header
+ page) */
+
+ if (type & DICT_IBUF) {
+ /* Allocate first the ibuf header page */
+ buf_block_t* ibuf_hdr_block = fseg_create(
+ space, 0,
+ IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
+
+ buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);
+
+ ut_ad(buf_block_get_page_no(ibuf_hdr_block)
+ == IBUF_HEADER_PAGE_NO);
+ /* Allocate then the next page to the segment: it will be the
+ tree root page */
+
+ page_no = fseg_alloc_free_page(buf_block_get_frame(
+ ibuf_hdr_block)
+ + IBUF_HEADER
+ + IBUF_TREE_SEG_HEADER,
+ IBUF_TREE_ROOT_PAGE_NO,
+ FSP_UP, mtr);
+ ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
+
+ block = buf_page_get(space, zip_size, page_no,
+ RW_X_LATCH, mtr);
+ } else {
+ block = fseg_create(space, 0,
+ PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
+ }
+
+ if (block == NULL) {
+
+ return(FIL_NULL);
+ }
+
+ page_no = buf_block_get_page_no(block);
+ frame = buf_block_get_frame(block);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+ if (type & DICT_IBUF) {
+ /* It is an insert buffer tree: initialize the free list */
+
+ ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
+
+ flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
+ } else {
+ /* It is a non-ibuf tree: create a file segment for leaf
+ pages */
+ if (!fseg_create(space, page_no,
+ PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
+ /* Not enough space for new segment, free root
+ segment before return. */
+ btr_free_root(space, zip_size, page_no, mtr);
+
+ return(FIL_NULL);
+ }
+
+ /* The fseg create acquires a second latch on the page,
+ therefore we must declare it: */
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+ }
+
+ /* Create a new index page on the allocated segment page */
+ page_zip = buf_block_get_page_zip(block);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page = page_create_zip(block, index, 0, mtr);
+ } else {
+ page = page_create(block, mtr,
+ dict_table_is_comp(index->table));
+ /* Set the level of the new index page */
+ btr_page_set_level(page, NULL, 0, mtr);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+
+ /* Set the index id of the page */
+ btr_page_set_index_id(page, page_zip, index_id, mtr);
+
+ /* Set the next node and previous node fields */
+ btr_page_set_next(page, page_zip, FIL_NULL, mtr);
+ btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
+
+ /* We reset the free bits for the page to allow creation of several
+ trees in the same mtr, otherwise the latch on a bitmap page would
+ prevent it because of the latching order */
+
+ if (!(type & DICT_CLUSTERED)) {
+ ibuf_reset_free_bits(block);
+ }
+
+ /* In the following assertion we test that two records of maximum
+ allowed size fit on the root page: this fact is needed to ensure
+ correctness of split algorithms */
+
+ ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
+
+ return(page_no);
+}
+
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no) /*!< in: root page number */
+{
+ ibool finished;
+ page_t* root;
+ mtr_t mtr;
+
+leaf_loop:
+ mtr_start(&mtr);
+
+ root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
+
+ if (srv_pass_corrupt_table && !root) {
+ mtr_commit(&mtr);
+ return;
+ }
+ ut_a(root);
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* NOTE: page hash indexes are dropped when a page is freed inside
+ fsp0fsp. */
+
+ finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+ &mtr);
+ mtr_commit(&mtr);
+
+ if (!finished) {
+
+ goto leaf_loop;
+ }
+top_loop:
+ mtr_start(&mtr);
+
+ root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
+
+ if (srv_pass_corrupt_table && !root) {
+ mtr_commit(&mtr);
+ return;
+ }
+ ut_a(root);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+ finished = fseg_free_step_not_header(
+ root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+ mtr_commit(&mtr);
+
+ if (!finished) {
+
+ goto top_loop;
+ }
+}
+
+/************************************************************//**
+Frees the B-tree root page. The rest of the tree MUST already have been freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no, /*!< in: root page number */
+ mtr_t* mtr) /*!< in: a mini-transaction which has already
+ been started */
+{
+ buf_block_t* block;
+ fseg_header_t* header;
+
+ block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !block) {
+ return;
+ }
+ ut_a(block);
+
+ btr_search_drop_page_hash_index(block);
+
+ header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_root_fseg_validate(header, space));
+#endif /* UNIV_BTR_DEBUG */
+
+ while (!fseg_free_step(header, mtr));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Reorganizes an index page. */
+static
+ibool
+btr_page_reorganize_low(
+/*====================*/
+ ibool recovery,/*!< in: TRUE if called in recovery:
+ locks should not be updated, i.e.,
+ there cannot exist locks on the
+ page, and a hash index should not be
+ dropped: it cannot exist */
+ buf_block_t* block, /*!< in: page to be reorganized */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+ ulint log_mode;
+ ulint data_size1;
+ ulint data_size2;
+ ulint max_ins_size1;
+ ulint max_ins_size2;
+ ibool success = FALSE;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ data_size1 = page_get_data_size(page);
+ max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
+
+#ifndef UNIV_HOTBACKUP
+ /* Write the log record */
+ mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
+ ? MLOG_COMP_PAGE_REORGANIZE
+ : MLOG_PAGE_REORGANIZE, 0);
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Turn logging off */
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+#ifndef UNIV_HOTBACKUP
+ temp_block = buf_block_alloc(0);
+#else /* !UNIV_HOTBACKUP */
+ ut_ad(block == back_block1);
+ temp_block = back_block2;
+#endif /* !UNIV_HOTBACKUP */
+ temp_page = temp_block->frame;
+
+ /* Copy the old page to temporary space */
+ buf_frame_copy(temp_page, page);
+
+#ifndef UNIV_HOTBACKUP
+ if (UNIV_LIKELY(!recovery)) {
+ btr_search_drop_page_hash_index(block);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, dict_table_is_comp(index->table));
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, temp_block,
+ page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ /* Copy max trx id to recreated page */
+ trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
+ page_set_max_trx_id(block, NULL, max_trx_id, mtr);
+ /* In crash recovery, dict_index_is_sec_or_ibuf() always
+ returns TRUE, even for clustered indexes. max_trx_id is
+ unused in clustered index pages. */
+ ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery);
+ }
+
+ if (UNIV_LIKELY_NULL(page_zip)
+ && UNIV_UNLIKELY
+ (!page_zip_compress(page_zip, page, index, NULL))) {
+
+ /* Restore the old page and exit. */
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the bytes that we skip are identical. */
+ ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+ PAGE_HEADER + PAGE_N_RECS + temp_page,
+ PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+ ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
+ FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+ PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+ memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+ UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ goto func_exit;
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (UNIV_LIKELY(!recovery)) {
+ /* Update the record lock bitmaps */
+ lock_move_reorganize_page(block, temp_block);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ data_size2 = page_get_data_size(page);
+ max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
+
+ if (UNIV_UNLIKELY(data_size1 != data_size2)
+ || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
+ buf_page_print(page, 0);
+ buf_page_print(temp_page, 0);
+ fprintf(stderr,
+ "InnoDB: Error: page old data size %lu"
+ " new data size %lu\n"
+ "InnoDB: Error: page old max ins size %lu"
+ " new max ins size %lu\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ (unsigned long) data_size1, (unsigned long) data_size2,
+ (unsigned long) max_ins_size1,
+ (unsigned long) max_ins_size2);
+ } else {
+ success = TRUE;
+ }
+
+func_exit:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+#ifndef UNIV_HOTBACKUP
+ buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Restore logging mode */
+ mtr_set_log_mode(mtr, log_mode);
+
+ return(success);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Reorganizes an index page.
+IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
+page of a non-clustered index, the caller must update the insert
+buffer free bits in the same mini-transaction in such a way that the
+modification will be redo-logged.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+btr_page_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in: page to be reorganized */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ return(btr_page_reorganize_low(FALSE, block, index, mtr));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)),
+ /*!< in: buffer end */
+ dict_index_t* index, /*!< in: record descriptor */
+ buf_block_t* block, /*!< in: page to be reorganized, or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ /* The record is empty, except for the record initial part */
+
+ if (UNIV_LIKELY(block != NULL)) {
+ btr_page_reorganize_low(TRUE, block, index, mtr);
+ }
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Empties an index page. @see btr_page_create(). */
+static
+void
+btr_page_empty(
+/*===========*/
+ buf_block_t* block, /*!< in: page to be emptied */
+ page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_zip == buf_block_get_page_zip(block));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_create_zip(block, index, level, mtr);
+ } else {
+ page_create(block, mtr, dict_table_is_comp(index->table));
+ btr_page_set_level(page, NULL, level, mtr);
+ }
+
+ block->check_index_page_at_flush = TRUE;
+}
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed;
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ page_t* root;
+ page_t* new_page;
+ ulint new_page_no;
+ rec_t* rec;
+ mem_heap_t* heap;
+ dtuple_t* node_ptr;
+ ulint level;
+ rec_t* node_ptr_rec;
+ page_cur_t* page_cursor;
+ page_zip_des_t* root_page_zip;
+ page_zip_des_t* new_page_zip;
+ buf_block_t* root_block;
+ buf_block_t* new_block;
+
+ root = btr_cur_get_page(cursor);
+ root_block = btr_cur_get_block(cursor);
+ root_page_zip = buf_block_get_page_zip(root_block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!root_page_zip || page_zip_validate(root_page_zip, root));
+#endif /* UNIV_ZIP_DEBUG */
+ index = btr_cur_get_index(cursor);
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ ulint space = dict_index_get_space(index);
+
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+
+ ut_a(dict_index_get_page(index) == page_get_page_no(root));
+#endif /* UNIV_BTR_DEBUG */
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Allocate a new page to the tree. Root splitting is done by first
+ moving the root records to the new page, emptying the root, putting
+ a node pointer to the new page, and then splitting the new page. */
+
+ level = btr_page_get_level(root, mtr);
+
+ new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+ ut_a(!new_page_zip == !root_page_zip);
+ ut_a(!new_page_zip
+ || page_zip_get_size(new_page_zip)
+ == page_zip_get_size(root_page_zip));
+
+ btr_page_create(new_block, new_page_zip, index, level, mtr);
+
+ /* Set the next node and previous node fields of new page */
+ btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
+ btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
+
+ /* Copy the records from root to the new page one by one. */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || new_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || UNIV_UNLIKELY
+ (!page_copy_rec_list_end(new_block, root_block,
+ page_get_infimum_rec(root),
+ index, mtr))) {
+ ut_a(new_page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(new_page_zip, new_page,
+ root_page_zip, root, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(new_block, root_block,
+ page_get_infimum_rec(root));
+
+ btr_search_move_or_delete_hash_entries(new_block, root_block,
+ index);
+ }
+
+ /* If this is a pessimistic insert which is actually done to
+ perform a pessimistic update then we have stored the lock
+ information of the record to be inserted on the infimum of the
+ root page: we cannot discard the lock structs on the root page */
+
+ lock_update_root_raise(new_block, root_block);
+
+ /* Create a memory heap where the node pointer is stored */
+ heap = mem_heap_create(100);
+
+ rec = page_rec_get_next(page_get_infimum_rec(new_page));
+ new_page_no = buf_block_get_page_no(new_block);
+
+ /* Build the node pointer (= node key and page address) for the
+ child */
+
+ node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
+ level);
+ /* The node pointer must be marked as the predefined minimum record,
+ as there is no lower alphabetical limit to records in the leftmost
+ node of a level: */
+ dtuple_set_info_bits(node_ptr,
+ dtuple_get_info_bits(node_ptr)
+ | REC_INFO_MIN_REC_FLAG);
+
+ /* Rebuild the root page to get free space */
+ btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
+
+ /* Set the next node and previous node fields, although
+ they should already have been set. The previous node field
+ must be FIL_NULL if root_page_zip != NULL, because the
+ REC_INFO_MIN_REC_FLAG (of the first user record) will be
+ set if and only if btr_page_get_prev() == FIL_NULL. */
+ btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
+ btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Insert node pointer to the root */
+
+ page_cur_set_before_first(root_block, page_cursor);
+
+ node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
+ index, 0, mtr);
+
+ /* The root page should only contain the node pointer
+ to new_page at this point. Thus, the data should fit. */
+ ut_a(node_ptr_rec);
+
+ /* Free the memory heap */
+ mem_heap_free(heap);
+
+ /* We play safe and reset the free bits for the new page */
+
+#if 0
+ fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
+#endif
+
+ if (!dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(new_block);
+ }
+
+ /* Reposition the cursor to the child node */
+ page_cur_search(new_block, index, tuple,
+ PAGE_CUR_LE, page_cursor);
+
+ /* Split the child and insert tuple */
+ return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
+}
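+
+/* Illustrative sketch of a root raise (levels shown in brackets):
+
+	before:   root [L]: r1 r2 ... rn
+	after:    root [L+1]: node pointer (min rec flag) -> new page
+	          new page [L]: r1 r2 ... rn
+
+after which the new page is split by btr_page_split_and_insert() and the
+tuple is inserted into one of its halves. */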
+
+/*************************************************************//**
+Decides if the page should be split at the convergence point of inserts
+converging to the left.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec) /*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple to be inserted should
+ be first */
+{
+ page_t* page;
+ rec_t* insert_point;
+ rec_t* infimum;
+
+ page = btr_cur_get_page(cursor);
+ insert_point = btr_cur_get_rec(cursor);
+
+ if (page_header_get_ptr(page, PAGE_LAST_INSERT)
+ == page_rec_get_next(insert_point)) {
+
+ infimum = page_get_infimum_rec(page);
+
+		/* If the convergence is in the middle of a page, also
+		include in the upper page the record immediately before
+		the new insert. Otherwise, we could repeatedly move from
+		page to page lots of records smaller than the convergence
+		point. */
+
+ if (infimum != insert_point
+ && page_rec_get_next(infimum) != insert_point) {
+
+ *split_rec = insert_point;
+ } else {
+ *split_rec = page_rec_get_next(insert_point);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Decides if the page should be split at the convergence point of inserts
+converging to the right.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec) /*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple to be inserted should
+ be first */
+{
+ page_t* page;
+ rec_t* insert_point;
+
+ page = btr_cur_get_page(cursor);
+ insert_point = btr_cur_get_rec(cursor);
+
+ /* We use eager heuristics: if the new insert would be right after
+ the previous insert on the same page, we assume that there is a
+ pattern of sequential inserts here. */
+
+ if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
+ == insert_point)) {
+
+ rec_t* next_rec;
+
+ next_rec = page_rec_get_next(insert_point);
+
+ if (page_rec_is_supremum(next_rec)) {
+split_at_new:
+ /* Split at the new record to insert */
+ *split_rec = NULL;
+ } else {
+ rec_t* next_next_rec = page_rec_get_next(next_rec);
+ if (page_rec_is_supremum(next_next_rec)) {
+
+ goto split_at_new;
+ }
+
+ /* If there are >= 2 user records up from the insert
+ point, split all but 1 off. We want to keep one because
+ then sequential inserts can use the adaptive hash
+ index, as they can do the necessary checks of the right
+ search position just by looking at the records on this
+ page. */
+
+ *split_rec = next_next_rec;
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
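+
+/* Illustrative example: with a monotonically increasing key, every insert
+lands right after PAGE_LAST_INSERT, so the heuristic above splits at (or
+near) the new record. The left page stays essentially full and subsequent
+inserts go to the new right page, instead of repeatedly splitting half-full
+pages. */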
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ page_t* page;
+ page_zip_des_t* page_zip;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ ulint total_space;
+ ulint incl_data;
+ rec_t* ins_rec;
+ rec_t* rec;
+ rec_t* next_rec;
+ ulint n;
+ mem_heap_t* heap;
+ ulint* offsets;
+
+ page = btr_cur_get_page(cursor);
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ page_zip = btr_cur_get_page_zip(cursor);
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ /* Estimate the free space of an empty compressed page. */
+ ulint free_space_zip = page_zip_empty_size(
+ cursor->index->n_fields,
+ page_zip_get_size(page_zip));
+
+ if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
+ free_space = (ulint) free_space_zip;
+ }
+ }
+
+ /* free_space is now the free space of a created new page */
+
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = page_get_n_recs(page) + 1;
+ ut_ad(total_n_recs >= 2);
+ total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+ n = 0;
+ incl_data = 0;
+ ins_rec = btr_cur_get_rec(cursor);
+ rec = page_get_infimum_rec(page);
+
+ heap = NULL;
+ offsets = NULL;
+
+	/* We start to include records in the left half, and we stop
+	when the space reserved by them exceeds half of total_space.
+	If the included records then fit on the left page, they will be
+	put there, provided something is also left over for the right
+	page; otherwise the last included record will be the first
+	record on the right half-page. */
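+
+	/* Illustrative example: if total_space is 16000 bytes and every
+	record (the tuple included, counted at its insert position)
+	reserves roughly 120 bytes of data plus directory space, the
+	loop below stops once about 8000 bytes are included, i.e. after
+	some 67 records, and the next record normally becomes the first
+	record of the right half-page. */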
+
+ do {
+ /* Decide the next record to include */
+ if (rec == ins_rec) {
+ rec = NULL; /* NULL denotes that tuple is
+ now included */
+ } else if (rec == NULL) {
+ rec = page_rec_get_next(ins_rec);
+ } else {
+ rec = page_rec_get_next(rec);
+ }
+
+ if (rec == NULL) {
+ /* Include tuple */
+ incl_data += insert_size;
+ } else {
+ offsets = rec_get_offsets(rec, cursor->index,
+ offsets, ULINT_UNDEFINED,
+ &heap);
+ incl_data += rec_offs_size(offsets);
+ }
+
+ n++;
+ } while (incl_data + page_dir_calc_reserved_space(n)
+ < total_space / 2);
+
+ if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
+ /* The next record will be the first on
+ the right half page if it is not the
+ supremum record of page */
+
+ if (rec == ins_rec) {
+ rec = NULL;
+
+ goto func_exit;
+ } else if (rec == NULL) {
+ next_rec = page_rec_get_next(ins_rec);
+ } else {
+ next_rec = page_rec_get_next(rec);
+ }
+ ut_ad(next_rec);
+ if (!page_rec_is_supremum(next_rec)) {
+ rec = next_rec;
+ }
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(rec);
+}
+
+/*************************************************************//**
+Returns TRUE if the insert fits on the appropriate half-page with the
+chosen split_rec.
+@return TRUE if fits */
+static
+ibool
+btr_page_insert_fits(
+/*=================*/
+ btr_cur_t* cursor, /*!< in: cursor at which insert
+ should be made */
+ const rec_t* split_rec,/*!< in: suggestion for first record
+ on upper half-page, or NULL if
+ tuple to be inserted should be first */
+ const ulint* offsets,/*!< in: rec_get_offsets(
+ split_rec, cursor->index) */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ page_t* page;
+ ulint insert_size;
+ ulint free_space;
+ ulint total_data;
+ ulint total_n_recs;
+ const rec_t* rec;
+ const rec_t* end_rec;
+ ulint* offs;
+
+ page = btr_cur_get_page(cursor);
+
+ ut_ad(!split_rec == !offsets);
+ ut_ad(!offsets
+ || !page_is_comp(page) == !rec_offs_comp(offsets));
+ ut_ad(!offsets
+ || rec_offs_validate(split_rec, cursor->index, offsets));
+
+ insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ /* free_space is now the free space of a created new page */
+
+ total_data = page_get_data_size(page) + insert_size;
+ total_n_recs = page_get_n_recs(page) + 1;
+
+	/* We determine which records (from rec to end_rec, not including
+	end_rec) will end up on the half-page other than the one where
+	the tuple is inserted. */
+
+ if (split_rec == NULL) {
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ end_rec = split_rec;
+ } else {
+ rec = split_rec;
+ end_rec = page_get_supremum_rec(page);
+ }
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(TRUE);
+ }
+
+ offs = NULL;
+
+ while (rec != end_rec) {
+ /* In this loop we calculate the amount of reserved
+ space after rec is removed from page. */
+
+ offs = rec_get_offsets(rec, cursor->index, offs,
+ ULINT_UNDEFINED, &heap);
+
+ total_data -= rec_offs_size(offs);
+ total_n_recs--;
+
+ if (total_data + page_dir_calc_reserved_space(total_n_recs)
+ <= free_space) {
+
+ /* Ok, there will be enough available space on the
+ half page where the tuple is inserted */
+
+ return(TRUE);
+ }
+
+ rec = page_rec_get_next_const(rec);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ big_rec_t* dummy_big_rec;
+ btr_cur_t cursor;
+ ulint err;
+ rec_t* rec;
+
+ ut_ad(level > 0);
+
+ btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
+ BTR_CONT_MODIFY_TREE,
+ &cursor, 0, file, line, mtr);
+
+ err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ &cursor, tuple, &rec,
+ &dummy_big_rec, 0, NULL, mtr);
+ ut_a(err == DB_SUCCESS);
+}
+
+/**************************************************************//**
+Attaches the halves of an index page on the appropriate level in an
+index tree. */
+static
+void
+btr_attach_half_pages(
+/*==================*/
+ dict_index_t* index, /*!< in: the index tree */
+ buf_block_t* block, /*!< in/out: page to be split */
+ rec_t* split_rec, /*!< in: first record on upper
+ half page */
+ buf_block_t* new_block, /*!< in/out: the new half page */
+ ulint direction, /*!< in: FSP_UP or FSP_DOWN */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+ ulint next_page_no;
+ ulint level;
+ page_t* page = buf_block_get_frame(block);
+ page_t* lower_page;
+ page_t* upper_page;
+ ulint lower_page_no;
+ ulint upper_page_no;
+ page_zip_des_t* lower_page_zip;
+ page_zip_des_t* upper_page_zip;
+ dtuple_t* node_ptr_upper;
+ mem_heap_t* heap;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Create a memory heap where the data tuple is stored */
+ heap = mem_heap_create(1024);
+
+ /* Based on split direction, decide upper and lower pages */
+ if (direction == FSP_DOWN) {
+
+ btr_cur_t cursor;
+ ulint* offsets;
+
+ lower_page = buf_block_get_frame(new_block);
+ lower_page_no = buf_block_get_page_no(new_block);
+ lower_page_zip = buf_block_get_page_zip(new_block);
+ upper_page = buf_block_get_frame(block);
+ upper_page_no = buf_block_get_page_no(block);
+ upper_page_zip = buf_block_get_page_zip(block);
+
+ /* Look up the index for the node pointer to page */
+ offsets = btr_page_get_father_block(NULL, heap, index,
+ block, mtr, &cursor);
+
+ /* Replace the address of the old child node (= page) with the
+ address of the new lower half */
+
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_rec(&cursor),
+ btr_cur_get_page_zip(&cursor),
+ offsets, lower_page_no, mtr);
+ mem_heap_empty(heap);
+ } else {
+ lower_page = buf_block_get_frame(block);
+ lower_page_no = buf_block_get_page_no(block);
+ lower_page_zip = buf_block_get_page_zip(block);
+ upper_page = buf_block_get_frame(new_block);
+ upper_page_no = buf_block_get_page_no(new_block);
+ upper_page_zip = buf_block_get_page_zip(new_block);
+ }
+
+ /* Get the level of the split pages */
+ level = btr_page_get_level(buf_block_get_frame(block), mtr);
+ ut_ad(level
+ == btr_page_get_level(buf_block_get_frame(new_block), mtr));
+
+ /* Build the node pointer (= node key and page address) for the upper
+ half */
+
+ node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
+ upper_page_no, heap, level);
+
+ /* Insert it next to the pointer to the lower half. Note that this
+ may generate recursion leading to a split on the higher level. */
+
+ btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
+
+ /* Free the memory heap */
+ mem_heap_free(heap);
+
+ /* Get the previous and next pages of page */
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+ next_page_no = btr_page_get_next(page, mtr);
+ space = buf_block_get_space(block);
+ zip_size = buf_block_get_zip_size(block);
+
+ /* Update page links of the level */
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block = btr_block_get(space, zip_size,
+ prev_page_no,
+ RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
+ ut_a(btr_page_get_next(prev_block->frame, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_next(buf_block_get_frame(prev_block),
+ buf_block_get_page_zip(prev_block),
+ lower_page_no, mtr);
+ }
+
+ if (next_page_no != FIL_NULL) {
+ buf_block_t* next_block = btr_block_get(space, zip_size,
+ next_page_no,
+ RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_prev(buf_block_get_frame(next_block),
+ buf_block_get_page_zip(next_block),
+ upper_page_no, mtr);
+ }
+
+ btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
+ btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
+
+ btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
+ btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
+}
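+
+/* Illustrative sketch of the sibling links updated above (FSP_UP case,
+where the original page becomes the lower half and the new page the
+upper half):
+
+	before:   prev <-> page <-> next
+	after:    prev <-> lower (= page) <-> upper (= new page) <-> next
+
+In the FSP_DOWN case the new page becomes the lower half, and the father
+node pointer of the original page is redirected to it. */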
+
+/*************************************************************//**
+Determine if a tuple is smaller than any record on the page.
+@return TRUE if smaller */
+static
+ibool
+btr_page_tuple_smaller(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: b-tree cursor */
+ const dtuple_t* tuple, /*!< in: tuple to consider */
+ ulint* offsets,/*!< in/out: temporary storage */
+ ulint n_uniq, /*!< in: number of unique fields
+ in the index page records */
+ mem_heap_t** heap) /*!< in/out: heap for offsets */
+{
+ buf_block_t* block;
+ const rec_t* first_rec;
+ page_cur_t pcur;
+
+ /* Read the first user record in the page. */
+ block = btr_cur_get_block(cursor);
+ page_cur_set_before_first(block, &pcur);
+ page_cur_move_to_next(&pcur);
+ first_rec = page_cur_get_rec(&pcur);
+
+ offsets = rec_get_offsets(
+ first_rec, cursor->index, offsets,
+ n_uniq, heap);
+
+ return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0);
+}
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch on the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed; we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ ulint page_no;
+ byte direction;
+ ulint hint_page_no;
+ buf_block_t* new_block;
+ page_t* new_page;
+ page_zip_des_t* new_page_zip;
+ rec_t* split_rec;
+ buf_block_t* left_block;
+ buf_block_t* right_block;
+ buf_block_t* insert_block;
+ page_t* insert_page;
+ page_cur_t* page_cursor;
+ rec_t* first_rec;
+ byte* buf = 0; /* remove warning */
+ rec_t* move_limit;
+ ibool insert_will_fit;
+ ibool insert_left;
+ ulint n_iterations = 0;
+ rec_t* rec;
+ mem_heap_t* heap;
+ ulint n_uniq;
+ ulint* offsets;
+
+ heap = mem_heap_create(1024);
+ n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+func_start:
+ mem_heap_empty(heap);
+ offsets = NULL;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_get_n_recs(page) >= 1);
+
+ page_no = buf_block_get_page_no(block);
+
+ /* 1. Decide the split record; split_rec == NULL means that the
+ tuple to be inserted should be the first record on the upper
+ half-page */
+ insert_left = FALSE;
+
+ if (n_iterations > 0) {
+ direction = FSP_UP;
+ hint_page_no = page_no + 1;
+ split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
+
+ if (UNIV_UNLIKELY(split_rec == NULL)) {
+ insert_left = btr_page_tuple_smaller(
+ cursor, tuple, offsets, n_uniq, &heap);
+ }
+ } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+ direction = FSP_UP;
+ hint_page_no = page_no + 1;
+
+ } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
+ direction = FSP_DOWN;
+ hint_page_no = page_no - 1;
+ ut_ad(split_rec);
+ } else {
+ direction = FSP_UP;
+ hint_page_no = page_no + 1;
+
+ /* If there is only one record in the index page, we
+ can't split the node in the middle by default. We need
+ to determine whether the new record will be inserted
+ to the left or right. */
+
+ if (page_get_n_recs(page) > 1) {
+ split_rec = page_get_middle_rec(page);
+ } else if (btr_page_tuple_smaller(cursor, tuple,
+ offsets, n_uniq, &heap)) {
+ split_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+ } else {
+ split_rec = NULL;
+ }
+ }
+
+ /* 2. Allocate a new page to the index */
+ new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
+ btr_page_get_level(page, mtr), mtr);
+ new_page = buf_block_get_frame(new_block);
+ new_page_zip = buf_block_get_page_zip(new_block);
+ btr_page_create(new_block, new_page_zip, cursor->index,
+ btr_page_get_level(page, mtr), mtr);
+
+	/* 3. Calculate the first record on the upper half-page, and the
+	first record (move_limit) on the original page which ends up on
+	the upper half */
+
+ if (split_rec) {
+ first_rec = move_limit = split_rec;
+
+ offsets = rec_get_offsets(split_rec, cursor->index, offsets,
+ n_uniq, &heap);
+
+ insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
+
+ if (UNIV_UNLIKELY(!insert_left && new_page_zip
+ && n_iterations > 0)) {
+ /* If a compressed page has already been split,
+ avoid further splits by inserting the record
+ to an empty page. */
+ split_rec = NULL;
+ goto insert_empty;
+ }
+ } else if (UNIV_UNLIKELY(insert_left)) {
+ ut_a(n_iterations > 0);
+ first_rec = page_rec_get_next(page_get_infimum_rec(page));
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ } else {
+insert_empty:
+ ut_ad(!split_rec);
+ ut_ad(!insert_left);
+ buf = mem_alloc(rec_get_converted_size(cursor->index,
+ tuple, n_ext));
+
+ first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
+ tuple, n_ext);
+ move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+ }
+
+ /* 4. Do first the modifications in the tree structure */
+
+ btr_attach_half_pages(cursor->index, block,
+ first_rec, new_block, direction, mtr);
+
+ /* If the split is made on the leaf level and the insert will fit
+ on the appropriate half-page, we may release the tree x-latch.
+ We can then move the records after releasing the tree latch,
+ thus reducing the tree latch contention. */
+
+ if (split_rec) {
+ insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, split_rec,
+ offsets, tuple, n_ext, heap);
+ } else {
+ if (!insert_left) {
+ mem_free(buf);
+ buf = NULL;
+ }
+
+ insert_will_fit = !new_page_zip
+ && btr_page_insert_fits(cursor, NULL,
+ NULL, tuple, n_ext, heap);
+ }
+
+ if (insert_will_fit && page_is_leaf(page)) {
+
+ mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
+ MTR_MEMO_X_LOCK);
+ }
+
+ /* 5. Move then the records to the new page */
+ if (direction == FSP_DOWN) {
+ /* fputs("Split left\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || UNIV_UNLIKELY
+ (!page_move_rec_list_start(new_block, block, move_limit,
+ cursor->index, mtr))) {
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_page_zip, new_page,
+ page_zip, page, cursor->index, mtr);
+ page_delete_rec_list_end(move_limit - page + new_page,
+ new_block, cursor->index,
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_start(
+ new_block, block, move_limit,
+ new_page + PAGE_NEW_INFIMUM);
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block, cursor->index);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_start(move_limit, block,
+ cursor->index, mtr);
+ }
+
+ left_block = new_block;
+ right_block = block;
+
+ lock_update_split_left(right_block, left_block);
+ } else {
+ /* fputs("Split right\n", stderr); */
+
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || page_zip
+#endif /* UNIV_ZIP_COPY */
+ || UNIV_UNLIKELY
+ (!page_move_rec_list_end(new_block, block, move_limit,
+ cursor->index, mtr))) {
+ /* For some reason, compressing new_page failed,
+ even though it should contain fewer records than
+ the original page. Copy the page byte for byte
+ and then delete the records from both pages
+ as appropriate. Deleting will always succeed. */
+ ut_a(new_page_zip);
+
+ page_zip_copy_recs(new_page_zip, new_page,
+ page_zip, page, cursor->index, mtr);
+ page_delete_rec_list_start(move_limit - page
+ + new_page, new_block,
+ cursor->index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(new_block, block, move_limit);
+
+ btr_search_move_or_delete_hash_entries(
+ new_block, block, cursor->index);
+
+ /* Delete the records from the source page. */
+
+ page_delete_rec_list_end(move_limit, block,
+ cursor->index,
+ ULINT_UNDEFINED,
+ ULINT_UNDEFINED, mtr);
+ }
+
+ left_block = block;
+ right_block = new_block;
+
+ lock_update_split_right(right_block, left_block);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ ut_a(page_zip_validate(page_zip, page));
+ ut_a(page_zip_validate(new_page_zip, new_page));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* At this point, split_rec, move_limit and first_rec may point
+ to garbage on the old page. */
+
+ /* 6. The split and the tree modification is now completed. Decide the
+ page where the tuple should be inserted */
+
+ if (insert_left) {
+ insert_block = left_block;
+ } else {
+ insert_block = right_block;
+ }
+
+ insert_page = buf_block_get_frame(insert_block);
+
+ /* 7. Reposition the cursor for insert and try insertion */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_search(insert_block, cursor->index, tuple,
+ PAGE_CUR_LE, page_cursor);
+
+ rec = page_cur_tuple_insert(page_cursor, tuple,
+ cursor->index, n_ext, mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* insert_page_zip
+ = buf_block_get_page_zip(insert_block);
+ ut_a(!insert_page_zip
+ || page_zip_validate(insert_page_zip, insert_page));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (UNIV_LIKELY(rec != NULL)) {
+
+ goto func_exit;
+ }
+
+ /* 8. If insert did not fit, try page reorganization */
+
+ if (UNIV_UNLIKELY
+ (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
+
+ goto insert_failed;
+ }
+
+ page_cur_search(insert_block, cursor->index, tuple,
+ PAGE_CUR_LE, page_cursor);
+ rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+ n_ext, mtr);
+
+ if (UNIV_UNLIKELY(rec == NULL)) {
+ /* The insert did not fit on the page: loop back to the
+ start of the function for a new split */
+insert_failed:
+ /* We play safe and reset the free bits for new_page */
+ if (!dict_index_is_clust(cursor->index)) {
+ ibuf_reset_free_bits(new_block);
+ }
+
+ /* fprintf(stderr, "Split second round %lu\n",
+ page_get_page_no(page)); */
+ n_iterations++;
+ ut_ad(n_iterations < 2
+ || buf_block_get_page_zip(insert_block));
+ ut_ad(!insert_will_fit);
+
+ goto func_start;
+ }
+
+func_exit:
+ /* Insert fit on the page: update the free bits for the
+ left and right pages in the same mtr */
+
+ if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
+ ibuf_update_free_bits_for_two_pages_low(
+ buf_block_get_zip_size(left_block),
+ left_block, right_block, mtr);
+ }
+
+#if 0
+ fprintf(stderr, "Split and insert done %lu %lu\n",
+ buf_block_get_page_no(left_block),
+ buf_block_get_page_no(right_block));
+#endif
+
+ ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
+ ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
+
+ mem_heap_free(heap);
+ return(rec);
+}
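+
+/* Illustrative sketch only, not part of the original sources: a caller on
+the pessimistic insert path is expected to hold an x-latch on the index tree
+and on the leaf page, and to have guaranteed enough free pages (see the
+comment above the function), before attempting the split.  Roughly:
+
+	mtr_start(&mtr);
+	mtr_x_lock(dict_index_get_lock(index), &mtr);
+	... position 'cursor' on the leaf page, x-latching the page ...
+	rec = btr_page_split_and_insert(cursor, tuple, n_ext, &mtr);
+	mtr_commit(&mtr);
+
+Afterwards the cursor is positioned on the predecessor of the inserted
+record. */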
+
+/*************************************************************//**
+Removes a page from the level list of pages. */
+static
+void
+btr_level_list_remove(
+/*==================*/
+ ulint space, /*!< in: space where removed */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* page, /*!< in: page to remove */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint prev_page_no;
+ ulint next_page_no;
+
+ ut_ad(page && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(space == page_get_space_id(page));
+ /* Get the previous and next page numbers of page */
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+ next_page_no = btr_page_get_next(page, mtr);
+
+ /* Update page links of the level */
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block
+ = btr_block_get(space, zip_size, prev_page_no,
+ RW_X_LATCH, mtr);
+ page_t* prev_page
+ = buf_block_get_frame(prev_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(prev_page) == page_is_comp(page));
+ ut_a(btr_page_get_next(prev_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_next(prev_page,
+ buf_block_get_page_zip(prev_block),
+ next_page_no, mtr);
+ }
+
+ if (next_page_no != FIL_NULL) {
+ buf_block_t* next_block
+ = btr_block_get(space, zip_size, next_page_no,
+ RW_X_LATCH, mtr);
+ page_t* next_page
+ = buf_block_get_frame(next_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_set_prev(next_page,
+ buf_block_get_page_zip(next_block),
+ prev_page_no, mtr);
+ }
+}
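+
+/* In effect, btr_level_list_remove() is the usual doubly-linked list unlink,
+applied to the FIL_PAGE_PREV and FIL_PAGE_NEXT fields of the neighbouring
+pages, with FIL_NULL playing the role of the null pointer:
+
+	prev_page->FIL_PAGE_NEXT = next_page_no;
+	next_page->FIL_PAGE_PREV = prev_page_no;
+
+(pseudo-code only: the real updates go through btr_page_set_next() and
+btr_page_set_prev(), so that the changes are logged within the mtr). */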
+
+/****************************************************************//**
+Writes the redo log record for setting an index record as the predefined
+minimum record. */
+UNIV_INLINE
+void
+btr_set_min_rec_mark_log(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ byte type, /*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(rec, type, mtr);
+
+ /* Write rec offset as a 2-byte ulint */
+ mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
+}
+#else /* !UNIV_HOTBACKUP */
+# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ rec_t* rec;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ ut_a(!page_is_comp(page) == !comp);
+
+ rec = page + mach_read_from_2(ptr);
+
+ btr_set_min_rec_mark(rec, mtr);
+ }
+
+ return(ptr + 2);
+}
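+
+/* The body of an MLOG_REC_MIN_MARK or MLOG_COMP_REC_MIN_MARK record is thus
+nothing but the 2-byte offset of the record within its page, appended after
+the initial log record header:
+
+	[ initial log record header | rec offset within page : 2 bytes ]
+
+which is why the parser above only needs mach_read_from_2(ptr). */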
+
+/****************************************************************//**
+Sets a record as the predefined minimum record. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+ rec_t* rec, /*!< in: record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint info_bits;
+
+ if (UNIV_LIKELY(page_rec_is_comp(rec))) {
+ info_bits = rec_get_info_bits(rec, TRUE);
+
+ rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
+
+ btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
+ } else {
+ info_bits = rec_get_info_bits(rec, FALSE);
+
+ rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
+
+ btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes the node pointer to a page on the upper level. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page whose node pointer is deleted */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t cursor;
+ ibool compressed;
+ ulint err;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ /* Delete node pointer on father page */
+ btr_page_get_father(index, block, mtr, &cursor);
+
+ compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
+ mtr);
+ ut_a(err == DB_SUCCESS);
+
+ if (!compressed) {
+ btr_cur_compress_if_useful(&cursor, mtr);
+ }
+}
+
+/*************************************************************//**
+If the page is the only one on its level, this function moves its records
+to the father page, thus reducing the tree height. */
+static
+void
+btr_lift_page_up(
+/*=============*/
+ dict_index_t* index, /*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only one on its level;
+ must not be empty: use
+ btr_discard_only_page_on_level if the last
+ record from the page should be removed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* father_block;
+ page_t* father_page;
+ ulint page_level;
+ page_zip_des_t* father_page_zip;
+ page_t* page = buf_block_get_frame(block);
+ ulint root_page_no;
+ buf_block_t* blocks[BTR_MAX_LEVELS];
+ ulint n_blocks; /*!< last used index in blocks[] */
+ ulint i;
+
+ ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ page_level = btr_page_get_level(page, mtr);
+ root_page_no = dict_index_get_page(index);
+
+ {
+ btr_cur_t cursor;
+ mem_heap_t* heap = mem_heap_create(100);
+ ulint* offsets;
+ buf_block_t* b;
+
+ offsets = btr_page_get_father_block(NULL, heap, index,
+ block, mtr, &cursor);
+ father_block = btr_cur_get_block(&cursor);
+ father_page_zip = buf_block_get_page_zip(father_block);
+ father_page = buf_block_get_frame(father_block);
+
+ n_blocks = 0;
+
+ /* Store all ancestor pages so we can reset their
+ levels later on. We have to do all the searches on
+ the tree now because later on, after we've replaced
+ the first level, the tree is in an inconsistent state
+ and can not be searched. */
+ for (b = father_block;
+ buf_block_get_page_no(b) != root_page_no; ) {
+ ut_a(n_blocks < BTR_MAX_LEVELS);
+
+ offsets = btr_page_get_father_block(offsets, heap,
+ index, b,
+ mtr, &cursor);
+
+ blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Make the father empty */
+ btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+
+ /* Copy the records to the father page one by one. */
+ if (0
+#ifdef UNIV_ZIP_COPY
+ || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+ || UNIV_UNLIKELY
+ (!page_copy_rec_list_end(father_block, block,
+ page_get_infimum_rec(page),
+ index, mtr))) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(father_page_zip);
+ ut_a(page_zip);
+
+ /* Copy the page byte for byte. */
+ page_zip_copy_recs(father_page_zip, father_page,
+ page_zip, page, index, mtr);
+
+ /* Update the lock table and possible hash index. */
+
+ lock_move_rec_list_end(father_block, block,
+ page_get_infimum_rec(page));
+
+ btr_search_move_or_delete_hash_entries(father_block, block,
+ index);
+ }
+
+ lock_update_copy_and_discard(father_block, block);
+
+ /* Go upward to root page, decrementing levels by one. */
+ for (i = 0; i < n_blocks; i++, page_level++) {
+ page_t* page = buf_block_get_frame(blocks[i]);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]);
+
+ ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
+
+ btr_page_set_level(page, page_zip, page_level, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ /* We play it safe and reset the free bits for the father */
+ if (!dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(father_block);
+ }
+ ut_ad(page_validate(father_page, index));
+ ut_ad(btr_check_node_ptr(index, father_block, mtr));
+}
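+
+/* Informal summary of btr_lift_page_up(), descriptive only: the father page
+is emptied and re-created at the level of its only child, the child's records
+are copied into it, the levels stored in the remaining ancestor pages are
+adjusted downwards, and the child page is freed. */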
+
+/*************************************************************//**
+Tries to merge the page first to its immediate left brother, if such a
+brother exists and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, it looks at the right brother. If the page is the only one on
+that level, it lifts the records of the page to the father page, thus
+reducing the tree height. It is assumed that mtr holds an x-latch on the
+tree and on the page. If the cursor is on the leaf level, mtr must also
+hold x-latches on the brothers, if they exist.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift;
+ the page must not be empty: in record delete
+ use btr_discard_page if the page would become
+ empty */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ ulint space;
+ ulint zip_size;
+ ulint left_page_no;
+ ulint right_page_no;
+ buf_block_t* merge_block;
+ page_t* merge_page;
+ page_zip_des_t* merge_page_zip;
+ ibool is_left;
+ buf_block_t* block;
+ page_t* page;
+ btr_cur_t father_cursor;
+ mem_heap_t* heap;
+ ulint* offsets;
+ ulint data_size;
+ ulint n_recs;
+ ulint max_ins_size;
+ ulint max_ins_size_reorg;
+ ulint level;
+
+ block = btr_cur_get_block(cursor);
+ page = btr_cur_get_page(cursor);
+ index = btr_cur_get_index(cursor);
+ ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ level = btr_page_get_level(page, mtr);
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ left_page_no = btr_page_get_prev(page, mtr);
+ right_page_no = btr_page_get_next(page, mtr);
+
+#if 0
+ fprintf(stderr, "Merge left page %lu right %lu \n",
+ left_page_no, right_page_no);
+#endif
+
+ heap = mem_heap_create(100);
+ offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
+ &father_cursor);
+
+ /* Decide the page to which we try to merge and which will inherit
+ the locks */
+
+ is_left = left_page_no != FIL_NULL;
+
+ if (is_left) {
+
+ merge_block = btr_block_get(space, zip_size, left_page_no,
+ RW_X_LATCH, mtr);
+ merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(merge_page, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+ } else if (right_page_no != FIL_NULL) {
+
+ merge_block = btr_block_get(space, zip_size, right_page_no,
+ RW_X_LATCH, mtr);
+ merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_prev(merge_page, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+ } else {
+ /* The page is the only one on the level, lift the records
+ to the father */
+ btr_lift_page_up(index, block, mtr);
+ mem_heap_free(heap);
+ return(TRUE);
+ }
+
+ n_recs = page_get_n_recs(page);
+ data_size = page_get_data_size(page);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(merge_page) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
+ merge_page, n_recs);
+ if (data_size > max_ins_size_reorg) {
+
+ /* No space for merge */
+err_exit:
+ /* We play it safe and reset the free bits. */
+ if (zip_size
+ && page_is_leaf(merge_page)
+ && !dict_index_is_clust(index)) {
+ ibuf_reset_free_bits(merge_block);
+ }
+
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ ut_ad(page_validate(merge_page, index));
+
+ max_ins_size = page_get_max_insert_size(merge_page, n_recs);
+
+ if (UNIV_UNLIKELY(data_size > max_ins_size)) {
+
+ /* We have to reorganize merge_page */
+
+ if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
+ index, mtr))) {
+
+ goto err_exit;
+ }
+
+ max_ins_size = page_get_max_insert_size(merge_page, n_recs);
+
+ ut_ad(page_validate(merge_page, index));
+ ut_ad(max_ins_size == max_ins_size_reorg);
+
+ if (UNIV_UNLIKELY(data_size > max_ins_size)) {
+
+ /* Add fault tolerance, though this should
+ never happen */
+
+ goto err_exit;
+ }
+ }
+
+ merge_page_zip = buf_block_get_page_zip(merge_block);
+#ifdef UNIV_ZIP_DEBUG
+ if (UNIV_LIKELY_NULL(merge_page_zip)) {
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+ ut_a(page_zip_validate(merge_page_zip, merge_page));
+ ut_a(page_zip_validate(page_zip, page));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* Move records to the merge page */
+ if (is_left) {
+ rec_t* orig_pred = page_copy_rec_list_start(
+ merge_block, block, page_get_supremum_rec(page),
+ index, mtr);
+
+ if (UNIV_UNLIKELY(!orig_pred)) {
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(space, zip_size, page, mtr);
+
+ btr_node_ptr_delete(index, block, mtr);
+ lock_update_merge_left(merge_block, orig_pred, block);
+ } else {
+ rec_t* orig_succ;
+#ifdef UNIV_BTR_DEBUG
+ byte fil_page_prev[4];
+#endif /* UNIV_BTR_DEBUG */
+
+ if (UNIV_LIKELY_NULL(merge_page_zip)) {
+ /* The function page_zip_compress(), which will be
+ invoked by page_copy_rec_list_end() below,
+ requires that FIL_PAGE_PREV be FIL_NULL.
+ Clear the field, but prepare to restore it. */
+#ifdef UNIV_BTR_DEBUG
+ memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+#endif /* UNIV_BTR_DEBUG */
+#if FIL_NULL != 0xffffffff
+# error "FIL_NULL != 0xffffffff"
+#endif
+ memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
+ }
+
+ orig_succ = page_copy_rec_list_end(merge_block, block,
+ page_get_infimum_rec(page),
+ cursor->index, mtr);
+
+ if (UNIV_UNLIKELY(!orig_succ)) {
+ ut_a(merge_page_zip);
+#ifdef UNIV_BTR_DEBUG
+ /* FIL_PAGE_PREV was restored from merge_page_zip. */
+ ut_a(!memcmp(fil_page_prev,
+ merge_page + FIL_PAGE_PREV, 4));
+#endif /* UNIV_BTR_DEBUG */
+ goto err_exit;
+ }
+
+ btr_search_drop_page_hash_index(block);
+
+#ifdef UNIV_BTR_DEBUG
+ if (UNIV_LIKELY_NULL(merge_page_zip)) {
+ /* Restore FIL_PAGE_PREV in order to avoid an assertion
+ failure in btr_level_list_remove(), which will set
+ the field again to FIL_NULL. Even though this makes
+ merge_page and merge_page_zip inconsistent for a
+ split second, it is harmless, because the pages
+ are X-latched. */
+ memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(space, zip_size, page, mtr);
+
+ /* Replace the address of the old child node (= page) with the
+ address of the merge page to the right */
+
+ btr_node_ptr_set_child_page_no(
+ btr_cur_get_rec(&father_cursor),
+ btr_cur_get_page_zip(&father_cursor),
+ offsets, right_page_no, mtr);
+ btr_node_ptr_delete(index, merge_block, mtr);
+
+ lock_update_merge_right(merge_block, orig_succ, block);
+ }
+
+ mem_heap_free(heap);
+
+ if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. This has to be done in a
+ separate mini-transaction that is committed before the
+ main mini-transaction. We cannot update the insert
+ buffer bitmap in this mini-transaction, because
+ btr_compress() can be invoked recursively without
+ committing the mini-transaction in between. Since
+ insert buffer bitmap pages have a lower rank than
+ B-tree pages, we must not access other pages in the
+ same mini-transaction after accessing an insert buffer
+ bitmap page. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (zip_size) {
+ /* Because the free bits may be incremented
+ and we cannot update the insert buffer bitmap
+ in the same mini-transaction, the only safe
+ thing we can do here is the pessimistic
+ approach: reset the free bits. */
+ ibuf_reset_free_bits(merge_block);
+ } else {
+ /* On uncompressed pages, the free bits will
+ never increase here. Thus, it is safe to
+ write the bits accurately in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(merge_block,
+ UNIV_PAGE_SIZE,
+ ULINT_UNDEFINED);
+ }
+ }
+
+ ut_ad(page_validate(merge_page, index));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+ return(TRUE);
+}
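+
+/* Decision logic of btr_compress(), summarized as pseudo-code (illustrative
+only; error handling and compressed-page details omitted):
+
+	if (neither brother exists)
+		btr_lift_page_up();  return(TRUE);
+	choose the left brother if it exists, otherwise the right brother;
+	if (the records of this page do not fit on the chosen brother,
+	    even after btr_page_reorganize())
+		return(FALSE);	(nothing is changed)
+	move the records to the chosen brother, fix up the node pointers
+	on the father level, remove this page from the level list,
+	free the page, return(TRUE);
+*/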
+
+/*************************************************************//**
+Discards a page that is the only page on its level. This will empty
+the whole B-tree, leaving just an empty root page. This function
+should never be reached, because btr_compress(), which is invoked in
+delete operations, calls btr_lift_page_up() to flatten the B-tree. */
+static
+void
+btr_discard_only_page_on_level(
+/*===========================*/
+ dict_index_t* index, /*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only one on its level */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_level = 0;
+ trx_id_t max_trx_id;
+
+ /* Save the PAGE_MAX_TRX_ID from the leaf page. */
+ max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
+
+ while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
+ btr_cur_t cursor;
+ buf_block_t* father;
+ const page_t* page = buf_block_get_frame(block);
+
+ ut_a(page_get_n_recs(page) == 1);
+ ut_a(page_level == btr_page_get_level(page, mtr));
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+ ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ btr_search_drop_page_hash_index(block);
+
+ btr_page_get_father(index, block, mtr, &cursor);
+ father = btr_cur_get_block(&cursor);
+
+ lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ block = father;
+ page_level++;
+ }
+
+ /* block is the root page, which must be empty, except
+ for the node pointer to the (now discarded) block(s). */
+
+#ifdef UNIV_BTR_DEBUG
+ if (!dict_index_is_ibuf(index)) {
+ const page_t* root = buf_block_get_frame(block);
+ const ulint space = dict_index_get_space(index);
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + root, space));
+ ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + root, space));
+ }
+#endif /* UNIV_BTR_DEBUG */
+
+ btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
+
+ if (!dict_index_is_clust(index)) {
+ /* We play it safe and reset the free bits for the root */
+ ibuf_reset_free_bits(block);
+
+ if (page_is_leaf(buf_block_get_frame(block))) {
+ ut_a(!ut_dulint_is_zero(max_trx_id));
+ page_set_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ max_trx_id, mtr);
+ }
+ }
+}
+
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ ulint space;
+ ulint zip_size;
+ ulint left_page_no;
+ ulint right_page_no;
+ buf_block_t* merge_block;
+ page_t* merge_page;
+ buf_block_t* block;
+ page_t* page;
+ rec_t* node_ptr;
+
+ block = btr_cur_get_block(cursor);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ /* Decide the page which will inherit the locks */
+
+ left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
+ right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
+
+ if (left_page_no != FIL_NULL) {
+ merge_block = btr_block_get(space, zip_size, left_page_no,
+ RW_X_LATCH, mtr);
+ merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(merge_page, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+ } else if (right_page_no != FIL_NULL) {
+ merge_block = btr_block_get(space, zip_size, right_page_no,
+ RW_X_LATCH, mtr);
+ merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_prev(merge_page, mtr)
+ == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+ } else {
+ btr_discard_only_page_on_level(index, block, mtr);
+
+ return;
+ }
+
+ page = buf_block_get_frame(block);
+ ut_a(page_is_comp(merge_page) == page_is_comp(page));
+ btr_search_drop_page_hash_index(block);
+
+ if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
+
+ /* We have to mark the leftmost node pointer on the right
+ side page as the predefined minimum record */
+ node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
+
+ ut_ad(page_rec_is_user_rec(node_ptr));
+
+ /* This will make page_zip_validate() fail on merge_page
+ until btr_level_list_remove() completes. This is harmless,
+ because everything will take place within a single
+ mini-transaction and because writing to the redo log
+ is an atomic operation (performed by mtr_commit()). */
+ btr_set_min_rec_mark(node_ptr, mtr);
+ }
+
+ btr_node_ptr_delete(index, block, mtr);
+
+ /* Remove the page from the level list */
+ btr_level_list_remove(space, zip_size, page, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* merge_page_zip
+ = buf_block_get_page_zip(merge_block);
+ ut_a(!merge_page_zip
+ || page_zip_validate(merge_page_zip, merge_page));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (left_page_no != FIL_NULL) {
+ lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
+ block);
+ } else {
+ lock_update_discard(merge_block,
+ lock_get_min_heap_no(merge_block),
+ block);
+ }
+
+ /* Free the file page */
+ btr_page_free(index, block, mtr);
+
+ ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+}
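+
+/* Note on the lock inheritance above, descriptive only: when the discarded
+page has a left neighbour, its record locks are inherited by the supremum
+record of that neighbour (PAGE_HEAP_NO_SUPREMUM); otherwise they are
+inherited by the record with the smallest heap number on the right
+neighbour. */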
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index) /*!< in: index tree */
+{
+ page_t* root;
+ fseg_header_t* seg;
+ mtr_t mtr;
+
+ if (dict_index_is_ibuf(index)) {
+ fputs("Sorry, cannot print info of an ibuf tree:"
+ " use ibuf functions\n", stderr);
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ root = btr_root_get(index, &mtr);
+
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+
+ fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
+
+ if (!(index->type & DICT_UNIVERSAL)) {
+
+ seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+ fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
+ fseg_print(seg, &mtr);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/************************************************************//**
+Prints recursively index tree pages. */
+static
+void
+btr_print_recursive(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ ulint width, /*!< in: print this many entries from start
+ and end */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const page_t* page = buf_block_get_frame(block);
+ page_cur_t cursor;
+ ulint n_recs;
+ ulint i = 0;
+ mtr_t mtr2;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
+ (ulong) btr_page_get_level(page, mtr),
+ (ulong) buf_block_get_page_no(block));
+
+ page_print(block, index, width, width);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cursor);
+ page_cur_move_to_next(&cursor);
+
+ while (!page_cur_is_after_last(&cursor)) {
+
+ if (page_is_leaf(page)) {
+
+ /* If this is the leaf level, do nothing */
+
+ } else if ((i <= width) || (i >= n_recs - width)) {
+
+ const rec_t* node_ptr;
+
+ mtr_start(&mtr2);
+
+ node_ptr = page_cur_get_rec(&cursor);
+
+ *offsets = rec_get_offsets(node_ptr, index, *offsets,
+ ULINT_UNDEFINED, heap);
+ btr_print_recursive(index,
+ btr_node_ptr_get_child(node_ptr,
+ index,
+ *offsets,
+ &mtr2),
+ width, heap, offsets, &mtr2);
+ mtr_commit(&mtr2);
+ }
+
+ page_cur_move_to_next(&cursor);
+ i++;
+ }
+}
+
+/**************************************************************//**
+Prints directories and other info of all nodes in the tree. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width) /*!< in: print this many entries from start
+ and end */
+{
+ mtr_t mtr;
+ buf_block_t* root;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ fputs("--------------------------\n"
+ "INDEX TREE PRINT\n", stderr);
+
+ mtr_start(&mtr);
+
+ root = btr_root_block_get(index, &mtr);
+
+ btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ mtr_commit(&mtr);
+
+ btr_validate_index(index, NULL);
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ ulint* offsets;
+ btr_cur_t cursor;
+ page_t* page = buf_block_get_frame(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ if (dict_index_get_page(index) == buf_block_get_page_no(block)) {
+
+ return(TRUE);
+ }
+
+ heap = mem_heap_create(256);
+ offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
+ &cursor);
+
+ if (page_is_leaf(page)) {
+
+ goto func_exit;
+ }
+
+ tuple = dict_index_build_node_ptr(
+ index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
+ btr_page_get_level(page, mtr));
+
+ ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
+func_exit:
+ mem_heap_free(heap);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+Display identification information for a record. */
+static
+void
+btr_index_rec_validate_report(
+/*==========================*/
+ const page_t* page, /*!< in: index page */
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index) /*!< in: index */
+{
+ fputs("InnoDB: Record in ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fprintf(stderr, ", page %lu, at offset %lu\n",
+ page_get_page_no(page), (ulint) page_offset(rec));
+}
+
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error) /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+{
+ ulint len;
+ ulint n;
+ ulint i;
+ const page_t* page;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page = page_align(rec);
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ /* The insert buffer index tree can contain records from any
+ other index: we cannot check the number of fields or
+ their length */
+
+ return(TRUE);
+ }
+
+ if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
+ != dict_table_is_comp(index->table))) {
+ btr_index_rec_validate_report(page, rec, index);
+ fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
+ (ulong) !!page_is_comp(page),
+ (ulong) dict_table_is_comp(index->table));
+
+ return(FALSE);
+ }
+
+ n = dict_index_get_n_fields(index);
+
+ if (!page_is_comp(page)
+ && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
+ btr_index_rec_validate_report(page, rec, index);
+ fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
+ (ulong) rec_get_n_fields_old(rec), (ulong) n);
+
+ if (dump_on_error) {
+ buf_page_print(page, 0);
+
+ fputs("InnoDB: corrupt record ", stderr);
+ rec_print_old(stderr, rec);
+ putc('\n', stderr);
+ }
+ return(FALSE);
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < n; i++) {
+ ulint fixed_size = dict_col_get_fixed_size(
+ dict_index_get_nth_col(index, i), page_is_comp(page));
+
+ rec_get_nth_field_offs(offsets, i, &len);
+
+		/* Note that if fixed_size != 0, it equals the
+		length of a fixed-size column in the clustered index.
+		A prefix index of the column is of a fixed, but
+		different, length. When fixed_size == 0, prefix_len
+		is the maximum length of the prefix index column. */
+
+ if ((dict_index_get_nth_field(index, i)->prefix_len == 0
+ && len != UNIV_SQL_NULL && fixed_size
+ && len != fixed_size)
+ || (dict_index_get_nth_field(index, i)->prefix_len > 0
+ && len != UNIV_SQL_NULL
+ && len
+ > dict_index_get_nth_field(index, i)->prefix_len)) {
+
+ btr_index_rec_validate_report(page, rec, index);
+ fprintf(stderr,
+ "InnoDB: field %lu len is %lu,"
+ " should be %lu\n",
+ (ulong) i, (ulong) len, (ulong) fixed_size);
+
+ if (dump_on_error) {
+ buf_page_print(page, 0);
+
+ fputs("InnoDB: corrupt record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ }
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(FALSE);
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/************************************************************//**
+Checks the size and number of fields in records based on the definition of
+the index.
+@return TRUE if ok */
+static
+ibool
+btr_index_page_validate(
+/*====================*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index) /*!< in: index */
+{
+ page_cur_t cur;
+ ibool ret = TRUE;
+
+ page_cur_set_before_first(block, &cur);
+ page_cur_move_to_next(&cur);
+
+ for (;;) {
+ if (page_cur_is_after_last(&cur)) {
+
+ break;
+ }
+
+ if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
+
+ return(FALSE);
+ }
+
+ page_cur_move_to_next(&cur);
+ }
+
+ return(ret);
+}
+
+/************************************************************//**
+Report an error on one page of an index tree. */
+static
+void
+btr_validate_report1(
+/*=================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block) /*!< in: index page */
+{
+ fprintf(stderr, "InnoDB: Error in page %lu of ",
+ buf_block_get_page_no(block));
+ dict_index_name_print(stderr, NULL, index);
+ if (level) {
+ fprintf(stderr, ", index tree level %lu", level);
+ }
+ putc('\n', stderr);
+}
+
+/************************************************************//**
+Report an error on two pages of an index tree. */
+static
+void
+btr_validate_report2(
+/*=================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: B-tree level */
+ const buf_block_t* block1, /*!< in: first index page */
+ const buf_block_t* block2) /*!< in: second index page */
+{
+ fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
+ buf_block_get_page_no(block1),
+ buf_block_get_page_no(block2));
+ dict_index_name_print(stderr, NULL, index);
+ if (level) {
+ fprintf(stderr, ", index tree level %lu", level);
+ }
+ putc('\n', stderr);
+}
+
+/************************************************************//**
+Validates index tree level.
+@return TRUE if ok */
+static
+ibool
+btr_validate_level(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ trx_t* trx, /*!< in: transaction or NULL */
+ ulint level) /*!< in: level number */
+{
+ ulint space;
+ ulint zip_size;
+ buf_block_t* block;
+ page_t* page;
+ buf_block_t* right_block = 0; /* remove warning */
+ page_t* right_page = 0; /* remove warning */
+ page_t* father_page;
+ btr_cur_t node_cur;
+ btr_cur_t right_node_cur;
+ rec_t* rec;
+ ulint right_page_no;
+ ulint left_page_no;
+ page_cur_t cursor;
+ dtuple_t* node_ptr_tuple;
+ ibool ret = TRUE;
+ mtr_t mtr;
+ mem_heap_t* heap = mem_heap_create(256);
+ ulint* offsets = NULL;
+ ulint* offsets2= NULL;
+#ifdef UNIV_ZIP_DEBUG
+ page_zip_des_t* page_zip;
+#endif /* UNIV_ZIP_DEBUG */
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ block = btr_root_block_get(index, &mtr);
+ page = buf_block_get_frame(block);
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+
+ while (level != btr_page_get_level(page, &mtr)) {
+ const rec_t* node_ptr;
+
+ ut_a(space == buf_block_get_space(block));
+ ut_a(space == page_get_space_id(page));
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(!page_is_leaf(page));
+
+ page_cur_set_before_first(block, &cursor);
+ page_cur_move_to_next(&cursor);
+
+ node_ptr = page_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(node_ptr, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
+ page = buf_block_get_frame(block);
+ }
+
+ /* Now we are on the desired level. Loop through the pages on that
+ level. */
+loop:
+ if (trx_is_interrupted(trx)) {
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+ return(ret);
+ }
+ mem_heap_empty(heap);
+ offsets = offsets2 = NULL;
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+ page_zip = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* Check ordering etc. of records */
+
+ if (!page_validate(page, index)) {
+ btr_validate_report1(index, level, block);
+
+ ret = FALSE;
+ } else if (level == 0) {
+ /* We are on level 0. Check that the records have the right
+ number of fields, and field lengths are right. */
+
+ if (!btr_index_page_validate(block, index)) {
+
+ ret = FALSE;
+ }
+ }
+
+ ut_a(btr_page_get_level(page, &mtr) == level);
+
+ right_page_no = btr_page_get_next(page, &mtr);
+ left_page_no = btr_page_get_prev(page, &mtr);
+
+ ut_a(page_get_n_recs(page) > 0 || (level == 0
+ && page_get_page_no(page)
+ == dict_index_get_page(index)));
+
+ if (right_page_no != FIL_NULL) {
+ const rec_t* right_rec;
+ right_block = btr_block_get(space, zip_size, right_page_no,
+ RW_X_LATCH, &mtr);
+ right_page = buf_block_get_frame(right_block);
+ if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
+ != page_get_page_no(page))) {
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: broken FIL_PAGE_NEXT"
+ " or FIL_PAGE_PREV links\n", stderr);
+ buf_page_print(page, 0);
+ buf_page_print(right_page, 0);
+
+ ret = FALSE;
+ }
+
+ if (UNIV_UNLIKELY(page_is_comp(right_page)
+ != page_is_comp(page))) {
+ btr_validate_report2(index, level, block, right_block);
+ fputs("InnoDB: 'compact' flag mismatch\n", stderr);
+ buf_page_print(page, 0);
+ buf_page_print(right_page, 0);
+
+ ret = FALSE;
+
+ goto node_ptr_fails;
+ }
+
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ right_rec = page_rec_get_next(page_get_infimum_rec(
+ right_page));
+ offsets = rec_get_offsets(rec, index,
+ offsets, ULINT_UNDEFINED, &heap);
+ offsets2 = rec_get_offsets(right_rec, index,
+ offsets2, ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
+ offsets, offsets2,
+ index) >= 0)) {
+
+ btr_validate_report2(index, level, block, right_block);
+
+ fputs("InnoDB: records in wrong order"
+ " on adjacent pages\n", stderr);
+
+ buf_page_print(page, 0);
+ buf_page_print(right_page, 0);
+
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_prev(page_get_supremum_rec(page));
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+ fputs("InnoDB: record ", stderr);
+ rec = page_rec_get_next(
+ page_get_infimum_rec(right_page));
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+
+ ret = FALSE;
+ }
+ }
+
+ if (level > 0 && left_page_no == FIL_NULL) {
+ ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+ page_rec_get_next(page_get_infimum_rec(page)),
+ page_is_comp(page)));
+ }
+
+ if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
+
+ /* Check father node pointers */
+
+ rec_t* node_ptr;
+
+ offsets = btr_page_get_father_block(offsets, heap, index,
+ block, &mtr, &node_cur);
+ father_page = btr_cur_get_page(&node_cur);
+ node_ptr = btr_cur_get_rec(&node_cur);
+
+ btr_cur_position(
+ index, page_rec_get_prev(page_get_supremum_rec(page)),
+ block, &node_cur);
+ offsets = btr_page_get_father_node_ptr(offsets, heap,
+ &node_cur, &mtr);
+
+ if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
+ || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
+ offsets)
+ != buf_block_get_page_no(block))) {
+
+ btr_validate_report1(index, level, block);
+
+ fputs("InnoDB: node pointer to the page is wrong\n",
+ stderr);
+
+ buf_page_print(father_page, 0);
+ buf_page_print(page, 0);
+
+ fputs("InnoDB: node ptr ", stderr);
+ rec_print(stderr, node_ptr, index);
+
+ rec = btr_cur_get_rec(&node_cur);
+ fprintf(stderr, "\n"
+ "InnoDB: node ptr child page n:o %lu\n",
+ (ulong) btr_node_ptr_get_child_page_no(
+ rec, offsets));
+
+ fputs("InnoDB: record on page ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ ret = FALSE;
+
+ goto node_ptr_fails;
+ }
+
+ if (!page_is_leaf(page)) {
+ node_ptr_tuple = dict_index_build_node_ptr(
+ index,
+ page_rec_get_next(page_get_infimum_rec(page)),
+ 0, heap, btr_page_get_level(page, &mtr));
+
+ if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
+ offsets)) {
+ const rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ btr_validate_report1(index, level, block);
+
+ buf_page_print(father_page, 0);
+ buf_page_print(page, 0);
+
+ fputs("InnoDB: Error: node ptrs differ"
+ " on levels > 0\n"
+ "InnoDB: node ptr ", stderr);
+ rec_print_new(stderr, node_ptr, offsets);
+ fputs("InnoDB: first rec ", stderr);
+ rec_print(stderr, first_rec, index);
+ putc('\n', stderr);
+ ret = FALSE;
+
+ goto node_ptr_fails;
+ }
+ }
+
+ if (left_page_no == FIL_NULL) {
+ ut_a(node_ptr == page_rec_get_next(
+ page_get_infimum_rec(father_page)));
+ ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
+ }
+
+ if (right_page_no == FIL_NULL) {
+ ut_a(node_ptr == page_rec_get_prev(
+ page_get_supremum_rec(father_page)));
+ ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
+ } else {
+ const rec_t* right_node_ptr
+ = page_rec_get_next(node_ptr);
+
+ offsets = btr_page_get_father_block(
+ offsets, heap, index, right_block,
+ &mtr, &right_node_cur);
+ if (right_node_ptr
+ != page_get_supremum_rec(father_page)) {
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != right_node_ptr) {
+ ret = FALSE;
+ fputs("InnoDB: node pointer to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+
+ buf_page_print(father_page, 0);
+ buf_page_print(page, 0);
+ buf_page_print(right_page, 0);
+ }
+ } else {
+ page_t* right_father_page
+ = btr_cur_get_page(&right_node_cur);
+
+ if (btr_cur_get_rec(&right_node_cur)
+ != page_rec_get_next(
+ page_get_infimum_rec(
+ right_father_page))) {
+ ret = FALSE;
+ fputs("InnoDB: node pointer 2 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+
+ buf_page_print(father_page, 0);
+ buf_page_print(right_father_page, 0);
+ buf_page_print(page, 0);
+ buf_page_print(right_page, 0);
+ }
+
+ if (page_get_page_no(right_father_page)
+ != btr_page_get_next(father_page, &mtr)) {
+
+ ret = FALSE;
+ fputs("InnoDB: node pointer 3 to"
+ " the right page is wrong\n",
+ stderr);
+
+ btr_validate_report1(index, level,
+ block);
+
+ buf_page_print(father_page, 0);
+ buf_page_print(right_father_page, 0);
+ buf_page_print(page, 0);
+ buf_page_print(right_page, 0);
+ }
+ }
+ }
+ }
+
+node_ptr_fails:
+ /* Commit the mini-transaction to release the latch on 'page'.
+ Re-acquire the latch on right_page, which will become 'page'
+ on the next loop. The page has already been checked. */
+ mtr_commit(&mtr);
+
+ if (right_page_no != FIL_NULL) {
+ mtr_start(&mtr);
+
+ block = btr_block_get(space, zip_size, right_page_no,
+ RW_X_LATCH, &mtr);
+ page = buf_block_get_frame(block);
+
+ goto loop;
+ }
+
+ mem_heap_free(heap);
+ return(ret);
+}
+
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction or NULL */
+{
+ mtr_t mtr;
+ page_t* root;
+ ulint i;
+ ulint n;
+
+ mtr_start(&mtr);
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ root = btr_root_get(index, &mtr);
+ n = btr_page_get_level(root, &mtr);
+
+ for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
+ if (!btr_validate_level(index, trx, n - i)) {
+
+ mtr_commit(&mtr);
+
+ return(FALSE);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+}
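+
+/* Illustrative use only, not taken from the original sources: a consistency
+check of a whole index, e.g. from a CHECK TABLE style caller, reduces to
+
+	if (!btr_validate_index(index, trx)) {
+		... report corruption; details were printed to stderr ...
+	}
+
+Each level, from the root down to the leaves, is validated in turn by
+btr_validate_level(). */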
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c
new file mode 100644
index 00000000000..3fc2b48162a
--- /dev/null
+++ b/storage/xtradb/btr/btr0cur.c
@@ -0,0 +1,5256 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0cur.c
+The index tree cursor
+
+All changes that row operations make to a B-tree or the records
+there must go through this module! Undo log records are written here
+for every modify or insert of a clustered index record.
+
+ NOTE!!!
+To make sure we do not run out of disk space during a pessimistic
+insert or update, we have to reserve 2 x (the height of the index tree)
+pages in the tablespace before we start the operation, because
+if leaf splitting has been started, it is difficult to undo, except
+by crashing the database and doing a roll-forward.
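+For example, if the height of the index tree is 3, up to 2 x 3 = 6 free
+pages must be available before the operation is started.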
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0cur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#include "row0upd.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "buf0lru.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "trx0rec.h"
+#include "trx0roll.h" /* trx_is_recv() */
+#include "que0que.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "zlib.h"
+
+#ifdef UNIV_DEBUG
+/** If the following is set to TRUE, this module prints a lot of
+trace information of individual record operations */
+UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
+#endif /* UNIV_DEBUG */
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+UNIV_INTERN ulint btr_cur_n_non_sea = 0;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+UNIV_INTERN ulint btr_cur_n_sea = 0;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+UNIV_INTERN ulint btr_cur_n_sea_old = 0;
+
+/** In the optimistic insert, if the insert does not fit, but this much space
+can be released by page reorganization, then the page is reorganized */
+#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
+
+/** The structure of a BLOB part header */
+/* @{ */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN 0 /*!< BLOB part len on this
+ page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /*!< next BLOB part page no,
+ FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
+ part header, in bytes */
+/* @} */
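+
+/* Read together, these constants describe the layout of one BLOB part
+header (informal sketch):
+
+	bytes 0..3  BTR_BLOB_HDR_PART_LEN:	BLOB part length on this page
+	bytes 4..7  BTR_BLOB_HDR_NEXT_PAGE_NO:	next BLOB part page number,
+						or FIL_NULL for the last part
+	bytes 8..   the BLOB data itself, starting at BTR_BLOB_HDR_SIZE
+*/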
+#endif /* !UNIV_HOTBACKUP */
+
+/** A BLOB field reference full of zero bytes, for use in assertions and tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a record that is
+not delete-marked always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor positioned on a page */
+ ulint height, /*!< in: height of the page in tree;
+ 0 means leaf node */
+ ulint root_height); /*!< in: root node height in tree */
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in: record */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr); /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+/***********************************************************//**
+Gets the externally stored size of a record, in units of a database page.
+@return externally stored part, in units of a database page */
+static
+ulint
+btr_rec_get_externally_stored_len(
+/*==============================*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+The following function is used to set the deleted bit of a record. */
+UNIV_INLINE
+void
+btr_rec_set_deleted_flag(
+/*=====================*/
+ rec_t* rec, /*!< in/out: physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page (or NULL) */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ if (page_rec_is_comp(rec)) {
+ rec_set_deleted_flag_new(rec, page_zip, flag);
+ } else {
+ ut_ad(!page_zip);
+ rec_set_deleted_flag_old(rec, flag);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*==================== B-TREE SEARCH =========================*/
+
+/********************************************************************//**
+Latches the leaf page or pages requested. */
+static
+void
+btr_cur_latch_leaves(
+/*=================*/
+ page_t* page, /*!< in: leaf page where the search
+ converged */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the leaf */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in: cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint mode;
+ ulint left_page_no;
+ ulint right_page_no;
+ buf_block_t* get_block;
+
+ ut_ad(page && mtr);
+
+ switch (latch_mode) {
+ case BTR_SEARCH_LEAF:
+ case BTR_MODIFY_LEAF:
+ mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
+ get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
+
+ if (srv_pass_corrupt_table && !get_block) {
+ return;
+ }
+ ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ return;
+ case BTR_MODIFY_TREE:
+ /* x-latch also brothers from left to right */
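+		/* A split or merge of this page may have to update the
+		prev/next pointers of its siblings, so both are x-latched
+		as well; taking the latches in left-to-right order avoids
+		deadlocks with other threads doing the same. */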
+ left_page_no = btr_page_get_prev(page, mtr);
+
+ if (left_page_no != FIL_NULL) {
+ get_block = btr_block_get(space, zip_size,
+ left_page_no,
+ RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !get_block) {
+ return;
+ }
+ ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(page));
+ ut_a(btr_page_get_next(get_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ get_block = btr_block_get(space, zip_size, page_no,
+ RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !get_block) {
+ return;
+ }
+ ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+
+ right_page_no = btr_page_get_next(page, mtr);
+
+ if (right_page_no != FIL_NULL) {
+ get_block = btr_block_get(space, zip_size,
+ right_page_no,
+ RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !get_block) {
+ return;
+ }
+ ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(page));
+ ut_a(btr_page_get_prev(get_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ return;
+
+ case BTR_SEARCH_PREV:
+ case BTR_MODIFY_PREV:
+ mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
+ /* latch also left brother */
+ left_page_no = btr_page_get_prev(page, mtr);
+
+ if (left_page_no != FIL_NULL) {
+ get_block = btr_block_get(space, zip_size,
+ left_page_no, mode, mtr);
+ cursor->left_block = get_block;
+
+ if (srv_pass_corrupt_table && !get_block) {
+ return;
+ }
+ ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame)
+ == page_is_comp(page));
+ ut_a(btr_page_get_next(get_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ }
+
+ get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
+
+ if (srv_pass_corrupt_table && !get_block) {
+ return;
+ }
+ ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+ get_block->check_index_page_at_flush = TRUE;
+ return;
+ }
+
+ ut_error;
+}
+
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value.
+
+If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ Inserts should always be made using
+ PAGE_CUR_LE to search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ BTR_INSERT and BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+				NOTE that if has_search_latch
+				is != 0, we may not have a latch set
+				on the cursor page; we assume
+				the caller uses his search latch
+				to protect the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint has_search_latch,/*!< in: info on the latch mode the
+ caller currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ page_t* page;
+ buf_block_t* guess;
+ rec_t* node_ptr;
+ ulint page_no;
+ ulint space;
+ ulint up_match;
+ ulint up_bytes;
+ ulint low_match;
+ ulint low_bytes;
+ ulint height;
+ ulint savepoint;
+ ulint page_mode;
+ ulint insert_planned;
+ ulint estimate;
+ ulint ignore_sec_unique;
+ ulint root_height = 0; /* remove warning */
+#ifdef BTR_CUR_ADAPT
+ btr_search_t* info;
+#endif
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+ /* Currently, PAGE_CUR_LE is the only search mode used for searches
+	ending at upper levels */
+
+ ut_ad(level == 0 || mode == PAGE_CUR_LE);
+ ut_ad(dict_index_check_search_tuple(index, tuple));
+ ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
+ ut_ad(dtuple_check_typed(tuple));
+
+#ifdef UNIV_DEBUG
+ cursor->up_match = ULINT_UNDEFINED;
+ cursor->low_match = ULINT_UNDEFINED;
+#endif
+ insert_planned = latch_mode & BTR_INSERT;
+ estimate = latch_mode & BTR_ESTIMATE;
+ ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
+ latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
+ | BTR_IGNORE_SEC_UNIQUE);
+
+ ut_ad(!insert_planned || (mode == PAGE_CUR_LE));
+
+ cursor->flag = BTR_CUR_BINARY;
+ cursor->index = index;
+
+#ifndef BTR_CUR_ADAPT
+ guess = NULL;
+#else
+ info = btr_search_get_info(index);
+
+ guess = info->root_guess;
+
+#ifdef BTR_CUR_HASH_ADAPT
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_searches++;
+#endif
+ if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
+ && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
+ && !estimate
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ && mode != PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ /* If !has_search_latch, we do a dirty read of
+ btr_search_enabled below, and btr_search_guess_on_hash()
+ will have to check it again. */
+ && UNIV_LIKELY(btr_search_enabled)
+ && btr_search_guess_on_hash(index, info, tuple, mode,
+ latch_mode, cursor,
+ has_search_latch, mtr)) {
+
+ /* Search using the hash index succeeded */
+
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_GE);
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ ut_ad(cursor->low_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ btr_cur_n_sea++;
+
+ return;
+ }
+#endif /* BTR_CUR_HASH_ADAPT */
+#endif /* BTR_CUR_ADAPT */
+ btr_cur_n_non_sea++;
+
+ /* If the hash search did not succeed, do binary search down the
+ tree */
+
+ if (has_search_latch) {
+ /* Release possible search latch to obey latching order */
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched leaf node(s) */
+
+ savepoint = mtr_set_savepoint(mtr);
+
+ if (latch_mode == BTR_MODIFY_TREE) {
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+
+ } else if (latch_mode == BTR_CONT_MODIFY_TREE) {
+ /* Do nothing */
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ } else {
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ space = dict_index_get_space(index);
+ page_no = dict_index_get_page(index);
+
+ up_match = 0;
+ up_bytes = 0;
+ low_match = 0;
+ low_bytes = 0;
+
+ height = ULINT_UNDEFINED;
+
+ /* We use these modified search modes on non-leaf levels of the
+ B-tree. These let us end up in the right B-tree leaf. In that leaf
+ we use the original search mode. */
+
+ switch (mode) {
+ case PAGE_CUR_GE:
+ page_mode = PAGE_CUR_L;
+ break;
+ case PAGE_CUR_G:
+ page_mode = PAGE_CUR_LE;
+ break;
+ default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ page_mode = mode;
+ break;
+ }
+
+ /* Loop and search until we arrive at the desired level */
+
+ for (;;) {
+ ulint zip_size;
+ buf_block_t* block;
+ ulint rw_latch;
+ ulint buf_mode;
+
+ zip_size = dict_table_zip_size(index->table);
+ rw_latch = RW_NO_LATCH;
+ buf_mode = BUF_GET;
+
+ if (height == 0 && latch_mode <= BTR_MODIFY_LEAF) {
+
+ rw_latch = latch_mode;
+
+ if (insert_planned
+ && ibuf_should_try(index, ignore_sec_unique)) {
+
+				/* Try to insert into the insert buffer
+				if the page is not in the buffer pool */
+
+ buf_mode = BUF_GET_IF_IN_POOL;
+ }
+ }
+
+retry_page_get:
+ block = buf_page_get_gen(space, zip_size, page_no,
+ rw_latch, guess, buf_mode,
+ file, line, mtr);
+ if (block == NULL) {
+ if (srv_pass_corrupt_table && buf_mode != BUF_GET_IF_IN_POOL) {
+ page_cursor->block = 0;
+ page_cursor->rec = 0;
+ if (estimate) {
+ cursor->path_arr->nth_rec = ULINT_UNDEFINED;
+ }
+ break;
+ }
+ ut_a(buf_mode == BUF_GET_IF_IN_POOL);
+
+			/* This must be a search to perform an insert;
+			try to insert into the insert buffer */
+
+ ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+ ut_ad(insert_planned);
+ ut_ad(cursor->thr);
+
+ if (ibuf_insert(tuple, index, space, zip_size,
+ page_no, cursor->thr)) {
+ /* Insertion to the insert buffer succeeded */
+ cursor->flag = BTR_CUR_INSERT_TO_IBUF;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ goto func_exit;
+ }
+
+ /* Insert to the insert buffer did not succeed:
+ retry page get */
+
+ buf_mode = BUF_GET;
+
+ goto retry_page_get;
+ }
+
+ page = buf_block_get_frame(block);
+
+ if (srv_pass_corrupt_table && !page) {
+ page_cursor->block = 0;
+ page_cursor->rec = 0;
+ if (estimate) {
+ cursor->path_arr->nth_rec = ULINT_UNDEFINED;
+ }
+ break;
+ }
+ ut_a(page);
+
+ block->check_index_page_at_flush = TRUE;
+
+ if (rw_latch != RW_NO_LATCH) {
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ }
+
+ ut_ad(0 == ut_dulint_cmp(index->id,
+ btr_page_get_index_id(page)));
+
+ if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ root_height = height;
+ cursor->tree_height = root_height + 1;
+#ifdef BTR_CUR_ADAPT
+ if (block != guess) {
+ info->root_guess = block;
+ }
+#endif
+ }
+
+ if (height == 0) {
+ if (rw_latch == RW_NO_LATCH) {
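+				/* The page was fetched above with
+				RW_NO_LATCH; take the leaf latches
+				required by latch_mode now, including
+				any sibling latches (see
+				btr_cur_latch_leaves()). */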
+
+ btr_cur_latch_leaves(page, space, zip_size,
+ page_no, latch_mode,
+ cursor, mtr);
+ }
+
+ if ((latch_mode != BTR_MODIFY_TREE)
+ && (latch_mode != BTR_CONT_MODIFY_TREE)) {
+
+ /* Release the tree s-latch */
+
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint,
+ dict_index_get_lock(index));
+ }
+
+ page_mode = mode;
+ }
+
+ page_cur_search_with_match(block, index, tuple, page_mode,
+ &up_match, &up_bytes,
+ &low_match, &low_bytes,
+ page_cursor);
+
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
+
+ /* If this is the desired level, leave the loop */
+
+ ut_ad(height == btr_page_get_level(
+ page_cur_get_page(page_cursor), mtr));
+
+ if (level == height) {
+
+ if (level > 0) {
+ /* x-latch the page */
+ page = btr_page_get(space, zip_size,
+ page_no, RW_X_LATCH, mtr);
+ ut_a((ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ }
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ height--;
+
+ guess = NULL;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (level == 0) {
+ cursor->low_match = low_match;
+ cursor->low_bytes = low_bytes;
+ cursor->up_match = up_match;
+ cursor->up_bytes = up_bytes;
+
+#ifdef BTR_CUR_ADAPT
+ /* We do a dirty read of btr_search_enabled here. We
+ will properly check btr_search_enabled again in
+ btr_search_build_page_hash_index() before building a
+ page hash index, while holding btr_search_latch. */
+ if (UNIV_LIKELY(btr_search_enabled)) {
+
+ btr_search_info_update(index, cursor);
+ }
+#endif
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_GE);
+ ut_ad(cursor->up_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ ut_ad(cursor->low_match != ULINT_UNDEFINED
+ || mode != PAGE_CUR_LE);
+ }
+
+func_exit:
+ if (has_search_latch) {
+
+ rw_lock_s_lock(&btr_search_latch);
+ }
+}
+
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in: cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint height;
+ ulint root_height = 0; /* remove warning */
+ rec_t* node_ptr;
+ ulint estimate;
+ ulint savepoint;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ estimate = latch_mode & BTR_ESTIMATE;
+ latch_mode = latch_mode & ~BTR_ESTIMATE;
+
+ /* Store the position of the tree latch we push to mtr so that we
+ know how to release it when we have latched the leaf node */
+
+ savepoint = mtr_set_savepoint(mtr);
+
+ if (latch_mode == BTR_MODIFY_TREE) {
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ } else {
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ page_no = dict_index_get_page(index);
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+ block = buf_page_get_gen(space, zip_size, page_no,
+ RW_NO_LATCH, NULL, BUF_GET,
+ file, line, mtr);
+ page = buf_block_get_frame(block);
+
+ if (srv_pass_corrupt_table && !page) {
+ page_cursor->block = 0;
+ page_cursor->rec = 0;
+ if (estimate) {
+ cursor->path_arr->nth_rec = ULINT_UNDEFINED;
+ }
+ break;
+ }
+ ut_a(page);
+
+ ut_ad(0 == ut_dulint_cmp(index->id,
+ btr_page_get_index_id(page)));
+
+ block->check_index_page_at_flush = TRUE;
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ root_height = height;
+ }
+
+ if (height == 0) {
+ btr_cur_latch_leaves(page, space, zip_size, page_no,
+ latch_mode, cursor, mtr);
+
+ /* In versions <= 3.23.52 we had forgotten to
+ release the tree latch here. If in an index scan
+ we had to scan far to find a record visible to the
+ current transaction, that could starve others
+ waiting for the tree latch. */
+
+ if ((latch_mode != BTR_MODIFY_TREE)
+ && (latch_mode != BTR_CONT_MODIFY_TREE)) {
+
+ /* Release the tree s-latch */
+
+ mtr_release_s_latch_at_savepoint(
+ mtr, savepoint,
+ dict_index_get_lock(index));
+ }
+ }
+
+ if (from_left) {
+ page_cur_set_before_first(block, page_cursor);
+ } else {
+ page_cur_set_after_last(block, page_cursor);
+ }
+
+ if (height == 0) {
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height,
+ root_height);
+ }
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ if (from_left) {
+ page_cur_move_to_next(page_cursor);
+ } else {
+ page_cur_move_to_prev(page_cursor);
+ }
+
+ if (estimate) {
+ btr_cur_add_path_info(cursor, height, root_height);
+ }
+
+ height--;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint height;
+ rec_t* node_ptr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ if (latch_mode == BTR_MODIFY_TREE) {
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ } else {
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ page_no = dict_index_get_page(index);
+
+ height = ULINT_UNDEFINED;
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+
+ block = buf_page_get_gen(space, zip_size, page_no,
+ RW_NO_LATCH, NULL, BUF_GET,
+ file, line, mtr);
+ page = buf_block_get_frame(block);
+
+ if (srv_pass_corrupt_table && !page) {
+ page_cursor->block = 0;
+ page_cursor->rec = 0;
+ break;
+ }
+ ut_a(page);
+
+ ut_ad(0 == ut_dulint_cmp(index->id,
+ btr_page_get_index_id(page)));
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ }
+
+ if (height == 0) {
+ btr_cur_latch_leaves(page, space, zip_size, page_no,
+ latch_mode, cursor, mtr);
+ }
+
+ page_cur_open_on_rnd_user_rec(block, page_cursor);
+
+ if (height == 0) {
+
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ height--;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree
+after the given path
+@return TRUE if the position is at the first page, and the cursor must
+point to the first record for use by the caller. */
+UNIV_INTERN
+ibool
+btr_cur_open_at_rnd_pos_after_path(
+/*===============================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+	btr_path_t*	first_rec_path,	/*!< in: path to the first record */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ btr_path_t* slot;
+ ibool is_first_rec = TRUE;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint height;
+ rec_t* node_ptr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ if (latch_mode == BTR_MODIFY_TREE) {
+ mtr_x_lock(dict_index_get_lock(index), mtr);
+ } else {
+ mtr_s_lock(dict_index_get_lock(index), mtr);
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+ cursor->index = index;
+
+ space = dict_index_get_space(index);
+ zip_size = dict_table_zip_size(index->table);
+ page_no = dict_index_get_page(index);
+
+ height = ULINT_UNDEFINED;
+ slot = first_rec_path;
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+
+ block = buf_page_get_gen(space, zip_size, page_no,
+ RW_NO_LATCH, NULL, BUF_GET,
+ __FILE__, __LINE__, mtr);
+ page = buf_block_get_frame(block);
+ ut_ad(0 == ut_dulint_cmp(index->id,
+ btr_page_get_index_id(page)));
+
+ if (height == ULINT_UNDEFINED) {
+ /* We are in the root node */
+
+ height = btr_page_get_level(page, mtr);
+ }
+
+ if (height == 0) {
+ btr_cur_latch_leaves(page, space, zip_size, page_no,
+ latch_mode, cursor, mtr);
+ }
+
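+		/* While the descent is still on the path of the given
+		first record, the random choice is apparently restricted
+		to records at or after slot->nth_rec, so that a position
+		before the first record is never returned; once the
+		descent leaves that path, any user record on the page
+		may be chosen. */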
+ if (is_first_rec && slot->nth_rec != ULINT_UNDEFINED) {
+ if (height == 0) {
+ /* must open the first rec */
+ page_cur_open_on_nth_user_rec(block, page_cursor, slot->nth_rec);
+ } else {
+ is_first_rec = page_cur_open_on_rnd_user_rec_after_nth(block,
+ page_cursor, slot->nth_rec);
+ }
+ } else {
+ is_first_rec = FALSE;
+ page_cur_open_on_rnd_user_rec(block, page_cursor);
+ }
+
+ if (height == 0) {
+ break;
+ }
+
+ ut_ad(height > 0);
+
+ height--;
+ slot++;
+
+ node_ptr = page_cur_get_rec(page_cursor);
+ offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Go to the child node */
+ page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return (is_first_rec);
+}
+
+/*==================== B-TREE INSERT =========================*/
+
+/*************************************************************//**
+Inserts a record if there is enough space, or if enough space can
+be freed by reorganizing. Differs from btr_cur_optimistic_insert because
+no heuristic is applied as to whether it pays to use CPU time for
+reorganizing the page or not.
+@return pointer to the inserted record if the insert succeeds, else NULL */
+static
+rec_t*
+btr_cur_insert_if_possible(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
+ have been stored to tuple */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ block = btr_cur_get_block(cursor);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Now, try the insert */
+ rec = page_cur_tuple_insert(page_cursor, tuple,
+ cursor->index, n_ext, mtr);
+
+ if (UNIV_UNLIKELY(!rec)) {
+ /* If record did not fit, reorganize */
+
+ if (btr_page_reorganize(block, cursor->index, mtr)) {
+
+ page_cur_search(block, cursor->index, tuple,
+ PAGE_CUR_LE, page_cursor);
+
+ rec = page_cur_tuple_insert(page_cursor, tuple,
+ cursor->index, n_ext, mtr);
+ }
+ }
+
+ return(rec);
+}
+
+/*************************************************************//**
+For an insert, checks the locks and does the undo logging if desired.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INLINE
+ulint
+btr_cur_ins_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if
+ not zero, the parameters index and thr
+ should be specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+	ibool*		inherit)/*!< out: TRUE if the inserted new record may
+				need to inherit LOCK_GAP type locks from the
+				successor record */
+{
+ dict_index_t* index;
+ ulint err;
+ rec_t* rec;
+ roll_ptr_t roll_ptr;
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ err = lock_rec_insert_check_and_lock(flags, rec,
+ btr_cur_get_block(cursor),
+ index, thr, mtr, inherit);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
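+	/* Undo logging is done only for clustered index records;
+	changes to secondary indexes are rolled back by applying the
+	undo log of the corresponding clustered index record. */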
+ if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
+
+ err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
+ thr, index, entry,
+ NULL, 0, NULL,
+ &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Now we can fill in the roll ptr field in entry */
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+
+ row_upd_index_entry_sys_field(entry, index,
+ DATA_ROLL_PTR, roll_ptr);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Report information about a transaction. */
+static
+void
+btr_cur_trx_report(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index, /*!< in: index */
+ const char* op) /*!< in: operation */
+{
+ fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
+ TRX_ID_PREP_PRINTF(trx->id));
+ fputs(op, stderr);
+ dict_index_name_print(stderr, trx, index);
+ putc('\n', stderr);
+}
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to the inserted record
+				if the insert succeeds */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ DB_SUCCESS on a leaf page of a secondary
+ index in a compressed tablespace, the
+ mtr must be committed before latching
+ any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ page_t* page;
+ ulint max_size;
+ rec_t* dummy_rec;
+ ibool leaf;
+ ibool reorg;
+ ibool inherit;
+ ulint zip_size;
+ ulint rec_size;
+ ulint err;
+
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(DB_CORRUPTION);
+ }
+ ut_a(block);
+
+ page = buf_block_get_frame(block);
+ index = cursor->index;
+ zip_size = buf_block_get_zip_size(block);
+#ifdef UNIV_DEBUG_VALGRIND
+ if (zip_size) {
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ if (!dtuple_check_typed_no_assert(entry)) {
+ fputs("InnoDB: Error in a tuple to insert into ", stderr);
+ dict_index_name_print(stderr, thr_get_trx(thr), index);
+ }
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
+ dtuple_print(stderr, entry);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ max_size = page_get_max_insert_size_after_reorganize(page, 1);
+ leaf = page_is_leaf(page);
+
+ /* Calculate the record size when entry is converted to a record */
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+
+ if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
+ dtuple_get_n_fields(entry), zip_size)) {
+
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+ big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
+
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ rec_size = rec_get_converted_size(index, entry, n_ext);
+ }
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ /* Estimate the free space of an empty compressed page.
+ Subtract one byte for the encoded heap_no in the
+ modification log. */
+ ulint free_space_zip = page_zip_empty_size(
+ cursor->index->n_fields, zip_size) - 1;
+ ulint n_uniq = dict_index_get_n_unique_in_tree(index);
+
+ ut_ad(dict_table_is_comp(index->table));
+
+ /* There should be enough room for two node pointer
+ records on an empty non-leaf page. This prevents
+ infinite page splits. */
+
+ if (UNIV_LIKELY(entry->n_fields >= n_uniq)
+ && UNIV_UNLIKELY(REC_NODE_PTR_SIZE
+ + rec_get_converted_size_comp_prefix(
+ index, entry->fields, n_uniq,
+ NULL)
+ /* On a compressed page, there is
+ a two-byte entry in the dense
+ page directory for every record.
+ But there is no record header. */
+ - (REC_N_NEW_EXTRA_BYTES - 2)
+ > free_space_zip / 2)) {
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(
+ index, entry, big_rec_vec);
+ }
+
+ return(DB_TOO_BIG_RECORD);
+ }
+ }
+
+ /* If there have been many consecutive inserts, and we are on the leaf
+ level, check if we have to split the page to reserve enough free space
+ for future updates of records. */
+
+ if (dict_index_is_clust(index)
+ && (page_get_n_recs(page) >= 2)
+ && UNIV_LIKELY(leaf)
+ && (dict_index_get_space_reserve() + rec_size > max_size)
+ && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
+ || btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
+fail:
+ err = DB_FAIL;
+fail_err:
+
+ if (big_rec_vec) {
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ return(err);
+ }
+
+ if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
+ || max_size < rec_size)
+ && UNIV_LIKELY(page_get_n_recs(page) > 1)
+ && page_get_max_insert_size(page, 1) < rec_size) {
+
+ goto fail;
+ }
+
+ /* Check locks and write to the undo log, if specified */
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &inherit);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+ goto fail_err;
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ /* Now, try the insert */
+
+ {
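+		/* On a compressed page, page_cur_tuple_insert() may
+		reorganize (recompress) the page as a side effect,
+		relocating records; a change of the page cursor record
+		pointer is used to detect this, and it can only happen
+		when zip_size != 0 (see the assertions below). */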
+ const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+ *rec = page_cur_tuple_insert(page_cursor, entry, index,
+ n_ext, mtr);
+ reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+
+ if (UNIV_UNLIKELY(reorg)) {
+ ut_a(zip_size);
+ ut_a(*rec);
+ }
+ }
+
+ if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
+ /* If the record did not fit, reorganize */
+ if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
+ ut_a(zip_size);
+
+ goto fail;
+ }
+
+ ut_ad(zip_size
+ || page_get_max_insert_size(page, 1) == max_size);
+
+ reorg = TRUE;
+
+ page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);
+
+ *rec = page_cur_tuple_insert(page_cursor, entry, index,
+ n_ext, mtr);
+
+ if (UNIV_UNLIKELY(!*rec)) {
+ if (UNIV_LIKELY(zip_size != 0)) {
+
+ goto fail;
+ }
+
+ fputs("InnoDB: Error: cannot insert tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs(" into ", stderr);
+ dict_index_name_print(stderr, thr_get_trx(thr), index);
+ fprintf(stderr, "\nInnoDB: max insert size %lu\n",
+ (ulong) max_size);
+ ut_error;
+ }
+ }
+
+#ifdef BTR_CUR_HASH_ADAPT
+ if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
+ btr_search_update_hash_node_on_insert(cursor);
+ } else {
+ btr_search_update_hash_on_insert(cursor);
+ }
+#endif
+
+ if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
+
+ lock_update_insert(block, *rec);
+ }
+
+#if 0
+ fprintf(stderr, "Insert into page %lu, max ins size %lu,"
+ " rec %lu ind type %lu\n",
+ buf_block_get_page_no(block), max_size,
+ rec_size + PAGE_DIR_SLOT_SIZE, index->type);
+#endif
+ if (leaf && !dict_index_is_clust(index)) {
+ /* Update the free bits of the B-tree page in the
+ insert buffer bitmap. */
+
+ /* The free bits in the insert buffer bitmap must
+ never exceed the free space on a page. It is safe to
+ decrement or reset the bits in the bitmap in a
+ mini-transaction that is committed before the
+ mini-transaction that affects the free space. */
+
+ /* It is unsafe to increment the bits in a separately
+ committed mini-transaction, because in crash recovery,
+ the free bits could momentarily be set too high. */
+
+ if (zip_size) {
+ /* Update the bits in the same mini-transaction. */
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ /* Decrement the bits in a separate
+ mini-transaction. */
+ ibuf_update_free_bits_if_full(
+ block, max_size,
+ rec_size + PAGE_DIR_SLOT_SIZE);
+ }
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to the inserted record
+				if the insert succeeds */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index = cursor->index;
+ ulint zip_size = dict_table_zip_size(index->table);
+ big_rec_t* big_rec_vec = NULL;
+ mem_heap_t* heap = NULL;
+ ulint err;
+ ibool dummy_inh;
+ ibool success;
+ ulint n_extents = 0;
+ ulint n_reserved;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ *big_rec = NULL;
+
+ ut_ad(mtr_memo_contains(mtr,
+ dict_index_get_lock(btr_cur_get_index(cursor)),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+	/* Try first an optimistic insert; reset the cursor flag: we do not
+	assume anything about how it was positioned */
+
+ cursor->flag = BTR_CUR_BINARY;
+
+ err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
+ big_rec, n_ext, thr, mtr);
+ if (err != DB_FAIL) {
+
+ return(err);
+ }
+
+ /* Retry with a pessimistic insert. Check locks and write to undo log,
+ if specified */
+
+ err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+ thr, mtr, &dummy_inh);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the insert will not fail because
+ of lack of space */
+
+ n_extents = cursor->tree_height / 16 + 3;
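+		/* Heuristic: roughly one extent per 16 levels of the
+		tree, plus a margin of 3 extents, so that a page split
+		on every level of the descent cannot run out of space. */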
+
+ success = fsp_reserve_free_extents(&n_reserved, index->space,
+ n_extents, FSP_NORMAL, mtr);
+ if (!success) {
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
+ dict_table_is_comp(index->table),
+ dict_index_get_n_fields(index),
+ zip_size)) {
+ /* The record is so big that we have to store some fields
+ externally on separate database pages */
+
+ if (UNIV_LIKELY_NULL(big_rec_vec)) {
+ /* This should never happen, but we handle
+ the situation in a robust manner. */
+ ut_ad(0);
+ dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+ }
+
+ big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
+
+ if (big_rec_vec == NULL) {
+
+ if (n_extents > 0) {
+ fil_space_release_free_extents(index->space,
+ n_reserved);
+ }
+ return(DB_TOO_BIG_RECORD);
+ }
+ }
+
+ if (dict_index_get_page(index)
+ == buf_block_get_page_no(btr_cur_get_block(cursor))) {
+
+ /* The page is the root page */
+ *rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
+ } else {
+ *rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
+
+#ifdef BTR_CUR_ADAPT
+ btr_search_update_hash_on_insert(cursor);
+#endif
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+
+ lock_update_insert(btr_cur_get_block(cursor), *rec);
+ }
+
+ if (n_extents > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(DB_SUCCESS);
+}
+
+/*==================== B-TREE UPDATE =========================*/
+
+/*************************************************************//**
+For an update, checks the locks and does the undo logging.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INLINE
+ulint
+btr_cur_upd_lock_and_undo(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on record to update */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ roll_ptr_t* roll_ptr)/*!< out: roll pointer */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ ulint err;
+
+ ut_ad(cursor && update && thr && roll_ptr);
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ if (!dict_index_is_clust(index)) {
+ /* We do undo logging only when we update a clustered index
+ record */
+ return(lock_sec_rec_modify_check_and_lock(
+ flags, btr_cur_get_block(cursor), rec,
+ index, thr, mtr));
+ }
+
+ /* Check if we have to wait for a lock: enqueue an explicit lock
+ request if yes */
+
+ err = DB_SUCCESS;
+
+ if (!(flags & BTR_NO_LOCKING_FLAG)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ err = lock_clust_rec_modify_check_and_lock(
+ flags, btr_cur_get_block(cursor), rec, index,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap), thr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ /* Append the info about the update in the undo log */
+
+ err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
+ index, NULL, update,
+ cmpl_info, rec, roll_ptr);
+ return(err);
+}
+
+/***********************************************************//**
+Writes a redo log record of updating a record in-place. */
+UNIV_INLINE
+void
+btr_cur_update_in_place_log(
+/*========================*/
+ ulint flags, /*!< in: flags */
+ rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index where cursor positioned */
+ const upd_t* update, /*!< in: update vector */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr, /*!< in: roll ptr */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ page_t* page = page_align(rec);
+ ut_ad(flags < 256);
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
+ ? MLOG_COMP_REC_UPDATE_IN_PLACE
+ : MLOG_REC_UPDATE_IN_PLACE,
+ 1 + DATA_ROLL_PTR_LEN + 14 + 2
+ + MLOG_BUF_MARGIN);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery */
+ return;
+ }
+
+ /* The code below assumes index is a clustered index: change index to
+ the clustered index if we are updating a secondary index record (or we
+ could as well skip writing the sys col values to the log in this case
+ because they are not needed for a secondary index record update) */
+
+ index = dict_table_get_first_index(index->table);
+
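+	/* The log record body written below is: a 1-byte flags field,
+	the system column values (position of DB_TRX_ID, trx id, roll
+	ptr), the 2-byte page offset of the record, and finally the
+	update vector; btr_cur_parse_update_in_place() reads it back
+	in the same order. */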
+ mach_write_to_1(log_ptr, flags);
+ log_ptr++;
+
+ log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
+ mtr);
+ mach_write_to_2(log_ptr, page_offset(rec));
+ log_ptr += 2;
+
+ row_upd_index_write_log(update, log_ptr, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index) /*!< in: index corresponding to page */
+{
+ ulint flags;
+ rec_t* rec;
+ upd_t* update;
+ ulint pos;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint rec_offset;
+ mem_heap_t* heap;
+ ulint* offsets;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ flags = mach_read_from_1(ptr);
+ ptr++;
+
+ ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ rec_offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(rec_offset <= UNIV_PAGE_SIZE);
+
+ heap = mem_heap_create(256);
+
+ ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
+
+ if (!ptr || !page) {
+
+ goto func_exit;
+ }
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+ rec = page + rec_offset;
+
+	/* We do not need to reserve btr_search_latch, as the page is only
+	being recovered, and there cannot be a hash index on it. */
+
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
+ pos, trx_id, roll_ptr);
+ }
+
+ row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+See if there is enough space in the page modification log to log
+an update-in-place.
+@return TRUE if there is enough space */
+static
+ibool
+btr_cur_update_alloc_zip(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ buf_block_t* block, /*!< in/out: buffer page */
+ dict_index_t* index, /*!< in: the index corresponding to the block */
+ ulint length, /*!< in: size needed */
+ ibool create, /*!< in: TRUE=delete-and-insert,
+ FALSE=update-in-place */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_a(page_zip == buf_block_get_page_zip(block));
+ ut_ad(page_zip);
+ ut_ad(!dict_index_is_ibuf(index));
+
+ if (page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ return(TRUE);
+ }
+
+ if (!page_zip->m_nonempty) {
+ /* The page has been freshly compressed, so
+ recompressing it will not help. */
+ return(FALSE);
+ }
+
+ if (!page_zip_compress(page_zip, buf_block_get_frame(block),
+ index, mtr)) {
+ /* Unable to compress the page */
+ return(FALSE);
+ }
+
+ /* After recompressing a page, we must make sure that the free
+ bits in the insert buffer bitmap will not exceed the free
+ space on the page. Because this function will not attempt
+ recompression unless page_zip_available() fails above, it is
+ safe to reset the free bits if page_zip_available() fails
+ again, below. The free bits can safely be reset in a separate
+ mini-transaction. If page_zip_available() succeeds below, we
+ can be sure that the page_zip_compress() above did not reduce
+ the free space available on the page. */
+
+ if (!page_zip_available(page_zip, dict_index_is_clust(index),
+ length, create)) {
+ /* Out of space: reset the free bits. */
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(buf_block_get_frame(block))) {
+ ibuf_reset_free_bits(block);
+ }
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+We assume here that the ordering fields of the record do not change.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ page_zip_des_t* page_zip;
+ ulint err;
+ rec_t* rec;
+ roll_ptr_t roll_ptr = ut_dulint_zero;
+ trx_t* trx;
+ ulint was_delete_marked;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+
+ trx = thr_get_trx(thr);
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(trx, index, "update ");
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif /* UNIV_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+ page_zip = buf_block_get_page_zip(block);
+
+ /* Check that enough space is available on the compressed page. */
+ if (UNIV_LIKELY_NULL(page_zip)
+ && !btr_cur_update_alloc_zip(page_zip, block, index,
+ rec_offs_size(offsets), FALSE, mtr)) {
+ return(DB_ZIP_OVERFLOW);
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+
+ if (block->is_hashed) {
+		/* The function row_upd_changes_ord_field_binary works only
+		if the update vector was built for a clustered index; we must
+		NOT call it if the index is secondary */
+
+ if (!dict_index_is_clust(index)
+ || row_upd_changes_ord_field_binary(NULL, index, update)) {
+
+ /* Remove possible hash index pointer to this record */
+ btr_search_update_hash_on_delete(cursor);
+ }
+
+ rw_lock_x_lock(&btr_search_latch);
+ }
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_rec_sys_fields(rec, NULL,
+ index, offsets, trx, roll_ptr);
+ }
+
+ was_delete_marked = rec_get_deleted_flag(
+ rec, page_is_comp(buf_block_get_frame(block)));
+
+ row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+
+ if (block->is_hashed) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+
+ if (page_zip && !dict_index_is_clust(index)
+ && page_is_leaf(buf_block_get_frame(block))) {
+ /* Update the free bits in the insert buffer. */
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ btr_cur_update_in_place_log(flags, rec, index, update,
+ trx, roll_ptr, mtr);
+
+ if (was_delete_marked
+ && !rec_get_deleted_flag(rec, page_is_comp(
+ buf_block_get_frame(block)))) {
+ /* The new updated record owns its possible externally
+ stored fields */
+
+ btr_cur_unmark_extern_fields(page_zip,
+ rec, index, offsets, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended. We assume here that the ordering
+fields of the record do not change.
+@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
+DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
+there is not enough space left on the compressed page */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ dict_index_t* index;
+ page_cur_t* page_cursor;
+ ulint err;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ rec_t* orig_rec;
+ ulint max_size;
+ ulint new_rec_size;
+ ulint old_rec_size;
+ dtuple_t* new_entry;
+ roll_ptr_t roll_ptr;
+ trx_t* trx;
+ mem_heap_t* heap;
+ ulint i;
+ ulint n_ext;
+ ulint* offsets;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ orig_rec = rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+
+ heap = mem_heap_create(1024);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr), index, "update ");
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (!row_upd_changes_field_size_or_external(index, offsets, update)) {
+
+ /* The simplest and the most common case: the update does not
+ change the size of any field and none of the updated fields is
+ externally stored in rec or update, and there is enough space
+ on the compressed page to log the update. */
+
+ mem_heap_free(heap);
+ return(btr_cur_update_in_place(flags, cursor, update,
+ cmpl_info, thr, mtr));
+ }
+
+ if (rec_offs_any_extern(offsets)) {
+any_extern:
+		/* Externally stored fields are handled by the pessimistic
+		update */
+
+ mem_heap_free(heap);
+ return(DB_OVERFLOW);
+ }
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
+
+ goto any_extern;
+ }
+ }
+
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+ &n_ext, heap);
+ /* We checked above that there are no externally stored fields. */
+ ut_a(!n_ext);
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr.
+ Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ FALSE, heap);
+ old_rec_size = rec_offs_size(offsets);
+ new_rec_size = rec_get_converted_size(index, new_entry, 0);
+
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (UNIV_LIKELY_NULL(page_zip)
+ && !btr_cur_update_alloc_zip(page_zip, block, index,
+ new_rec_size, TRUE, mtr)) {
+ err = DB_ZIP_OVERFLOW;
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(new_rec_size
+ >= (page_get_free_space_of_empty(page_is_comp(page))
+ / 2))) {
+
+ err = DB_OVERFLOW;
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_get_data_size(page)
+ - old_rec_size + new_rec_size
+ < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
+
+ /* The page would become too empty */
+
+ err = DB_UNDERFLOW;
+ goto err_exit;
+ }
+
+ max_size = old_rec_size
+ + page_get_max_insert_size_after_reorganize(page, 1);
+
+ if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
+ && (max_size >= new_rec_size))
+ || (page_get_n_recs(page) <= 1))) {
+
+ /* There was not enough space, or it did not pay to
+ reorganize: for simplicity, we decide what to do assuming a
+ reorganization is needed, though it might not be necessary */
+
+ err = DB_OVERFLOW;
+ goto err_exit;
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ /* Ok, we may do the replacement. Store on the page infimum the
+ explicit locks on rec, before deleting rec (see the comment in
+ btr_cur_pessimistic_update). */
+
+ lock_rec_store_on_page_infimum(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+
+	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
+	invokes rec_offs_make_valid() to make offsets describe the copied
+	record that the fields of new_entry point to. We have to undo
+	that here. */
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);
+
+ page_cur_delete_rec(page_cursor, index, offsets, mtr);
+
+ page_cur_move_to_prev(page_cursor);
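+	/* page_cur_delete_rec() leaves the page cursor on the record
+	next to the deleted one; step back to its predecessor so that
+	the updated entry is re-inserted at the original position. */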
+
+ trx = thr_get_trx(thr);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+ roll_ptr);
+ row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+ trx->id);
+ }
+
+ /* There are no externally stored columns in new_entry */
+ rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
+ ut_a(rec); /* <- We calculated above the insert would fit */
+
+ if (page_zip && !dict_index_is_clust(index)
+ && page_is_leaf(page)) {
+ /* Update the free bits in the insert buffer. */
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ /* Restore the old explicit lock state on the record */
+
+ lock_rec_restore_from_page_infimum(block, rec, block);
+
+ page_cur_move_to_next(page_cursor);
+
+ err = DB_SUCCESS;
+err_exit:
+ mem_heap_free(heap);
+ return(err);
+}
+
+/*************************************************************//**
+If, in a split, a new supremum record was created as the predecessor of the
+updated record, the supremum record must inherit exactly the locks on the
+updated record. In the split it may have inherited locks from the successor
+of the updated record, which is not correct. This function restores the
+right locks for the new supremum. */
+static
+void
+btr_cur_pess_upd_restore_supremum(
+/*==============================*/
+ buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: updated record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ buf_block_t* prev_block;
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+
+ page = buf_block_get_frame(block);
+
+ if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
+ /* Updated record is not the first user record on its page */
+
+ return;
+ }
+
+ space = buf_block_get_space(block);
+ zip_size = buf_block_get_zip_size(block);
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ ut_ad(prev_page_no != FIL_NULL);
+ prev_block = buf_page_get_with_no_latch(space, zip_size,
+ prev_page_no, mtr);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(prev_block->frame, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ /* We must already have an x-latch on prev_block! */
+ ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
+
+ lock_rec_reset_and_inherit_gap_locks(prev_block, block,
+ PAGE_HEAP_NO_SUPREMUM,
+ page_rec_get_heap_no(rec));
+}
+
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. We assume
+here that the ordering fields of the record do not change.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is allowed to
+				also contain trx id and roll ptr fields, but
+				the values in the update vector have no
+				effect */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* big_rec_vec = NULL;
+ big_rec_t* dummy_big_rec;
+ dict_index_t* index;
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* rec;
+ page_cur_t* page_cursor;
+ dtuple_t* new_entry;
+ ulint err;
+ ulint optim_err;
+ roll_ptr_t roll_ptr;
+ trx_t* trx;
+ ibool was_first;
+ ulint n_extents = 0;
+ ulint n_reserved;
+ ulint n_ext;
+ ulint* offsets = NULL;
+
+ *big_rec = NULL;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ /* The insert buffer tree should never be updated in place. */
+ ut_ad(!dict_index_is_ibuf(index));
+
+ optim_err = btr_cur_optimistic_update(flags, cursor, update,
+ cmpl_info, thr, mtr);
+
+ switch (optim_err) {
+ case DB_UNDERFLOW:
+ case DB_OVERFLOW:
+ case DB_ZIP_OVERFLOW:
+ break;
+ default:
+ return(optim_err);
+ }
+
+ /* Do lock checking and undo logging */
+ err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+ thr, mtr, &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (optim_err == DB_OVERFLOW) {
+ ulint reserve_flag;
+
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the update will not fail because
+ of lack of space */
+
+ n_extents = cursor->tree_height / 16 + 3;
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+ reserve_flag = FSP_CLEANING;
+ } else {
+ reserve_flag = FSP_NORMAL;
+ }
+
+ if (!fsp_reserve_free_extents(&n_reserved, index->space,
+ n_extents, reserve_flag, mtr)) {
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);
+
+ trx = thr_get_trx(thr);
+
+ new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+ &n_ext, *heap);
+ /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
+ invokes rec_offs_make_valid() to point to the copied record that
+ the fields of new_entry point to. We have to undo it here. */
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ rec_offs_make_valid(rec, index, offsets);
+
+ /* The page containing the clustered index record
+ corresponding to new_entry is latched in mtr. If the
+ clustered index record is delete-marked, then its externally
+ stored fields cannot have been purged yet, because then the
+ purge would also have removed the clustered index record
+ itself. Thus the following call is safe. */
+ row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+ FALSE, *heap);
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+ roll_ptr);
+ row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+ trx->id);
+ }
+
+ if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
+ /* We are in a transaction rollback undoing a row
+ update: we must free possible externally stored fields
+ which got new values in the update, if they are not
+ inherited values. They can be inherited if we have
+ updated the primary key to another value, and then
+ update it back again. */
+
+ ut_ad(big_rec_vec == NULL);
+
+ btr_rec_free_updated_extern_fields(
+ index, rec, page_zip, offsets, update,
+ trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
+ }
+
+ /* We have to set appropriate extern storage bits in the new
+ record to be inserted: we have to remember which fields were
+ stored externally */
+
+ ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
+ n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ ut_ad(page_is_comp(page));
+ if (page_zip_rec_needs_ext(
+ rec_get_converted_size(index, new_entry, n_ext),
+ TRUE,
+ dict_index_get_n_fields(index),
+ page_zip_get_size(page_zip))) {
+
+ goto make_external;
+ }
+ } else if (page_zip_rec_needs_ext(
+ rec_get_converted_size(index, new_entry, n_ext),
+ page_is_comp(page), 0, 0)) {
+make_external:
+ big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
+ if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+ err = DB_TOO_BIG_RECORD;
+ goto return_after_reservations;
+ }
+ }
+
+ /* Store state of explicit locks on rec on the page infimum record,
+ before deleting rec. The page infimum acts as a dummy carrier of the
+ locks, taking care also of lock releases, before we can move the locks
+ back on the actual record. A special case arises if we are
+ inserting on the root page and the insert causes a call of
+ btr_root_raise_and_insert: in that case we cannot, in the lock
+ system, delete the lock structs set on the root page even if the
+ root page carries just node pointers. */
+
+ lock_rec_store_on_page_infimum(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cursor = btr_cur_get_page_cur(cursor);
+
+ page_cur_delete_rec(page_cursor, index, offsets, mtr);
+
+ page_cur_move_to_prev(page_cursor);
+
+ rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
+
+ if (rec) {
+ lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+ rec, block);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, heap);
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+ btr_cur_unmark_extern_fields(page_zip,
+ rec, index, offsets, mtr);
+ }
+
+ btr_cur_compress_if_useful(cursor, mtr);
+
+ if (page_zip && !dict_index_is_clust(index)
+ && page_is_leaf(page)) {
+ /* Update the free bits in the insert buffer. */
+ ibuf_update_free_bits_zip(block, mtr);
+ }
+
+ err = DB_SUCCESS;
+ goto return_after_reservations;
+ } else {
+ ut_a(optim_err != DB_UNDERFLOW);
+
+ /* Out of space: reset the free bits. */
+ if (!dict_index_is_clust(index)
+ && page_is_leaf(page)) {
+ ibuf_reset_free_bits(block);
+ }
+ }
+
+ /* Was the record to be updated positioned as the first user
+ record on its page? */
+ was_first = page_cur_is_before_first(page_cursor);
+
+ /* The first parameter means that no lock checking and undo logging
+ is made in the insert */
+
+ err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
+ | BTR_NO_LOCKING_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ cursor, new_entry, &rec,
+ &dummy_big_rec, n_ext, NULL, mtr);
+ ut_a(rec);
+ ut_a(err == DB_SUCCESS);
+ ut_a(dummy_big_rec == NULL);
+
+ if (dict_index_is_sec_or_ibuf(index)) {
+ /* Update PAGE_MAX_TRX_ID in the index page header.
+ It was not updated by btr_cur_pessimistic_insert()
+ because of BTR_NO_LOCKING_FLAG. */
+ buf_block_t* rec_block;
+
+ rec_block = btr_cur_get_block(cursor);
+
+ page_update_max_trx_id(rec_block,
+ buf_block_get_page_zip(rec_block),
+ trx->id, mtr);
+ }
+
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+ /* The new inserted record owns its possible externally
+ stored fields */
+ buf_block_t* rec_block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+ page = buf_block_get_frame(rec_block);
+#endif /* UNIV_ZIP_DEBUG */
+ page_zip = buf_block_get_page_zip(rec_block);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, heap);
+ btr_cur_unmark_extern_fields(page_zip,
+ rec, index, offsets, mtr);
+ }
+
+ lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+ rec, block);
+
+ /* If necessary, restore also the correct lock state for a new,
+ preceding supremum record created in a page split. While the old
+ record was nonexistent, the supremum might have inherited its locks
+ from a wrong record. */
+
+ if (!was_first) {
+ btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
+ rec, mtr);
+ }
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (n_extents > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ *big_rec = big_rec_vec;
+
+ return(err);
+}
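+
+/* Illustrative sketch only: a typical caller of btr_cur_pessimistic_update()
+(compare row_upd_clust_rec() in row0upd.c) must itself write the externally
+stored columns reported back in *big_rec, after re-latching the tree and
+re-positioning the cursor on the inserted record. The helper name below is
+invented for this sketch; releasing big_rec and all other error handling are
+elided. */
+#if 0
+static ulint
+btr_cur_pessimistic_update_caller_sketch(
+/*=====================================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to update */
+ const upd_t* update, /*!< in: update vector */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ big_rec_t* big_rec = NULL;
+ mem_heap_t* heap = NULL;
+ ulint err;
+
+ err = btr_cur_pessimistic_update(0, cursor, &heap, &big_rec,
+ update, 0, thr, mtr);
+ mtr_commit(mtr); /* must be committed before latching further pages */
+
+ if (err == DB_SUCCESS && big_rec != NULL) {
+ mtr_start(mtr);
+ /* Re-latch the tree and re-position the cursor on the
+ just-inserted record here (elided). */
+
+ err = btr_store_big_rec_extern_fields(
+ cursor->index, btr_cur_get_block(cursor),
+ btr_cur_get_rec(cursor),
+ rec_get_offsets(btr_cur_get_rec(cursor),
+ cursor->index, NULL,
+ ULINT_UNDEFINED, &heap),
+ big_rec, mtr);
+
+ mtr_commit(mtr);
+ }
+
+ if (heap != NULL) {
+ mem_heap_free(heap);
+ }
+
+ return(err);
+}
+#endif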
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/****************************************************************//**
+Writes the redo log record for delete marking or unmarking of an index
+record. */
+UNIV_INLINE
+void
+btr_cur_del_mark_set_clust_rec_log(
+/*===============================*/
+ ulint flags, /*!< in: flags */
+ rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index of the record */
+ ibool val, /*!< in: value to set */
+ trx_t* trx, /*!< in: deleting transaction */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ ut_ad(flags < 256);
+ ut_ad(val <= 1);
+
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index,
+ page_rec_is_comp(rec)
+ ? MLOG_COMP_REC_CLUST_DELETE_MARK
+ : MLOG_REC_CLUST_DELETE_MARK,
+ 1 + 1 + DATA_ROLL_PTR_LEN
+ + 14 + 2);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery */
+ return;
+ }
+
+ mach_write_to_1(log_ptr, flags);
+ log_ptr++;
+ mach_write_to_1(log_ptr, val);
+ log_ptr++;
+
+ log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
+ mtr);
+ mach_write_to_2(log_ptr, page_offset(rec));
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+}
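+
+/* For reference, the body of the MLOG_REC_CLUST_DELETE_MARK /
+MLOG_COMP_REC_CLUST_DELETE_MARK record written above is:
+
+ 1 byte flags
+ 1 byte delete-mark value (0 or 1)
+ variable system column info written by
+ row_upd_write_sys_vals_to_log(): the clustered index
+ position of DB_TRX_ID, the roll pointer
+ (DATA_ROLL_PTR_LEN bytes) and the transaction id
+ 2 bytes page offset of the record
+
+btr_cur_parse_del_mark_set_clust_rec() below consumes exactly this layout. */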
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index) /*!< in: index corresponding to page */
+{
+ ulint flags;
+ ulint val;
+ ulint pos;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint offset;
+ rec_t* rec;
+
+ ut_ad(!page
+ || !!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ flags = mach_read_from_1(ptr);
+ ptr++;
+ val = mach_read_from_1(ptr);
+ ptr++;
+
+ ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(offset <= UNIV_PAGE_SIZE);
+
+ if (page) {
+ rec = page + offset;
+
+ /* We do not need to reserve btr_search_latch, as the page
+ is only being recovered, and there cannot be a hash index to
+ it. */
+
+ btr_rec_set_deleted_flag(rec, page_zip, val);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ row_upd_rec_sys_fields_in_recovery(
+ rec, page_zip,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ pos, trx_id, roll_ptr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+ }
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field a pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ roll_ptr_t roll_ptr;
+ ulint err;
+ rec_t* rec;
+ page_zip_des_t* page_zip;
+ trx_t* trx;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ rec = btr_cur_get_rec(cursor);
+ index = cursor->index;
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
+ rec_print_new(stderr, rec, offsets);
+ }
+#endif /* UNIV_DEBUG */
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+
+ err = lock_clust_rec_modify_check_and_lock(flags,
+ btr_cur_get_block(cursor),
+ rec, index, offsets, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
+ index, NULL, NULL, 0, rec,
+ &roll_ptr);
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ block = btr_cur_get_block(cursor);
+
+ if (block->is_hashed) {
+ rw_lock_x_lock(&btr_search_latch);
+ }
+
+ page_zip = buf_block_get_page_zip(block);
+
+ btr_rec_set_deleted_flag(rec, page_zip, val);
+
+ trx = thr_get_trx(thr);
+
+ if (!(flags & BTR_KEEP_SYS_FLAG)) {
+ row_upd_rec_sys_fields(rec, page_zip,
+ index, offsets, trx, roll_ptr);
+ }
+
+ if (block->is_hashed) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+
+ btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
+ roll_ptr, mtr);
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/****************************************************************//**
+Writes the redo log record for a delete mark setting of a secondary
+index record. */
+UNIV_INLINE
+void
+btr_cur_del_mark_set_sec_rec_log(
+/*=============================*/
+ rec_t* rec, /*!< in: record */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ ut_ad(val <= 1);
+
+ log_ptr = mlog_open(mtr, 11 + 1 + 2);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery:
+ in that case mlog_open returns NULL */
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
+ mach_write_to_1(log_ptr, val);
+ log_ptr++;
+
+ mach_write_to_2(log_ptr, page_offset(rec));
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+}
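+
+/* For reference, the record written above consists of the initial log record
+produced by mlog_write_initial_log_record_fast() (at most 11 bytes, hence the
+mlog_open(mtr, 11 + 1 + 2) above), followed by 1 byte for the delete-mark
+value and 2 bytes for the page offset of the record; the parser below reads
+back only the trailing 1 + 2 bytes. */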
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
+{
+ ulint val;
+ ulint offset;
+ rec_t* rec;
+
+ if (end_ptr < ptr + 3) {
+
+ return(NULL);
+ }
+
+ val = mach_read_from_1(ptr);
+ ptr++;
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(offset <= UNIV_PAGE_SIZE);
+
+ if (page) {
+ rec = page + offset;
+
+ /* We do not need to reserve btr_search_latch, as the page
+ is only being recovered, and there cannot be a hash index to
+ it. */
+
+ btr_rec_set_deleted_flag(rec, page_zip, val);
+ }
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Sets a secondary index record delete mark to TRUE or FALSE.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+ ulint flags, /*!< in: locking flag */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ ulint err;
+
+ block = btr_cur_get_block(cursor);
+ rec = btr_cur_get_rec(cursor);
+
+#ifdef UNIV_DEBUG
+ if (btr_cur_print_record_ops && thr) {
+ btr_cur_trx_report(thr_get_trx(thr), cursor->index,
+ "del mark ");
+ rec_print(stderr, rec, cursor->index);
+ }
+#endif /* UNIV_DEBUG */
+
+ err = lock_sec_rec_modify_check_and_lock(flags,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, thr, mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ ut_ad(!!page_rec_is_comp(rec)
+ == dict_table_is_comp(cursor->index->table));
+
+ if (block->is_hashed) {
+ rw_lock_x_lock(&btr_search_latch);
+ }
+
+ btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
+
+ if (block->is_hashed) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+
+ btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Clear a secondary index record's delete mark. This function is only
+used by the insert buffer insert merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_del_unmark_for_ibuf(
+/*========================*/
+ rec_t* rec, /*!< in/out: record to delete unmark */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page
+ corresponding to rec, or NULL
+ when the tablespace is
+ uncompressed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* We do not need to reserve btr_search_latch, as the page has just
+ been read to the buffer pool and there cannot be a hash index to it. */
+
+ btr_rec_set_deleted_flag(rec, page_zip, FALSE);
+
+ btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to compress;
+ cursor does not stay valid if compression
+ occurs */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr_memo_contains(mtr,
+ dict_index_get_lock(btr_cur_get_index(cursor)),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ return(btr_cur_compress_recommendation(cursor, mtr)
+ && btr_compress(cursor, mtr));
+}
+
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned on a leaf page.
+It is assumed that the mtr has an x-latch on the page where the cursor is
+positioned, but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
+ delete; cursor stays valid: if deletion
+ succeeds, on function exit it points to the
+ successor of the deleted record */
+ mtr_t* mtr) /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ibool no_compress_needed;
+ rec_offs_init(offsets_);
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+ /* This is intended only for leaf page deletions */
+
+ block = btr_cur_get_block(cursor);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(DB_CORRUPTION);
+ }
+ ut_a(block);
+
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ rec = btr_cur_get_rec(cursor);
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ no_compress_needed = !rec_offs_any_extern(offsets)
+ && btr_cur_can_delete_without_compress(
+ cursor, rec_offs_size(offsets), mtr);
+
+ if (no_compress_needed) {
+
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ ulint max_ins = 0;
+
+ lock_update_delete(block, rec);
+
+ btr_search_update_hash_on_delete(cursor);
+
+ if (!page_zip) {
+ max_ins = page_get_max_insert_size_after_reorganize(
+ page, 1);
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+ cursor->index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (dict_index_is_clust(cursor->index)
+ || dict_index_is_ibuf(cursor->index)
+ || !page_is_leaf(page)) {
+ /* The insert buffer does not handle
+ inserts to clustered indexes, to
+ non-leaf pages of secondary index B-trees,
+ or to the insert buffer. */
+ } else if (page_zip) {
+ ibuf_update_free_bits_zip(block, mtr);
+ } else {
+ ibuf_update_free_bits_low(block, max_ins, mtr);
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(no_compress_needed);
+}
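+
+/* Illustrative sketch only: callers (for example the purge code in
+row0purge.c) normally attempt btr_cur_optimistic_delete() under a leaf-page
+latch first, and fall back to btr_cur_pessimistic_delete() under an index
+tree X-latch only when the page would become too empty. The helper name
+below is invented for this sketch; re-positioning the cursor between the two
+attempts and choosing the rollback context are the caller's business. */
+#if 0
+static void
+btr_cur_delete_fallback_sketch(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete */
+ mtr_t* mtr) /*!< in: mtr holding the leaf page x-latch */
+{
+ if (!btr_cur_optimistic_delete(cursor, mtr)) {
+ ulint err;
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+ /* Re-latch the tree in BTR_MODIFY_TREE mode and re-position
+ the cursor on the record here (elided). */
+
+ btr_cur_pessimistic_delete(&err, FALSE, cursor,
+ RB_NONE /* or the caller's rb_ctx */, mtr);
+ ut_a(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
+ }
+
+ mtr_commit(mtr);
+}
+#endif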
+
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ dict_index_t* index;
+ rec_t* rec;
+ dtuple_t* node_ptr;
+ ulint n_extents = 0;
+ ulint n_reserved;
+ ibool success;
+ ibool ret = FALSE;
+ ulint level;
+ mem_heap_t* heap;
+ ulint* offsets;
+
+ block = btr_cur_get_block(cursor);
+ page = buf_block_get_frame(block);
+ index = btr_cur_get_index(cursor);
+
+ ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ if (!has_reserved_extents) {
+ /* First reserve enough free space for the file segments
+ of the index tree, so that the node pointer updates will
+ not fail because of lack of space */
+
+ n_extents = cursor->tree_height / 32 + 1;
+
+ success = fsp_reserve_free_extents(&n_reserved,
+ index->space,
+ n_extents,
+ FSP_CLEANING, mtr);
+ if (!success) {
+ *err = DB_OUT_OF_FILE_SPACE;
+
+ return(FALSE);
+ }
+ }
+
+ heap = mem_heap_create(1024);
+ rec = btr_cur_get_rec(cursor);
+ page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ if (rec_offs_any_extern(offsets)) {
+ btr_rec_free_externally_stored_fields(index,
+ rec, offsets, page_zip,
+ rb_ctx, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
+ && UNIV_UNLIKELY(dict_index_get_page(index)
+ != buf_block_get_page_no(block))) {
+
+ /* If there is only one record, drop the whole page in
+ btr_discard_page, if this is not the root page */
+
+ btr_discard_page(cursor, mtr);
+
+ *err = DB_SUCCESS;
+ ret = TRUE;
+
+ goto return_after_reservations;
+ }
+
+ lock_update_delete(block, rec);
+ level = btr_page_get_level(page, mtr);
+
+ if (level > 0
+ && UNIV_UNLIKELY(rec == page_rec_get_next(
+ page_get_infimum_rec(page)))) {
+
+ rec_t* next_rec = page_rec_get_next(rec);
+
+ if (btr_page_get_prev(page, mtr) == FIL_NULL) {
+
+ /* If we delete the leftmost node pointer on a
+ non-leaf level, we must mark the new leftmost node
+ pointer as the predefined minimum record */
+
+ /* This will make page_zip_validate() fail until
+ page_cur_delete_rec() completes. This is harmless,
+ because everything will take place within a single
+ mini-transaction and because writing to the redo log
+ is an atomic operation (performed by mtr_commit()). */
+ btr_set_min_rec_mark(next_rec, mtr);
+ } else {
+ /* Otherwise, if we delete the leftmost node pointer
+ on a page, we have to change the father node pointer
+ so that it is equal to the new leftmost node pointer
+ on the page */
+
+ btr_node_ptr_delete(index, block, mtr);
+
+ node_ptr = dict_index_build_node_ptr(
+ index, next_rec, buf_block_get_page_no(block),
+ heap, level);
+
+ btr_insert_on_non_leaf_level(index,
+ level + 1, node_ptr, mtr);
+ }
+ }
+
+ btr_search_update_hash_on_delete(cursor);
+
+ page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ ut_ad(btr_check_node_ptr(index, block, mtr));
+
+ *err = DB_SUCCESS;
+
+return_after_reservations:
+ mem_heap_free(heap);
+
+ if (ret == FALSE) {
+ ret = btr_cur_compress_if_useful(cursor, mtr);
+ }
+
+ if (n_extents > 0) {
+ fil_space_release_free_extents(index->space, n_reserved);
+ }
+
+ return(ret);
+}
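+
+/* Note on the reservation above: unless the caller has already reserved
+extents, n_extents = tree_height / 32 + 1 extents are reserved with
+FSP_CLEANING so that the node pointer updates cannot fail for lack of space;
+for a typical three-level tree this is 3 / 32 + 1 = 1 extent (normally 64
+pages). The reservation is released at the end of the function via
+fil_space_release_free_extents(). */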
+
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+ btr_cur_t* cursor, /*!< in: cursor positioned on a page */
+ ulint height, /*!< in: height of the page in tree;
+ 0 means leaf node */
+ ulint root_height) /*!< in: root node height in tree */
+{
+ btr_path_t* slot;
+ rec_t* rec;
+
+ ut_a(cursor->path_arr);
+
+ if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
+ /* Do nothing; return empty path */
+
+ slot = cursor->path_arr;
+ slot->nth_rec = ULINT_UNDEFINED;
+
+ return;
+ }
+
+ if (height == 0) {
+ /* Mark end of slots for path */
+ slot = cursor->path_arr + root_height + 1;
+ slot->nth_rec = ULINT_UNDEFINED;
+ }
+
+ rec = btr_cur_get_rec(cursor);
+
+ slot = cursor->path_arr + (root_height - height);
+
+ slot->nth_rec = page_rec_get_n_recs_before(rec);
+ slot->n_recs = page_get_n_recs(page_align(rec));
+}
+
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */
+ ulint mode1, /*!< in: search mode for range start */
+ const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */
+ ulint mode2) /*!< in: search mode for range end */
+{
+ btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
+ btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
+ btr_cur_t cursor;
+ btr_path_t* slot1;
+ btr_path_t* slot2;
+ ibool diverged;
+ ibool diverged_lot;
+ ulint divergence_level;
+ ib_int64_t n_rows;
+ ulint i;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path1;
+
+ if (dtuple_get_n_fields(tuple1) > 0) {
+
+ btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ } else {
+ btr_cur_open_at_index_side(TRUE, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path2;
+
+ if (dtuple_get_n_fields(tuple2) > 0) {
+
+ btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ } else {
+ btr_cur_open_at_index_side(FALSE, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ /* We have the path information for the range in path1 and path2 */
+
+ n_rows = 1;
+ diverged = FALSE; /* This becomes true when the path is not
+ the same any more */
+ diverged_lot = FALSE; /* This becomes true when the paths are
+ not the same or adjacent any more */
+ divergence_level = 1000000; /* This is the level where paths diverged
+ a lot */
+ for (i = 0; ; i++) {
+ ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+ slot1 = path1 + i;
+ slot2 = path2 + i;
+
+ if (slot1->nth_rec == ULINT_UNDEFINED
+ || slot2->nth_rec == ULINT_UNDEFINED) {
+
+ if (i > divergence_level + 1) {
+ /* In trees whose height is > 1 our algorithm
+ tends to underestimate: multiply the estimate
+ by 2: */
+
+ n_rows = n_rows * 2;
+ }
+
+ /* Do not estimate the number of rows in the range
+ to over 1 / 2 of the estimated rows in the whole
+ table */
+
+ if (n_rows > index->table->stat_n_rows / 2) {
+ n_rows = index->table->stat_n_rows / 2;
+
+ /* If there are just 0 or 1 rows in the table,
+ then we estimate all rows are in the range */
+
+ if (n_rows == 0) {
+ n_rows = index->table->stat_n_rows;
+ }
+ }
+
+ return(n_rows);
+ }
+
+ if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+ diverged = TRUE;
+
+ if (slot1->nth_rec < slot2->nth_rec) {
+ n_rows = slot2->nth_rec - slot1->nth_rec;
+
+ if (n_rows > 1) {
+ diverged_lot = TRUE;
+ divergence_level = i;
+ }
+ } else {
+ /* Maybe the tree has changed between
+ searches */
+
+ return(10);
+ }
+
+ } else if (diverged && !diverged_lot) {
+
+ if (slot1->nth_rec < slot1->n_recs
+ || slot2->nth_rec > 1) {
+
+ diverged_lot = TRUE;
+ divergence_level = i;
+
+ n_rows = 0;
+
+ if (slot1->nth_rec < slot1->n_recs) {
+ n_rows += slot1->n_recs
+ - slot1->nth_rec;
+ }
+
+ if (slot2->nth_rec > 1) {
+ n_rows += slot2->nth_rec - 1;
+ }
+ }
+ } else if (diverged_lot) {
+
+ n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
+ / 2;
+ }
+ }
+}
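+
+/* Worked example of the divergence estimate above (figures invented for
+illustration). Suppose the tree has three levels and the two recorded paths
+are, as (nth_rec, n_recs) pairs per level:
+
+ level 0 (root): (4, 10) and (4, 10) paths still equal, n_rows = 1
+ level 1: (2, 150) and (5, 150) diverged: n_rows = 5 - 2 = 3,
+ diverged_lot at level 1
+ level 2 (leaves): (17, 80) and (43, 120) n_rows = 3 * (80 + 120) / 2 = 300
+
+The end-of-path slot is reached at i = 3 > divergence_level + 1 = 2, so the
+estimate is doubled to 600, and it is finally capped to half of
+index->table->stat_n_rows. */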
+
+/*******************************************************************//**
+Estimates the number of leaf pages that contain records whose first n_cols
+key columns are not NULL.
+@return estimated number of pages */
+UNIV_INTERN
+ulint
+btr_estimate_n_pages_not_null(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint n_cols, /*!< in: number of leading key columns required to be not NULL */
+ btr_path_t* path1) /*!< in: path1[BTR_PATH_ARRAY_N_SLOTS] */
+{
+ dtuple_t* tuple1;
+ btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
+ btr_cur_t cursor;
+ btr_path_t* slot1;
+ btr_path_t* slot2;
+ ibool diverged;
+ ibool diverged_lot;
+ ulint divergence_level;
+ ulint n_pages;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(n_cols * sizeof(dfield_t)
+ + sizeof(dtuple_t));
+
+ /* Build tuple1 = (NULL, NULL, ...) with n_cols fields */
+ tuple1 = dtuple_create(heap, n_cols);
+ dict_index_copy_types(tuple1, index, n_cols);
+
+ for (i = 0; i < n_cols; i++) {
+ dfield_set_null(dtuple_get_nth_field(tuple1, i));
+ }
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path1;
+
+ btr_cur_search_to_nth_level(index, 0, tuple1, PAGE_CUR_G,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ cursor.path_arr = path2;
+
+ btr_cur_open_at_index_side(FALSE, index,
+ BTR_SEARCH_LEAF | BTR_ESTIMATE,
+ &cursor, &mtr);
+
+ mtr_commit(&mtr);
+
+ mem_heap_free(heap);
+
+ /* We have the path information for the range in path1 and path2 */
+
+ n_pages = 1;
+ diverged = FALSE; /* This becomes true when the path is not
+ the same any more */
+ diverged_lot = FALSE; /* This becomes true when the paths are
+ not the same or adjacent any more */
+ divergence_level = 1000000; /* This is the level where paths diverged
+ a lot */
+ for (i = 0; ; i++) {
+ ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+ slot1 = path1 + i;
+ slot2 = path2 + i;
+
+ if ((slot1 + 1)->nth_rec == ULINT_UNDEFINED
+ || (slot2 + 1)->nth_rec == ULINT_UNDEFINED) {
+
+ if (i > divergence_level + 1) {
+ /* In trees whose height is > 1 our algorithm
+ tends to underestimate: multiply the estimate
+ by 2: */
+
+ n_pages = n_pages * 2;
+ }
+
+ /* Do not estimate the number of rows in the range
+ to over 1 / 2 of the estimated rows in the whole
+ table */
+
+ if (n_pages > index->stat_n_leaf_pages / 2) {
+ n_pages = index->stat_n_leaf_pages / 2;
+
+ /* If there are just 0 or 1 rows in the table,
+ then we estimate all rows are in the range */
+
+ if (n_pages == 0) {
+ n_pages = index->stat_n_leaf_pages;
+ }
+ }
+
+ return(n_pages);
+ }
+
+ if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+ diverged = TRUE;
+
+ if (slot1->nth_rec < slot2->nth_rec) {
+ n_pages = slot2->nth_rec - slot1->nth_rec;
+
+ if (n_pages > 1) {
+ diverged_lot = TRUE;
+ divergence_level = i;
+ }
+ } else {
+ /* Maybe the tree has changed between
+ searches */
+
+ return(10);
+ }
+
+ } else if (diverged && !diverged_lot) {
+
+ if (slot1->nth_rec < slot1->n_recs
+ || slot2->nth_rec > 1) {
+
+ diverged_lot = TRUE;
+ divergence_level = i;
+
+ n_pages = 0;
+
+ if (slot1->nth_rec < slot1->n_recs) {
+ n_pages += slot1->n_recs
+ - slot1->nth_rec;
+ }
+
+ if (slot2->nth_rec > 1) {
+ n_pages += slot2->nth_rec - 1;
+ }
+ }
+ } else if (diverged_lot) {
+
+ n_pages = (n_pages * (slot1->n_recs + slot2->n_recs))
+ / 2;
+ }
+ }
+}
+
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+ dict_index_t* index) /*!< in: index */
+{
+ btr_cur_t cursor;
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ulint matched_fields;
+ ulint matched_bytes;
+ ib_int64_t* n_diff;
+ ullint n_sample_pages; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ ulint i;
+ ulint j;
+ ullint add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_rec_[REC_OFFS_NORMAL_SIZE];
+ ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets_rec = offsets_rec_;
+ ulint* offsets_next_rec= offsets_next_rec_;
+ ulint stats_method = srv_stats_method;
+ btr_path_t first_rec_path[BTR_PATH_ARRAY_N_SLOTS];
+ ulint effective_pages; /* effective leaf pages */
+ rec_offs_init(offsets_rec_);
+ rec_offs_init(offsets_next_rec_);
+
+ n_cols = dict_index_get_n_unique(index);
+
+ if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+ /* Estimate the number of effective (not-NULL) leaf pages, and
+ record the path to the first effective record. */
+ /* TODO: make it work also for n_cols > 1. */
+ effective_pages = btr_estimate_n_pages_not_null(index, 1 /*k*/, first_rec_path);
+
+ if (!effective_pages) {
+ dict_index_stat_mutex_enter(index);
+ for (j = 0; j <= n_cols; j++) {
+ index->stat_n_diff_key_vals[j] = (ib_int64_t)index->stat_n_leaf_pages;
+ }
+ dict_index_stat_mutex_exit(index);
+ return;
+ } else if (effective_pages > index->stat_n_leaf_pages) {
+ effective_pages = index->stat_n_leaf_pages;
+ }
+ } else {
+ effective_pages = index->stat_n_leaf_pages;
+ }
+
+ n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));
+
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_sample_pages > effective_pages) {
+ if (effective_pages > 0) {
+ n_sample_pages = effective_pages;
+ } else {
+ n_sample_pages = 1;
+ }
+ } else {
+ n_sample_pages = srv_stats_sample_pages;
+ }
+
+ /* We sample some pages in the index to get an estimate */
+
+ for (i = 0; i < n_sample_pages; i++) {
+ rec_t* supremum;
+ ibool is_first_page = TRUE;
+ mtr_start(&mtr);
+
+ if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+ is_first_page = btr_cur_open_at_rnd_pos_after_path(index, BTR_SEARCH_LEAF,
+ first_rec_path, &cursor, &mtr);
+ } else {
+ btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
+ }
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ page = btr_cur_get_page(&cursor);
+
+ if (srv_pass_corrupt_table && !page) {
+ break;
+ }
+ ut_a(page);
+
+ supremum = page_get_supremum_rec(page);
+ if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS && is_first_page) {
+ /* The cursor is positioned on the first effective
+ record of the page; counting starts from here. */
+ rec = btr_cur_get_rec(&cursor);
+ } else {
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ }
+
+ if (rec != supremum) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ ULINT_UNDEFINED, &heap);
+ }
+
+ while (rec != supremum) {
+ rec_t* next_rec;
+ next_rec = page_rec_get_next(rec);
+ if (next_rec == supremum) {
+ break;
+ }
+
+ matched_fields = 0;
+ matched_bytes = 0;
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ n_cols, &heap);
+
+ cmp_rec_rec_with_match(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, &matched_fields,
+ &matched_bytes,
+ (stats_method == SRV_STATS_METHOD_NULLS_NOT_EQUAL)
+ ? SRV_STATS_METHOD_NULLS_NOT_EQUAL
+ : SRV_STATS_METHOD_NULLS_EQUAL);
+
+ for (j = matched_fields + 1; j <= n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ ulint* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ if (btr_page_get_prev(page, &mtr) != FIL_NULL
+ || btr_page_get_next(page, &mtr) != FIL_NULL) {
+
+ n_diff[n_cols]++;
+ }
+ }
+
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ ULINT_UNDEFINED, &heap);
+ total_external_size += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ mtr_commit(&mtr);
+ }
+
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ dict_index_stat_mutex_enter(index);
+
+ for (j = 0; j <= n_cols; j++) {
+ index->stat_n_diff_key_vals[j]
+ = ((n_diff[j]
+ * (ib_int64_t)effective_pages
+ + n_sample_pages - 1
+ + total_external_size
+ + not_empty_flag)
+ / (n_sample_pages
+ + total_external_size));
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick. But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that: */
+
+ add_on = effective_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
+
+ index->stat_n_diff_key_vals[j] += add_on;
+
+ if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+ /* index->stat_n_diff_key_vals[j] is used to calculate
+ rec_per_key, as "stats.records /
+ index->stat_n_diff_key_vals[j]", so scale it up to a value
+ that is based on the whole index, not only on the not-NULL
+ pages that were sampled. */
+ index->stat_n_diff_key_vals[j] =
+ index->stat_n_diff_key_vals[j] * (ib_int64_t)index->stat_n_leaf_pages
+ / (ib_int64_t)effective_pages;
+ }
+ }
+
+ dict_index_stat_mutex_exit(index);
+
+ mem_free(n_diff);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
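+
+/* Worked example of the estimate above (figures invented for illustration).
+With the default stats method, effective_pages = index->stat_n_leaf_pages;
+say that is 10000, n_sample_pages = 8, total_external_size = 0,
+not_empty_flag = 1, and n_diff[j] = 20 key-value borders were seen in the
+sample. Then
+
+ stat_n_diff_key_vals[j] = (20 * 10000 + 8 - 1 + 0 + 1) / (8 + 0) = 25001,
+ add_on = 10000 / (10 * (8 + 0)) = 125, capped to n_sample_pages = 8,
+
+giving an estimate of 25009 distinct values for the j-column prefix. */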
+
+/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
+
+/***********************************************************//**
+Gets the externally stored size of a record, in units of a database page.
+@return externally stored part, in units of a database page */
+static
+ulint
+btr_rec_get_externally_stored_len(
+/*==============================*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ byte* data;
+ ulint local_len;
+ ulint extern_len;
+ ulint total_extern_len = 0;
+ ulint i;
+
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+ n_fields = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ data = rec_get_nth_field(rec, offsets, i, &local_len);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ extern_len = mach_read_from_4(data + local_len
+ + BTR_EXTERN_LEN + 4);
+
+ total_extern_len += ut_calc_align(extern_len,
+ UNIV_PAGE_SIZE);
+ }
+ }
+
+ return(total_extern_len / UNIV_PAGE_SIZE);
+}
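+
+/* For example, with the default UNIV_PAGE_SIZE of 16384 bytes, a column
+whose externally stored part is 100000 bytes long contributes
+ut_calc_align(100000, 16384) = 114688 bytes, i.e. 7 pages, to the total
+returned above. */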
+
+/*******************************************************************//**
+Sets the ownership bit of an externally stored field in a record. */
+static
+void
+btr_cur_set_ownership_of_extern_field(
+/*==================================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: clustered index record */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint i, /*!< in: field number */
+ ibool val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ byte* data;
+ ulint local_len;
+ ulint byte_val;
+
+ data = rec_get_nth_field(rec, offsets, i, &local_len);
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
+
+ if (val) {
+ byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
+ } else {
+ byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
+ }
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
+ page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
+ } else if (UNIV_LIKELY(mtr != NULL)) {
+
+ mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
+ MLOG_1BYTE, mtr);
+ } else {
+ mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
+ }
+}
+
+/*******************************************************************//**
+Marks not updated extern fields as not-owned by this record. The ownership
+is transferred to the updated record which is inserted elsewhere in the
+index tree. In purge, only the owner of an externally stored field is
+allowed to free the field.
+@return TRUE if BLOB ownership was transferred */
+UNIV_INTERN
+ibool
+btr_cur_mark_extern_inherited_fields(
+/*=================================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ ulint n;
+ ulint j;
+ ulint i;
+ ibool change_ownership = FALSE;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+ if (!rec_offs_any_extern(offsets)) {
+
+ return(FALSE);
+ }
+
+ n = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ /* Check it is not in updated fields */
+
+ if (update) {
+ for (j = 0; j < upd_get_n_fields(update);
+ j++) {
+ if (upd_get_nth_field(update, j)
+ ->field_no == i) {
+
+ goto updated;
+ }
+ }
+ }
+
+ btr_cur_set_ownership_of_extern_field(
+ page_zip, rec, index, offsets, i, FALSE, mtr);
+
+ change_ownership = TRUE;
+updated:
+ ;
+ }
+ }
+
+ return(change_ownership);
+}
+
+/*******************************************************************//**
+The complement of the previous function: in an update, the entry may inherit
+some externally stored fields from a record. We must mark them as inherited
+in the entry, so that they are not freed in a rollback. */
+UNIV_INTERN
+void
+btr_cur_mark_dtuple_inherited_extern(
+/*=================================*/
+ dtuple_t* entry, /*!< in/out: updated entry to be
+ inserted to clustered index */
+ const upd_t* update) /*!< in: update vector */
+{
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+ byte* data;
+ ulint len;
+ ulint j;
+
+ if (!dfield_is_ext(dfield)) {
+ continue;
+ }
+
+ /* Check if it is in updated fields */
+
+ for (j = 0; j < upd_get_n_fields(update); j++) {
+ if (upd_get_nth_field(update, j)->field_no == i) {
+
+ goto is_updated;
+ }
+ }
+
+ data = dfield_get_data(dfield);
+ len = dfield_get_len(dfield);
+ data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
+ |= BTR_EXTERN_INHERITED_FLAG;
+
+is_updated:
+ ;
+ }
+}
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a not delete
+marked record always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
+{
+ ulint n;
+ ulint i;
+
+ ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+ n = rec_offs_n_fields(offsets);
+
+ if (!rec_offs_any_extern(offsets)) {
+
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ btr_cur_set_ownership_of_extern_field(
+ page_zip, rec, index, offsets, i, TRUE, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Marks all extern fields in a dtuple as owned by the record. */
+UNIV_INTERN
+void
+btr_cur_unmark_dtuple_extern_fields(
+/*================================*/
+ dtuple_t* entry) /*!< in/out: clustered index entry */
+{
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+ dfield_t* dfield = dtuple_get_nth_field(entry, i);
+
+ if (dfield_is_ext(dfield)) {
+ byte* data = dfield_get_data(dfield);
+ ulint len = dfield_get_len(dfield);
+
+ data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
+ &= ~BTR_EXTERN_OWNER_FLAG;
+ }
+ }
+}
+
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector. We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const upd_t* update, /*!< in: update vector */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint n_pushed = 0;
+ ulint n;
+ const upd_field_t* uf;
+
+ ut_ad(tuple);
+ ut_ad(update);
+
+ uf = update->fields;
+ n = upd_get_n_fields(update);
+
+ for (; n--; uf++) {
+ if (dfield_is_ext(&uf->new_val)) {
+ dfield_t* field
+ = dtuple_get_nth_field(tuple, uf->field_no);
+
+ if (!dfield_is_ext(field)) {
+ dfield_set_ext(field);
+ n_pushed++;
+ }
+
+ switch (uf->orig_len) {
+ byte* data;
+ ulint len;
+ byte* buf;
+ case 0:
+ break;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(field, (byte*) dfield_get_data(field)
+ + dfield_get_len(field)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(field);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+
+ data = dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ buf = mem_heap_alloc(heap, uf->orig_len);
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ uf->orig_len
+ - BTR_EXTERN_FIELD_REF_SIZE);
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(field, buf, uf->orig_len);
+ dfield_set_ext(field);
+ }
+ }
+ }
+
+ return(n_pushed);
+}
+
+/*******************************************************************//**
+Returns the length of the BLOB part stored on the page, as recorded in its
+BLOB header.
+@return part length */
+static
+ulint
+btr_blob_get_part_len(
+/*==================*/
+ const byte* blob_header) /*!< in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
+}
+
+/*******************************************************************//**
+Returns the page number where the next BLOB part is stored.
+@return page number or FIL_NULL if no more pages */
+static
+ulint
+btr_blob_get_next_page_no(
+/*======================*/
+ const byte* blob_header) /*!< in: blob header */
+{
+ return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
+}
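+
+/* The two accessors above read the BLOB header that
+btr_store_big_rec_extern_fields() writes at FIL_PAGE_DATA on every
+uncompressed BLOB page:
+
+ BTR_BLOB_HDR_PART_LEN 4 bytes: length of the BLOB part
+ stored on this page
+ BTR_BLOB_HDR_NEXT_PAGE_NO 4 bytes: next BLOB page, or FIL_NULL
+
+The BLOB data itself follows this BTR_BLOB_HDR_SIZE-byte header, and
+FIL_PAGE_DATA_END bytes are left at the end of the page for the usual page
+trailer. Compressed BLOB pages (FIL_PAGE_TYPE_ZBLOB*) instead carry the zlib
+stream written further below. */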
+
+/*******************************************************************//**
+Deallocate a buffer block that was reserved for a BLOB part. */
+static
+void
+btr_blob_free(
+/*==========*/
+ buf_block_t* block, /*!< in: buffer block */
+ ibool all, /*!< in: TRUE=remove also the compressed page
+ if there is one */
+ mtr_t* mtr) /*!< in: mini-transaction to commit */
+{
+ ulint space = buf_block_get_space(block);
+ ulint page_no = buf_block_get_page_no(block);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+ mtr_commit(mtr);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ mutex_enter(&block->mutex);
+
+ /* Only free the block if it is still allocated to
+ the same file page. */
+
+ if (buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE
+ && buf_block_get_space(block) == space
+ && buf_block_get_page_no(block) == page_no) {
+
+ if (buf_LRU_free_block(&block->page, all, NULL, TRUE)
+ != BUF_LRU_FREED
+ && all && block->page.zip.data
+ /* Now, buf_LRU_free_block() may release mutex temporarily */
+ && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
+ && buf_block_get_space(block) == space
+ && buf_block_get_page_no(block) == page_no) {
+ /* Attempt to deallocate the uncompressed page
+ if the whole block cannot be deallocated. */
+
+ buf_LRU_free_block(&block->page, FALSE, NULL, TRUE);
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(&block->mutex);
+}
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from the leaf node
+file segment of the index tree.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree
+ MUST be X-latched */
+ buf_block_t* rec_block, /*!< in/out: block containing rec */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index);
+ the "external storage" flags in offsets
+ will not correspond to rec when
+ this function returns */
+ big_rec_t* big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr
+ containing the latch to rec and to the
+ tree */
+{
+ ulint rec_page_no;
+ byte* field_ref;
+ ulint extern_len;
+ ulint store_len;
+ ulint page_no;
+ ulint space_id;
+ ulint zip_size;
+ ulint prev_page_no;
+ ulint hint_page_no;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ page_zip_des_t* page_zip;
+ z_stream c_stream;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
+ ut_a(dict_index_is_clust(index));
+
+ page_zip = buf_block_get_page_zip(rec_block);
+ ut_a(dict_table_zip_size(index->table)
+ == buf_block_get_zip_size(rec_block));
+
+ space_id = buf_block_get_space(rec_block);
+ zip_size = buf_block_get_zip_size(rec_block);
+ rec_page_no = buf_block_get_page_no(rec_block);
+ ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ int err;
+
+ /* Zlib deflate needs 128 kilobytes for the default
+ window size, plus 512 << memLevel, plus a few
+ kilobytes for small objects. We use reduced memLevel
+ to limit the memory consumption, and preallocate the
+ heap, hoping to avoid memory fragmentation. */
+ heap = mem_heap_create(250000);
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+ }
+
+ /* We have to create a file segment to the tablespace
+ for each field and put the pointer to the field in rec */
+
+ for (i = 0; i < big_rec_vec->n_fields; i++) {
+ ut_ad(rec_offs_nth_extern(offsets,
+ big_rec_vec->fields[i].field_no));
+ {
+ ulint local_len;
+ field_ref = rec_get_nth_field(
+ rec, offsets, big_rec_vec->fields[i].field_no,
+ &local_len);
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+ field_ref += local_len;
+ }
+ extern_len = big_rec_vec->fields[i].len;
+ UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
+ extern_len);
+
+ ut_a(extern_len > 0);
+
+ prev_page_no = FIL_NULL;
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ int err = deflateReset(&c_stream);
+ ut_a(err == Z_OK);
+
+ c_stream.next_in = (void*) big_rec_vec->fields[i].data;
+ c_stream.avail_in = extern_len;
+ }
+
+ for (;;) {
+ buf_block_t* block;
+ page_t* page;
+
+ mtr_start(&mtr);
+
+ if (prev_page_no == FIL_NULL) {
+ hint_page_no = 1 + rec_page_no;
+ } else {
+ hint_page_no = prev_page_no + 1;
+ }
+
+ block = btr_page_alloc(index, hint_page_no,
+ FSP_NO_DIR, 0, &mtr);
+ if (UNIV_UNLIKELY(block == NULL)) {
+
+ mtr_commit(&mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+ }
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ page_no = buf_block_get_page_no(block);
+ page = buf_block_get_frame(block);
+
+ if (prev_page_no != FIL_NULL) {
+ buf_block_t* prev_block;
+ page_t* prev_page;
+
+ prev_block = buf_page_get(space_id, zip_size,
+ prev_page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(prev_block,
+ SYNC_EXTERN_STORAGE);
+ prev_page = buf_block_get_frame(prev_block);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mlog_write_ulint(
+ prev_page + FIL_PAGE_NEXT,
+ page_no, MLOG_4BYTES, &mtr);
+ memcpy(buf_block_get_page_zip(
+ prev_block)
+ ->data + FIL_PAGE_NEXT,
+ prev_page + FIL_PAGE_NEXT, 4);
+ } else {
+ mlog_write_ulint(
+ prev_page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO,
+ page_no, MLOG_4BYTES, &mtr);
+ }
+
+ }
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ int err;
+ page_zip_des_t* blob_page_zip;
+
+ /* Write FIL_PAGE_TYPE to the redo log
+ separately, before logging any other
+ changes to the page, so that the debug
+ assertions in
+ recv_parse_or_apply_log_rec_body() can
+ be made simpler. Before InnoDB Plugin
+ 1.0.4, the initialization of
+ FIL_PAGE_TYPE was logged as part of
+ the mlog_log_string() below. */
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE,
+ prev_page_no == FIL_NULL
+ ? FIL_PAGE_TYPE_ZBLOB
+ : FIL_PAGE_TYPE_ZBLOB2,
+ MLOG_2BYTES, &mtr);
+
+ c_stream.next_out = page
+ + FIL_PAGE_DATA;
+ c_stream.avail_out
+ = page_zip_get_size(page_zip)
+ - FIL_PAGE_DATA;
+
+ err = deflate(&c_stream, Z_FINISH);
+ ut_a(err == Z_OK || err == Z_STREAM_END);
+ ut_a(err == Z_STREAM_END
+ || c_stream.avail_out == 0);
+
+ /* Write the "next BLOB page" pointer */
+ mlog_write_ulint(page + FIL_PAGE_NEXT,
+ FIL_NULL, MLOG_4BYTES, &mtr);
+ /* Initialize the unused "prev page" pointer */
+ mlog_write_ulint(page + FIL_PAGE_PREV,
+ FIL_NULL, MLOG_4BYTES, &mtr);
+ /* Write a back pointer to the record
+ into the otherwise unused area. This
+ information could be useful in
+ debugging. Later, we might want to
+ implement the possibility to relocate
+ BLOB pages. Then, we would need to be
+ able to adjust the BLOB pointer in the
+ record. We do not store the heap
+ number of the record, because it can
+ change in page_zip_reorganize() or
+ btr_page_reorganize(). However, also
+ the page number of the record may
+ change when B-tree nodes are split or
+ merged. */
+ mlog_write_ulint(page
+ + FIL_PAGE_FILE_FLUSH_LSN,
+ space_id,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(page
+ + FIL_PAGE_FILE_FLUSH_LSN + 4,
+ rec_page_no,
+ MLOG_4BYTES, &mtr);
+
+ /* Zero out the unused part of the page. */
+ memset(page + page_zip_get_size(page_zip)
+ - c_stream.avail_out,
+ 0, c_stream.avail_out);
+ mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
+ page_zip_get_size(page_zip)
+ - FIL_PAGE_FILE_FLUSH_LSN,
+ &mtr);
+ /* Copy the page to compressed storage,
+ because it will be flushed to disk
+ from there. */
+ blob_page_zip = buf_block_get_page_zip(block);
+ ut_ad(blob_page_zip);
+ ut_ad(page_zip_get_size(blob_page_zip)
+ == page_zip_get_size(page_zip));
+ memcpy(blob_page_zip->data, page,
+ page_zip_get_size(page_zip));
+
+ if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+ goto next_zip_page;
+ }
+
+ rec_block = buf_page_get(space_id, zip_size,
+ rec_page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(rec_block,
+ SYNC_NO_ORDER_CHECK);
+
+ if (err == Z_STREAM_END) {
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN, 0);
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_LEN + 4,
+ c_stream.total_in);
+ } else {
+ memset(field_ref + BTR_EXTERN_LEN,
+ 0, 8);
+ }
+
+ if (prev_page_no == FIL_NULL) {
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_SPACE_ID,
+ space_id);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no);
+
+ mach_write_to_4(field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_NEXT);
+ }
+
+ page_zip_write_blob_ptr(
+ page_zip, rec, index, offsets,
+ big_rec_vec->fields[i].field_no, &mtr);
+
+next_zip_page:
+ prev_page_no = page_no;
+
+ /* Commit mtr and release the
+ uncompressed page frame to save memory. */
+ btr_blob_free(block, FALSE, &mtr);
+
+ if (err == Z_STREAM_END) {
+ break;
+ }
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_BLOB,
+ MLOG_2BYTES, &mtr);
+
+ if (extern_len > (UNIV_PAGE_SIZE
+ - FIL_PAGE_DATA
+ - BTR_BLOB_HDR_SIZE
+ - FIL_PAGE_DATA_END)) {
+ store_len = UNIV_PAGE_SIZE
+ - FIL_PAGE_DATA
+ - BTR_BLOB_HDR_SIZE
+ - FIL_PAGE_DATA_END;
+ } else {
+ store_len = extern_len;
+ }
+
+ mlog_write_string(page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_SIZE,
+ (const byte*)
+ big_rec_vec->fields[i].data
+ + big_rec_vec->fields[i].len
+ - extern_len,
+ store_len, &mtr);
+ mlog_write_ulint(page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN,
+ store_len, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO,
+ FIL_NULL, MLOG_4BYTES, &mtr);
+
+ extern_len -= store_len;
+
+ rec_block = buf_page_get(space_id, zip_size,
+ rec_page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(rec_block,
+ SYNC_NO_ORDER_CHECK);
+
+ mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_LEN + 4,
+ big_rec_vec->fields[i].len
+ - extern_len,
+ MLOG_4BYTES, &mtr);
+
+ if (prev_page_no == FIL_NULL) {
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_SPACE_ID,
+ space_id,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ page_no,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_OFFSET,
+ FIL_PAGE_DATA,
+ MLOG_4BYTES, &mtr);
+ }
+
+ prev_page_no = page_no;
+
+ mtr_commit(&mtr);
+
+ if (extern_len == 0) {
+ break;
+ }
+ }
+ }
+ }
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+ }
+
+ return(DB_SUCCESS);
+}
+
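+/* A sketch of the uncompressed BLOB page layout produced above (offsets
+are relative to FIL_PAGE_DATA; the exact BTR_BLOB_HDR_* values live in
+the header files, so treat this picture as approximate):
+
+ BTR_BLOB_HDR_PART_LEN length of the data stored on this page
+ BTR_BLOB_HDR_NEXT_PAGE_NO next BLOB page number, or FIL_NULL
+ BTR_BLOB_HDR_SIZE offset at which the BLOB data begins
+
+The next-page field of the previous page is patched only after a new
+page has been allocated, which is why it is first written as FIL_NULL. */
+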
+/*******************************************************************//**
+Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
+static
+void
+btr_check_blob_fil_page_type(
+/*=========================*/
+ ulint space_id, /*!< in: space id */
+ ulint page_no, /*!< in: page number */
+ const page_t* page, /*!< in: page */
+ ibool read) /*!< in: TRUE=read, FALSE=purge */
+{
+ ulint type = fil_page_get_type(page);
+
+ ut_a(space_id == page_get_space_id(page));
+ ut_a(page_no == page_get_page_no(page));
+
+ if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
+ ulint flags = fil_space_get_flags(space_id);
+
+ if (UNIV_LIKELY
+ ((flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
+ /* Old versions of InnoDB did not initialize
+ FIL_PAGE_TYPE on BLOB pages. Do not print
+ anything about the type mismatch when reading
+ a BLOB page that is in Antelope format. */
+ return;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: FIL_PAGE_TYPE=%lu"
+ " on BLOB %s space %lu page %lu flags %lx\n",
+ (ulong) type, read ? "read" : "purge",
+ (ulong) space_id, (ulong) page_no, (ulong) flags);
+ ut_error;
+ }
+}
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field.
+In a rollback, we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ page_zip_des_t* page_zip, /*!< in: compressed page corresponding
+ to rec, or NULL if rec == NULL */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* local_mtr __attribute__((unused))) /*!< in: mtr
+ containing the latch to data and an
+ X-latch to the index tree */
+{
+ page_t* page;
+ ulint space_id;
+ ulint rec_zip_size = dict_table_zip_size(index->table);
+ ulint ext_zip_size;
+ ulint page_no;
+ ulint next_page_no;
+ mtr_t mtr;
+#ifdef UNIV_DEBUG
+ ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+
+ if (rec) {
+ ulint local_len;
+ const byte* f = rec_get_nth_field(rec, offsets,
+ i, &local_len);
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+ f += local_len;
+ ut_ad(f == field_ref);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* In the rollback of uncommitted transactions, we may
+ encounter a clustered index record whose BLOBs have
+ not been written. There is nothing to free then. */
+ ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
+ return;
+ }
+
+ space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
+
+ if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
+ ext_zip_size = fil_space_get_zip_size(space_id);
+ /* This must be an undo log record in the system tablespace,
+ that is, in row_purge_upd_exist_or_extern().
+ Currently, externally stored records are stored in the
+ same tablespace as the referring records. */
+ ut_ad(!page_get_space_id(page_align(field_ref)));
+ ut_ad(!rec);
+ ut_ad(!page_zip);
+ } else {
+ ext_zip_size = rec_zip_size;
+ }
+
+ if (!rec) {
+ /* This is a call from row_purge_upd_exist_or_extern(). */
+ ut_ad(!page_zip);
+ rec_zip_size = 0;
+ }
+
+ for (;;) {
+ buf_block_t* rec_block;
+ buf_block_t* ext_block;
+
+ mtr_start(&mtr);
+
+ rec_block = buf_page_get(page_get_space_id(
+ page_align(field_ref)),
+ rec_zip_size,
+ page_get_page_no(
+ page_align(field_ref)),
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
+ page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+ if (/* There is no external storage data */
+ page_no == FIL_NULL
+ /* This field does not own the externally stored field */
+ || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_OWNER_FLAG)
+ /* Rollback and inherited field */
+ || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
+ && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+ & BTR_EXTERN_INHERITED_FLAG))) {
+
+ /* Do not free */
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ ext_block = buf_page_get(space_id, ext_zip_size, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(ext_block);
+
+ if (ext_zip_size) {
+ /* Note that page_zip will be NULL
+ in row_purge_upd_exist_or_extern(). */
+ switch (fil_page_get_type(page)) {
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ default:
+ ut_error;
+ }
+ next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+ btr_page_free_low(index, ext_block, 0, &mtr);
+
+ if (UNIV_LIKELY(page_zip != NULL)) {
+ mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no);
+ mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
+ 0);
+ page_zip_write_blob_ptr(page_zip, rec, index,
+ offsets, i, &mtr);
+ } else {
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_PAGE_NO,
+ next_page_no,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(field_ref
+ + BTR_EXTERN_LEN + 4, 0,
+ MLOG_4BYTES, &mtr);
+ }
+ } else {
+ ut_a(!page_zip);
+ btr_check_blob_fil_page_type(space_id, page_no, page,
+ FALSE);
+
+ next_page_no = mach_read_from_4(
+ page + FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO);
+
+ /* We must supply the page level (= 0) as an argument
+ because we did not store it on the page (we save the
+ space overhead from an index page header). */
+
+ btr_page_free_low(index, ext_block, 0, &mtr);
+
+ mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
+ next_page_no,
+ MLOG_4BYTES, &mtr);
+ /* Zero out the BLOB length. If the server
+ crashes during the execution of this function,
+ trx_rollback_or_clean_all_recovered() could
+ dereference the half-deleted BLOB, fetching a
+ wrong prefix for the BLOB. */
+ mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
+ 0,
+ MLOG_4BYTES, &mtr);
+ }
+
+ /* Commit mtr and release the BLOB block to save memory. */
+ btr_blob_free(ext_block, TRUE, &mtr);
+ }
+}
+
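+/* A note on the loop above: the BLOB page chain is freed one page per
+mini-transaction. After each page is freed, the BLOB pointer in the
+record is updated to point at the next page and its stored length is
+zeroed, so that a crash in the middle leaves the pointer describing
+only the still-allocated tail of the chain. */
+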
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the index
+ tree */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+ /* Free possible externally stored fields in the record */
+
+ ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
+ n_fields = rec_offs_n_fields(offsets);
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ byte* data
+ = rec_get_nth_field(rec, offsets, i, &len);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ btr_free_externally_stored_field(
+ index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ rec, offsets, page_zip, i, rb_ctx, mtr);
+ }
+ }
+}
+
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree MUST be
+ X-latched */
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr) /*!< in: mini-transaction handle which contains
+ an X-latch to record page and to the tree */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+
+ /* Free possible externally stored fields in the record */
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ const upd_field_t* ufield = upd_get_nth_field(update, i);
+
+ if (rec_offs_nth_extern(offsets, ufield->field_no)) {
+ ulint len;
+ byte* data = rec_get_nth_field(
+ rec, offsets, ufield->field_no, &len);
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ btr_free_externally_stored_field(
+ index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ rec, offsets, page_zip,
+ ufield->field_no, rb_ctx, mtr);
+ }
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of an uncompressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint space_id,/*!< in: space id of the BLOB pages */
+ ulint page_no,/*!< in: page number of the first BLOB page */
+ ulint offset) /*!< in: offset on the first BLOB page */
+{
+ ulint copied_len = 0;
+
+ for (;;) {
+ mtr_t mtr;
+ buf_block_t* block;
+ const page_t* page;
+ const byte* blob_header;
+ ulint part_len;
+ ulint copy_len;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ page = buf_block_get_frame(block);
+
+ btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
+
+ blob_header = page + offset;
+ part_len = btr_blob_get_part_len(blob_header);
+ copy_len = ut_min(part_len, len - copied_len);
+
+ memcpy(buf + copied_len,
+ blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+ copied_len += copy_len;
+
+ page_no = btr_blob_get_next_page_no(blob_header);
+
+ mtr_commit(&mtr);
+
+ if (page_no == FIL_NULL || copy_len != part_len) {
+ UNIV_MEM_ASSERT_RW(buf, copied_len);
+ return(copied_len);
+ }
+
+ /* On BLOB pages other than the first one, the BLOB header
+ is always at the start of the page data: */
+
+ offset = FIL_PAGE_DATA;
+
+ ut_ad(copied_len <= len);
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of a compressed BLOB. The clustered index record
+that points to this BLOB must be protected by a lock or a page latch. */
+static
+void
+btr_copy_zblob_prefix(
+/*==================*/
+ z_stream* d_stream,/*!< in/out: the decompressing stream */
+ ulint zip_size,/*!< in: compressed BLOB page size */
+ ulint space_id,/*!< in: space id of the BLOB pages */
+ ulint page_no,/*!< in: page number of the first BLOB page */
+ ulint offset) /*!< in: offset on the first BLOB page */
+{
+ ulint page_type = FIL_PAGE_TYPE_ZBLOB;
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+ ut_ad(space_id);
+
+ for (;;) {
+ buf_page_t* bpage;
+ int err;
+ ulint next_page_no;
+
+ /* There is no latch on bpage directly. Instead,
+ bpage is protected by the B-tree page latch that
+ is being held on the clustered index record, or,
+ in row_merge_copy_blobs(), by an exclusive table lock. */
+ bpage = buf_page_get_zip(space_id, zip_size, page_no);
+
+ if (UNIV_UNLIKELY(!bpage)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Cannot load"
+ " compressed BLOB"
+ " page %lu space %lu\n",
+ (ulong) page_no, (ulong) space_id);
+ return;
+ }
+
+ if (UNIV_UNLIKELY
+ (fil_page_get_type(bpage->zip.data) != page_type)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Unexpected type %lu of"
+ " compressed BLOB"
+ " page %lu space %lu\n",
+ (ulong) fil_page_get_type(bpage->zip.data),
+ (ulong) page_no, (ulong) space_id);
+ goto end_of_blob;
+ }
+
+ next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+ if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+ /* When the BLOB begins at page header,
+ the compressed data payload does not
+ immediately follow the next page pointer. */
+ offset = FIL_PAGE_DATA;
+ } else {
+ offset += 4;
+ }
+
+ d_stream->next_in = bpage->zip.data + offset;
+ d_stream->avail_in = zip_size - offset;
+
+ err = inflate(d_stream, Z_NO_FLUSH);
+ switch (err) {
+ case Z_OK:
+ if (!d_stream->avail_out) {
+ goto end_of_blob;
+ }
+ break;
+ case Z_STREAM_END:
+ if (next_page_no == FIL_NULL) {
+ goto end_of_blob;
+ }
+ /* fall through */
+ default:
+inflate_error:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: inflate() of"
+ " compressed BLOB"
+ " page %lu space %lu returned %d (%s)\n",
+ (ulong) page_no, (ulong) space_id,
+ err, d_stream->msg);
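+ /* fall through */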
+ case Z_BUF_ERROR:
+ goto end_of_blob;
+ }
+
+ if (next_page_no == FIL_NULL) {
+ if (!d_stream->avail_in) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unexpected end of"
+ " compressed BLOB"
+ " page %lu space %lu\n",
+ (ulong) page_no,
+ (ulong) space_id);
+ } else {
+ err = inflate(d_stream, Z_FINISH);
+ switch (err) {
+ case Z_STREAM_END:
+ case Z_BUF_ERROR:
+ break;
+ default:
+ goto inflate_error;
+ }
+ }
+
+end_of_blob:
+ buf_page_release_zip(bpage);
+ return;
+ }
+
+ buf_page_release_zip(bpage);
+
+ /* On compressed BLOB pages other than the first one, the
+ next-page pointer is stored in the page header: */
+
+ page_no = next_page_no;
+ offset = FIL_PAGE_NEXT;
+ page_type = FIL_PAGE_TYPE_ZBLOB2;
+ }
+}
+
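+/* A brief sketch of the compressed BLOB page chain read above, as far as
+it can be inferred from this function and from the writer in
+btr_store_big_rec_extern_fields(): the first page has type
+FIL_PAGE_TYPE_ZBLOB and continuation pages have type FIL_PAGE_TYPE_ZBLOB2;
+the pages are linked through FIL_PAGE_NEXT, and the deflate stream is
+stored from FIL_PAGE_DATA onwards on each page. The BLOB pointer in the
+clustered index record stores FIL_PAGE_NEXT as its offset, which is why
+the first iteration reads the next-page number at that offset. */
+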
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record that points to this BLOB must be protected by a
+lock or a page latch.
+@return number of bytes written to buf */
+static
+ulint
+btr_copy_externally_stored_field_prefix_low(
+/*========================================*/
+ byte* buf, /*!< out: the externally stored part of
+ the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint space_id,/*!< in: space id of the first BLOB page */
+ ulint page_no,/*!< in: page number of the first BLOB page */
+ ulint offset) /*!< in: offset on the first BLOB page */
+{
+ if (UNIV_UNLIKELY(len == 0)) {
+ return(0);
+ }
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ int err;
+ z_stream d_stream;
+ mem_heap_t* heap;
+
+ /* Zlib inflate needs 32 kilobytes for the default
+ window size, plus a few kilobytes for small objects. */
+ heap = mem_heap_create(40000);
+ page_zip_set_alloc(&d_stream, heap);
+
+ err = inflateInit(&d_stream);
+ ut_a(err == Z_OK);
+
+ d_stream.next_out = buf;
+ d_stream.avail_out = len;
+ d_stream.avail_in = 0;
+
+ btr_copy_zblob_prefix(&d_stream, zip_size,
+ space_id, page_no, offset);
+ inflateEnd(&d_stream);
+ mem_heap_free(heap);
+ UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
+ return(d_stream.total_out);
+ } else {
+ return(btr_copy_blob_prefix(buf, len, space_id,
+ page_no, offset));
+ }
+}
+
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+ byte* buf, /*!< out: the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint local_len)/*!< in: length of data, in bytes */
+{
+ ulint space_id;
+ ulint page_no;
+ ulint offset;
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(local_len >= len)) {
+ memcpy(buf, data, len);
+ return(len);
+ }
+
+ memcpy(buf, data, local_len);
+ data += local_len;
+
+ ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+ if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
+ /* The externally stored part of the column has been
+ (partially) deleted. Signal the half-deleted BLOB
+ to the caller. */
+
+ return(0);
+ }
+
+ space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
+
+ page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
+
+ offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
+
+ return(local_len
+ + btr_copy_externally_stored_field_prefix_low(buf + local_len,
+ len - local_len,
+ zip_size,
+ space_id, page_no,
+ offset));
+}
+
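+/* A hypothetical call pattern for the function above (the buffer size and
+variable names are illustrative only, not taken from any caller):
+
+ byte prefix_buf[PREFIX_LEN];
+ ulint copied = btr_copy_externally_stored_field_prefix(
+ prefix_buf, PREFIX_LEN, zip_size, field_data, field_local_len);
+
+ if (copied == 0) {
+ ... the externally stored part has been (partially) deleted ...
+ }
+
+where field_data and field_local_len would come from rec_get_nth_field()
+on a clustered index record protected by a lock or a page latch. */
+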
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap. The
+clustered index record must be protected by a lock or a page latch.
+@return the whole field copied to heap */
+static
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+ ulint* len, /*!< out: length of the whole field */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint local_len,/*!< in: length of data */
+ mem_heap_t* heap) /*!< in: mem heap */
+{
+ ulint space_id;
+ ulint page_no;
+ ulint offset;
+ ulint extern_len;
+ byte* buf;
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
+
+ page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
+
+ offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
+
+ /* Currently a BLOB cannot be bigger than 4 GB; we
+ leave the 4 upper bytes in the length field unused */
+
+ extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
+
+ buf = mem_heap_alloc(heap, local_len + extern_len);
+
+ memcpy(buf, data, local_len);
+ *len = local_len
+ + btr_copy_externally_stored_field_prefix_low(buf + local_len,
+ extern_len,
+ zip_size,
+ space_id,
+ page_no, offset);
+
+ return(buf);
+}
+
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+ const rec_t* rec, /*!< in: record in a clustered index;
+ must be protected by a lock or a page latch */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint no, /*!< in: field number */
+ ulint* len, /*!< out: length of the field */
+ mem_heap_t* heap) /*!< in: mem heap */
+{
+ ulint local_len;
+ const byte* data;
+
+ ut_a(rec_offs_nth_extern(offsets, no));
+
+ /* An externally stored field can contain some initial
+ data from the field, and in the last 20 bytes it has the
+ space id, page number, and offset where the rest of the
+ field data is stored, and the data length in addition to
+ the data stored locally. We may need to store some data
+ locally to get the local record length above the 128 byte
+ limit so that field offsets are stored in two bytes, and
+ the extern bit is available in those two bytes. */
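+ /* As a rough picture of those last BTR_EXTERN_FIELD_REF_SIZE
+ (20) bytes, using the BTR_EXTERN_* offsets referenced in this
+ file (see the header files for the authoritative definitions):
+ 4 bytes space id, 4 bytes page number, 4 bytes offset on that
+ page, and 8 bytes of data length, of which only the low 4 bytes
+ are currently used, since a BLOB cannot exceed 4 GB. */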
+
+ data = rec_get_nth_field(rec, offsets, no, &local_len);
+
+ ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY
+ (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
+ field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The externally stored field was not written yet.
+ This record should only be seen by
+ recv_recovery_rollback_active() or any
+ TRX_ISO_READ_UNCOMMITTED transactions. */
+ return(NULL);
+ }
+
+ return(btr_copy_externally_stored_field(len, data,
+ zip_size, local_len, heap));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c
new file mode 100644
index 00000000000..537c26f6bf2
--- /dev/null
+++ b/storage/xtradb/btr/btr0pcur.c
@@ -0,0 +1,606 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.c
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void)
+/*============================*/
+{
+ btr_pcur_t* pcur;
+
+ pcur = mem_alloc(sizeof(btr_pcur_t));
+
+ pcur->btr_cur.index = NULL;
+ btr_pcur_init(pcur);
+
+ return(pcur);
+}
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor) /*!< in, own: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_n_fields = 0;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ mem_free(cursor);
+}
+
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t* page_cursor;
+ buf_block_t* block;
+ rec_t* rec;
+ dict_index_t* index;
+ page_t* page;
+ ulint offs;
+
+ ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ block = btr_pcur_get_block(cursor);
+
+ if (srv_pass_corrupt_table && !block) {
+ return;
+ }
+ ut_a(block);
+
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ page_cursor = btr_pcur_get_page_cur(cursor);
+
+ rec = page_cur_get_rec(page_cursor);
+ page = page_align(rec);
+ offs = page_offset(rec);
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_a(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) {
+ /* It must be an empty index tree; NOTE that in this case
+ we do not store the modify_clock, but always do a search
+ if we restore the cursor position */
+
+ ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+ ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+ if (page_rec_is_supremum_low(offs)) {
+
+ cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
+ }
+
+ return;
+ }
+
+ if (page_rec_is_supremum_low(offs)) {
+
+ rec = page_rec_get_prev(rec);
+
+ cursor->rel_pos = BTR_PCUR_AFTER;
+
+ } else if (page_rec_is_infimum_low(offs)) {
+
+ rec = page_rec_get_next(rec);
+
+ cursor->rel_pos = BTR_PCUR_BEFORE;
+ } else {
+ cursor->rel_pos = BTR_PCUR_ON;
+ }
+
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+ cursor->old_rec = dict_index_copy_rec_order_prefix(
+ index, rec, &cursor->old_n_fields,
+ &cursor->old_rec_buf, &cursor->buf_size);
+
+ cursor->block_when_stored = block;
+ cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate) /*!< in: pcur from which the info is
+ copied */
+{
+ if (pcur_receive->old_rec_buf) {
+ mem_free(pcur_receive->old_rec_buf);
+ }
+
+ ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+ if (pcur_donate->old_rec_buf) {
+
+ pcur_receive->old_rec_buf = mem_alloc(pcur_donate->buf_size);
+
+ ut_memcpy(pcur_receive->old_rec_buf, pcur_donate->old_rec_buf,
+ pcur_donate->buf_size);
+ pcur_receive->old_rec = pcur_receive->old_rec_buf
+ + (pcur_donate->old_rec - pcur_donate->old_rec_buf);
+ }
+
+ pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
+
+/**************************************************************//**
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ dtuple_t* tuple;
+ ulint mode;
+ ulint old_mode;
+ mem_heap_t* heap;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+ if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED)
+ || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED
+ && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) {
+ ut_print_buf(stderr, cursor, sizeof(btr_pcur_t));
+ putc('\n', stderr);
+ if (cursor->trx_if_known) {
+ trx_print(stderr, cursor->trx_if_known, 0);
+ }
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY
+ (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+ || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+
+ /* In these cases we do not try an optimistic restoration,
+ but always do a search */
+
+ btr_cur_open_at_index_side(
+ cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+ index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr);
+
+ cursor->block_when_stored = btr_pcur_get_block(cursor);
+
+ return(FALSE);
+ }
+
+ ut_a(cursor->old_rec);
+ ut_a(cursor->old_n_fields);
+
+ if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF)
+ || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) {
+ /* Try optimistic restoration */
+
+ if (UNIV_LIKELY(buf_page_optimistic_get(
+ latch_mode,
+ cursor->block_when_stored,
+ cursor->modify_clock,
+ file, line, mtr))) {
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ buf_block_dbg_add_level(btr_pcur_get_block(cursor),
+ SYNC_TREE_NODE);
+
+ if (cursor->rel_pos == BTR_PCUR_ON) {
+#ifdef UNIV_DEBUG
+ const rec_t* rec;
+ const ulint* offsets1;
+ const ulint* offsets2;
+#endif /* UNIV_DEBUG */
+ cursor->latch_mode = latch_mode;
+#ifdef UNIV_DEBUG
+ rec = btr_pcur_get_rec(cursor);
+
+ heap = mem_heap_create(256);
+ offsets1 = rec_get_offsets(
+ cursor->old_rec, index, NULL,
+ cursor->old_n_fields, &heap);
+ offsets2 = rec_get_offsets(
+ rec, index, NULL,
+ cursor->old_n_fields, &heap);
+
+ ut_ad(!cmp_rec_rec(cursor->old_rec,
+ rec, offsets1, offsets2,
+ index));
+ mem_heap_free(heap);
+#endif /* UNIV_DEBUG */
+ return(TRUE);
+ }
+
+ return(FALSE);
+ }
+ }
+
+ /* If optimistic restoration did not succeed, open the cursor anew */
+
+ heap = mem_heap_create(256);
+
+ tuple = dict_index_build_data_tuple(index, cursor->old_rec,
+ cursor->old_n_fields, heap);
+
+ /* Save the old search mode of the cursor */
+ old_mode = cursor->search_mode;
+
+ if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) {
+ mode = PAGE_CUR_LE;
+ } else if (cursor->rel_pos == BTR_PCUR_AFTER) {
+ mode = PAGE_CUR_G;
+ } else {
+ ut_ad(cursor->rel_pos == BTR_PCUR_BEFORE);
+ mode = PAGE_CUR_L;
+ }
+
+ btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
+ cursor, 0, file, line, mtr);
+
+ /* Restore the old search mode */
+ cursor->search_mode = old_mode;
+
+ if (cursor->rel_pos == BTR_PCUR_ON
+ && btr_pcur_is_on_user_rec(cursor)
+ && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor),
+ rec_get_offsets(
+ btr_pcur_get_rec(cursor), index,
+ NULL, ULINT_UNDEFINED, &heap))) {
+
+ /* We have to store the NEW value for the modify clock, since
+ the cursor can now be on a different page! But we can retain
+ the value of old_rec */
+
+ cursor->block_when_stored = btr_pcur_get_block(cursor);
+ cursor->modify_clock = buf_block_get_modify_clock(
+ cursor->block_when_stored);
+ cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+
+ mem_heap_free(heap);
+
+ /* We have to store new position information, modify_clock etc.,
+ to the cursor because it can now be on a different page, the record
+ under it may have been removed, etc. */
+
+ btr_pcur_store_position(cursor, mtr);
+
+ return(FALSE);
+}
+
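+/* The typical store/restore pattern around a mini-transaction boundary,
+shown here only as a sketch (btr_pcur_move_backward_from_page() below
+follows the same sequence):
+
+ btr_pcur_store_position(cursor, mtr);
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+ btr_pcur_restore_position(latch_mode, cursor, mtr);
+
+If the optimistic restoration fails, the position is re-established by a
+fresh search, and the return value tells whether the cursor is again on a
+user record with the same ordering fields as the stored one. */
+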
+/**************************************************************//**
+If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF,
+releases the page latch and bufferfix reserved by the cursor.
+NOTE! In the case of BTR_MODIFY_LEAF, there should not exist changes
+made by the current mini-transaction to the data protected by the
+cursor latch, as then the latch must not be released until mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_release_leaf(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ block = btr_pcur_get_block(cursor);
+
+ btr_leaf_page_release(block, cursor->latch_mode, mtr);
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+
+ cursor->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page. Releases the
+latch on the current page, and bufferunfixes it. Note that there must not be
+modifications on the current page, as then the x-latch can be released only in
+mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint next_page_no;
+ ulint space;
+ ulint zip_size;
+ page_t* page;
+ buf_block_t* next_block;
+ page_t* next_page;
+
+ ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_after_last_on_page(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ page = btr_pcur_get_page(cursor);
+ next_page_no = btr_page_get_next(page, mtr);
+ space = buf_block_get_space(btr_pcur_get_block(cursor));
+ zip_size = buf_block_get_zip_size(btr_pcur_get_block(cursor));
+
+ ut_ad(next_page_no != FIL_NULL);
+
+ next_block = btr_block_get(space, zip_size, next_page_no,
+ cursor->latch_mode, mtr);
+ next_page = buf_block_get_frame(next_block);
+
+ if (srv_pass_corrupt_table && !next_page) {
+ btr_leaf_page_release(btr_pcur_get_block(cursor),
+ cursor->latch_mode, mtr);
+ btr_pcur_get_page_cur(cursor)->block = 0;
+ btr_pcur_get_page_cur(cursor)->rec = 0;
+ return;
+ }
+ ut_a(next_page);
+#ifdef UNIV_BTR_DEBUG
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+ ut_a(btr_page_get_prev(next_page, mtr)
+ == buf_block_get_page_no(btr_pcur_get_block(cursor)));
+#endif /* UNIV_BTR_DEBUG */
+ next_block->check_index_page_at_flush = TRUE;
+
+ btr_leaf_page_release(btr_pcur_get_block(cursor),
+ cursor->latch_mode, mtr);
+
+ page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor));
+
+ page_check_dir(next_page);
+}
+
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record of the page.
+Commits mtr. Note that to prevent a possible deadlock, the operation
+first stores the position of the cursor, commits mtr, acquires the necessary
+latches and restores the cursor position again before returning. The
+alphabetical position of the cursor is guaranteed to be sensible on
+return, but it may happen that the cursor is not positioned on the last
+record of any page, because the structure of the tree may have changed
+during the time when the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the first
+ record of the current page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint prev_page_no;
+ ulint space;
+ page_t* page;
+ buf_block_t* prev_block;
+ ulint latch_mode;
+ ulint latch_mode2;
+
+ ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ ut_ad(btr_pcur_is_before_first_on_page(cursor));
+ ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr));
+
+ latch_mode = cursor->latch_mode;
+
+ if (latch_mode == BTR_SEARCH_LEAF) {
+
+ latch_mode2 = BTR_SEARCH_PREV;
+
+ } else if (latch_mode == BTR_MODIFY_LEAF) {
+
+ latch_mode2 = BTR_MODIFY_PREV;
+ } else {
+ latch_mode2 = 0; /* To eliminate compiler warning */
+ ut_error;
+ }
+
+ btr_pcur_store_position(cursor, mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(latch_mode2, cursor, mtr);
+
+ page = btr_pcur_get_page(cursor);
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+ space = buf_block_get_space(btr_pcur_get_block(cursor));
+
+ if (prev_page_no == FIL_NULL) {
+ } else if (btr_pcur_is_before_first_on_page(cursor)) {
+
+ prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+ btr_leaf_page_release(btr_pcur_get_block(cursor),
+ latch_mode, mtr);
+
+ page_cur_set_after_last(prev_block,
+ btr_pcur_get_page_cur(cursor));
+ } else {
+
+ /* The repositioned cursor did not end on an infimum record on
+ a page. Cursor repositioning acquired a latch also on the
+ previous page, but we do not need the latch: release it. */
+
+ prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+ btr_leaf_page_release(prev_block, latch_mode, mtr);
+ }
+
+ cursor->latch_mode = latch_mode;
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ if (btr_pcur_is_before_first_on_page(cursor)) {
+
+ if (btr_pcur_is_before_first_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_backward_from_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_prev_on_page(cursor);
+
+ return(TRUE);
+}
+
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
+ file, line, mtr);
+
+ if ((mode == PAGE_CUR_GE) || (mode == PAGE_CUR_G)) {
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ btr_pcur_move_to_next_user_rec(cursor, mtr);
+ }
+ } else {
+ ut_ad((mode == PAGE_CUR_LE) || (mode == PAGE_CUR_L));
+
+ /* Not implemented yet */
+
+ ut_error;
+ }
+}
diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c
new file mode 100644
index 00000000000..6628333d32a
--- /dev/null
+++ b/storage/xtradb/btr/btr0sea.c
@@ -0,0 +1,2032 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file btr/btr0sea.c
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "btr0sea.h"
+#ifdef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "ha0ha.h"
+#include "srv0srv.h"
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+UNIV_INTERN char btr_search_enabled = TRUE;
+UNIV_INTERN ibool btr_search_fully_disabled = FALSE;
+
+/** Mutex protecting btr_search_enabled */
+static mutex_t btr_search_enabled_mutex;
+
+/** A dummy variable to fool the compiler */
+UNIV_INTERN ulint btr_search_this_is_zero = 0;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+UNIV_INTERN ulint btr_search_n_succ = 0;
+/** Number of failed adaptive hash index lookups */
+UNIV_INTERN ulint btr_search_n_hash_fail = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** padding to prevent other memory update
+hotspots from residing on the same memory
+cache line as btr_search_latch */
+UNIV_INTERN byte btr_sea_pad1[64];
+
+/** The latch protecting the adaptive search system: this latch protects
+(1) the positions of records on those pages where a hash index has been built.
+NOTE: It does not protect values of non-ordering fields within a record from
+being updated in-place! We can use fact (1) to perform unique searches on
+indexes. */
+
+/* We will allocate the latch from dynamic memory to get it to the
+same DRAM page as other hotspot semaphores */
+UNIV_INTERN rw_lock_t* btr_search_latch_temp;
+
+/** padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte btr_sea_pad2[64];
+
+/** The adaptive hash index */
+UNIV_INTERN btr_search_sys_t* btr_search_sys;
+
+/** A hash index is built on a page when the number of potentially
+successful hash accesses to the page exceeds the number of records on the
+page divided by this parameter, assuming the global limit below is reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT 16
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT 100
+
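+/* With the values above, btr_search_update_block_hash_info() recommends
+building a hash index on a page roughly as follows (an illustrative
+example, not a normative rule): for a leaf page holding 320 records, more
+than 320 / BTR_SEARCH_PAGE_BUILD_LIMIT = 20 searches must have landed on
+the page with matching hash parameters, and the index's search info must
+have accumulated n_hash_potential >= BTR_SEARCH_BUILD_LIMIT = 100
+potentially successful hash searches. */
+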
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+ dict_index_t* index, /*!< in: index for which to build, or NULL if
+ not known */
+ buf_block_t* block, /*!< in: index page, s- or x-latched */
+ ulint n_fields,/*!< in: hash this many full fields */
+ ulint n_bytes,/*!< in: hash this many bytes from the next
+ field */
+ ibool left_side);/*!< in: hash for searches from left side? */
+
+/*****************************************************************//**
+This function should be called before reserving any btr search mutex, if
+the intended operation might add nodes to the search system hash table.
+Because of the latching order, once we have reserved the btr search system
+latch, we cannot allocate a free frame from the buffer pool. Checks that
+there is a free buffer frame allocated for hash table heap in the btr search
+system. If not, allocates a free frame for the heap. This check makes it
+probable that, when we have reserved the btr search system latch and we need to
+allocate a new node to the hash table, it will succeed. However, the check
+will not guarantee success. */
+static
+void
+btr_search_check_free_space_in_heap(void)
+/*=====================================*/
+{
+ hash_table_t* table;
+ mem_heap_t* heap;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = btr_search_sys->hash_index;
+
+ heap = table->heap;
+
+ /* Note that we peek the value of heap->free_block without reserving
+ the latch: this is ok, because we will not guarantee that there will
+ be enough free space in the hash table. */
+
+ if (heap->free_block == NULL) {
+ buf_block_t* block = buf_block_alloc(0);
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (heap->free_block == NULL) {
+ heap->free_block = block;
+ } else {
+ buf_block_free(block);
+ }
+
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+}
+
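+/* Note on the pattern above: the buffer frame is allocated before
+x-locking btr_search_latch, because of the latching order described in
+the function comment; the free_block slot is then re-checked under the
+latch, and the freshly allocated block is simply returned to the buffer
+pool if another thread has already installed one. */
+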
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+ ulint hash_size) /*!< in: hash index hash table size */
+{
+ /* We allocate the search latch from dynamic memory:
+ see above at the global variable definition */
+
+ btr_search_latch_temp = mem_alloc(sizeof(rw_lock_t));
+
+ rw_lock_create(&btr_search_latch, SYNC_SEARCH_SYS);
+ mutex_create(&btr_search_enabled_mutex, SYNC_SEARCH_SYS_CONF);
+
+ btr_search_sys = mem_alloc(sizeof(btr_search_sys_t));
+
+ btr_search_sys->hash_index = ha_create(hash_size, 0, 0);
+}
+
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void)
+/*=====================*/
+{
+ rw_lock_free(&btr_search_latch);
+ mem_free(btr_search_latch_temp);
+ btr_search_latch_temp = NULL;
+ mem_heap_free(btr_search_sys->hash_index->heap);
+ hash_table_free(btr_search_sys->hash_index);
+ mem_free(btr_search_sys);
+ btr_search_sys = NULL;
+}
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void)
+/*====================*/
+{
+ mutex_enter(&btr_search_enabled_mutex);
+ rw_lock_x_lock(&btr_search_latch);
+
+ /* Disable access to the hash index. Also tell ha_insert_for_fold()
+ to stop adding new nodes to the hash index, but still allow
+ updating existing nodes */
+ btr_search_enabled = FALSE;
+
+ /* Clear all block->is_hashed flags and remove all entries
+ from btr_search_sys->hash_index. */
+ buf_pool_drop_hash_index();
+
+ /* The hash index has been cleaned up; disallow any further
+ operations on it */
+ btr_search_fully_disabled = TRUE;
+
+ /* btr_search_enabled_mutex should guarantee this. */
+ ut_ad(!btr_search_enabled);
+
+ rw_lock_x_unlock(&btr_search_latch);
+ mutex_exit(&btr_search_enabled_mutex);
+}
+
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void)
+/*====================*/
+{
+ mutex_enter(&btr_search_enabled_mutex);
+ rw_lock_x_lock(&btr_search_latch);
+
+ btr_search_enabled = TRUE;
+ btr_search_fully_disabled = FALSE;
+
+ rw_lock_x_unlock(&btr_search_latch);
+ mutex_exit(&btr_search_enabled_mutex);
+}
+
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ btr_search_t* info;
+
+ info = mem_heap_alloc(heap, sizeof(btr_search_t));
+
+#ifdef UNIV_DEBUG
+ info->magic_n = BTR_SEARCH_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+ info->ref_count = 0;
+ info->root_guess = NULL;
+
+ info->hash_analysis = 0;
+ info->n_hash_potential = 0;
+
+ info->last_hash_succ = FALSE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ = 0;
+ info->n_hash_fail = 0;
+ info->n_patt_succ = 0;
+ info->n_searches = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ /* Set some sensible values */
+ info->n_fields = 1;
+ info->n_bytes = 0;
+
+ info->left_side = TRUE;
+
+ return(info);
+}
+
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+ btr_search_t* info) /*!< in: search info. */
+{
+ ulint ret;
+
+ ut_ad(info);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_lock(&btr_search_latch);
+ ret = info->ref_count;
+ rw_lock_s_unlock(&btr_search_latch);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Updates the search info of an index about hash successes. NOTE that info
+is NOT protected by any semaphore, to save CPU time! Do not assume its fields
+are consistent. */
+static
+void
+btr_search_info_update_hash(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ dict_index_t* index;
+ ulint n_unique;
+ int cmp;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ index = cursor->index;
+
+ if (dict_index_is_ibuf(index)) {
+ /* So many deletes are performed on an insert buffer tree
+ that we do not consider a hash index useful on it: */
+
+ return;
+ }
+
+ n_unique = dict_index_get_n_unique_in_tree(index);
+
+ if (info->n_hash_potential == 0) {
+
+ goto set_new_recomm;
+ }
+
+ /* Test if the search would have succeeded using the recommended
+ hash prefix */
+
+ if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+ info->n_hash_potential++;
+
+ return;
+ }
+
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->low_match, cursor->low_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto set_new_recomm;
+ }
+
+ cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+ cursor->up_match, cursor->up_bytes);
+
+ if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+ goto increment_potential;
+ }
+
+set_new_recomm:
+ /* We have to set a new recommendation; skip the hash analysis
+ for a while to avoid unnecessary CPU time usage when there is no
+ chance for success */
+
+ info->hash_analysis = 0;
+
+ cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+ cursor->low_match, cursor->low_bytes);
+ if (cmp == 0) {
+ info->n_hash_potential = 0;
+
+ /* For extra safety, we set some sensible values here */
+
+ info->n_fields = 1;
+ info->n_bytes = 0;
+
+ info->left_side = TRUE;
+
+ } else if (cmp > 0) {
+ info->n_hash_potential = 1;
+
+ if (cursor->up_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+
+ } else if (cursor->low_match < cursor->up_match) {
+
+ info->n_fields = cursor->low_match + 1;
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = cursor->low_match;
+ info->n_bytes = cursor->low_bytes + 1;
+ }
+
+ info->left_side = TRUE;
+ } else {
+ info->n_hash_potential = 1;
+
+ if (cursor->low_match >= n_unique) {
+
+ info->n_fields = n_unique;
+ info->n_bytes = 0;
+
+ } else if (cursor->low_match > cursor->up_match) {
+
+ info->n_fields = cursor->up_match + 1;
+ info->n_bytes = 0;
+ } else {
+ info->n_fields = cursor->up_match;
+ info->n_bytes = cursor->up_bytes + 1;
+ }
+
+ info->left_side = FALSE;
+ }
+}
+
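+/* A small worked example of the recommendation logic above (illustrative
+numbers only): suppose a search ends with low_match = 2, low_bytes = 0,
+up_match = 3, up_bytes = 0, and up_match is smaller than
+dict_index_get_n_unique_in_tree(). Then cmp > 0 in the set_new_recomm
+branch, so the code recommends hashing n_fields = low_match + 1 = 3 full
+fields with n_bytes = 0 and sets left_side to TRUE. */
+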
+/*********************************************************************//**
+Updates the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+@return TRUE if building a (new) hash index on the block is recommended */
+static
+ibool
+btr_search_update_block_hash_info(
+/*==============================*/
+ btr_search_t* info, /*!< in: search info */
+ buf_block_t* block, /*!< in: buffer block */
+ btr_cur_t* cursor __attribute__((unused)))
+ /*!< in: cursor */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+ ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED)
+ || rw_lock_own(&block->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(cursor);
+
+ info->last_hash_succ = FALSE;
+
+ ut_a(buf_block_state_valid(block));
+ ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
+
+ if ((block->n_hash_helps > 0)
+ && (info->n_hash_potential > 0)
+ && (block->n_fields == info->n_fields)
+ && (block->n_bytes == info->n_bytes)
+ && (block->left_side == info->left_side)) {
+
+ if ((block->is_hashed)
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)) {
+
+ /* The search would presumably have succeeded using
+ the hash index */
+
+ info->last_hash_succ = TRUE;
+ }
+
+ block->n_hash_helps++;
+ } else {
+ block->n_hash_helps = 1;
+ block->n_fields = info->n_fields;
+ block->n_bytes = info->n_bytes;
+ block->left_side = info->left_side;
+ }
+
+#ifdef UNIV_DEBUG
+ if (cursor->index->table->does_not_fit_in_memory) {
+ block->n_hash_helps = 0;
+ }
+#endif /* UNIV_DEBUG */
+
+ if ((block->n_hash_helps > page_get_n_recs(block->frame)
+ / BTR_SEARCH_PAGE_BUILD_LIMIT)
+ && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) {
+
+ if ((!block->is_hashed)
+ || (block->n_hash_helps
+ > 2 * page_get_n_recs(block->frame))
+ || (block->n_fields != block->curr_n_fields)
+ || (block->n_bytes != block->curr_n_bytes)
+ || (block->left_side != block->curr_left_side)) {
+
+ /* Build a new hash index on the page */
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
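+
+/* Illustrative sketch -- not part of this patch.  It restates the build
+condition tested above with plain integers.  The limits assumed below,
+BTR_SEARCH_PAGE_BUILD_LIMIT == 16 and BTR_SEARCH_BUILD_LIMIT == 100, are
+the usual values from btr0sea.h and should be treated as assumptions of
+the sketch, not as definitions. */
+static int
+ahi_build_recommended_sketch(
+/*=========================*/
+	unsigned long	n_hash_helps,	/* searches this block has helped */
+	unsigned long	n_recs_on_page,	/* records currently on the page */
+	unsigned long	n_hash_potential)/* index-wide run of searches that
+					could have used the same prefix */
+{
+	/* e.g. a page holding 320 records needs more than 320 / 16 = 20
+	helpful searches, and the index as a whole must have seen at least
+	100 potentially hash-assisted searches in a row */
+
+	return(n_hash_helps > n_recs_on_page / 16
+	       && n_hash_potential >= 100);
+}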
+
+/*********************************************************************//**
+Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index. */
+static
+void
+btr_search_update_hash_ref(
+/*=======================*/
+ btr_search_t* info, /*!< in: search info */
+ buf_block_t* block, /*!< in: buffer block where cursor positioned */
+ btr_cur_t* cursor) /*!< in: cursor */
+{
+ ulint fold;
+ rec_t* rec;
+ dulint index_id;
+
+ ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(page_align(btr_cur_get_rec(cursor))
+ == buf_block_get_frame(block));
+
+ if (!block->is_hashed) {
+
+ return;
+ }
+
+ ut_a(block->index == cursor->index);
+ ut_a(!dict_index_is_ibuf(cursor->index));
+
+ if ((info->n_hash_potential > 0)
+ && (block->curr_n_fields == info->n_fields)
+ && (block->curr_n_bytes == info->n_bytes)
+ && (block->curr_left_side == info->left_side)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return;
+ }
+
+ index_id = cursor->index->id;
+ fold = rec_fold(rec,
+ rec_get_offsets(rec, cursor->index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields,
+ block->curr_n_bytes, index_id);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ha_insert_for_fold(btr_search_sys->hash_index, fold,
+ block, rec);
+ }
+}
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ buf_block_t* block;
+ ibool build_index;
+ ulint* params;
+ ulint* params2;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block = btr_cur_get_block(cursor);
+
+ if (srv_pass_corrupt_table && !block) {
+ return;
+ }
+ ut_a(block);
+
+ /* NOTE that the following two function calls do NOT protect
+ info or block->n_fields etc. with any semaphore, to save CPU time!
+ We cannot assume the fields are consistent when we return from
+ those functions! */
+
+ btr_search_info_update_hash(info, cursor);
+
+ build_index = btr_search_update_block_hash_info(info, block, cursor);
+
+ if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) {
+
+ btr_search_check_free_space_in_heap();
+ }
+
+ if (cursor->flag == BTR_CUR_HASH_FAIL) {
+ /* Update the hash node reference, if appropriate */
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_hash_fail++;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ btr_search_update_hash_ref(info, block, cursor);
+
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+
+ if (build_index) {
+ /* Note that since we did not protect block->n_fields etc.
+ with any semaphore, the values can be inconsistent. We have
+ to check inside the function call that they make sense. We
+ also malloc an array and store the values there to make sure
+ the compiler does not let the function call parameters change
+ inside the called function. It might be that the compiler
+ would optimize the call just to pass pointers to block. */
+
+ params = mem_alloc(3 * sizeof(ulint));
+ params[0] = block->n_fields;
+ params[1] = block->n_bytes;
+ params[2] = block->left_side;
+
+ /* Make sure the compiler cannot deduce the values and do
+ optimizations */
+
+ params2 = params + btr_search_this_is_zero;
+
+ btr_search_build_page_hash_index(cursor->index,
+ block,
+ params2[0],
+ params2[1],
+ params2[2]);
+ mem_free(params);
+ }
+}
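+
+/* Illustrative sketch -- not part of this patch.  It shows, with invented
+names, the generic "read through a launder offset" idiom used above: the
+racy fields are copied into a scratch array (the code above uses
+mem_alloc()) and the array is then read through a pointer displaced by a
+global that is always zero (btr_search_this_is_zero above), so the
+compiler cannot prove that the copies still equal the originals and
+re-read block->n_fields etc. inside the callee. */
+extern unsigned long	sketch_always_zero;	/* assumed to remain 0 */
+
+static void
+sketch_call_with_snapshot(
+/*======================*/
+	void		(*fn)(unsigned long, unsigned long),
+	unsigned long	racy_a,		/* e.g. block->n_fields */
+	unsigned long	racy_b)		/* e.g. block->n_bytes */
+{
+	unsigned long	snap[2];
+	unsigned long*	p;
+
+	snap[0] = racy_a;
+	snap[1] = racy_b;
+
+	p = snap + sketch_always_zero;	/* defeats constant propagation */
+
+	fn(p[0], p[1]);
+}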
+
+/******************************************************************//**
+Checks if a guessed position for a tree cursor is right. Note that if
+mode is PAGE_CUR_LE, which is used in inserts, and the function returns
+TRUE, then cursor->up_match and cursor->low_match both have sensible values.
+@return TRUE if success */
+static
+ibool
+btr_search_check_guess(
+/*===================*/
+ btr_cur_t* cursor, /*!< in: guessed cursor position */
+ ibool can_only_compare_to_cursor_rec,
+ /*!< in: if we do not have a latch on the page
+ of cursor, but only a latch on
+ btr_search_latch, then ONLY the columns
+ of the record UNDER the cursor are
+ protected, not the next or previous record
+ in the chain: we cannot look at the next or
+ previous record to check our guess! */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
+ or PAGE_CUR_GE */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ rec_t* rec;
+ ulint n_unique;
+ ulint match;
+ ulint bytes;
+ int cmp;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ibool success = FALSE;
+ rec_offs_init(offsets_);
+
+ n_unique = dict_index_get_n_unique_in_tree(cursor->index);
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ match = 0;
+ bytes = 0;
+
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ n_unique, &heap);
+ cmp = page_cmp_dtuple_rec_with_match(tuple, rec,
+ offsets, &match, &bytes);
+
+ if (mode == PAGE_CUR_GE) {
+ if (cmp == 1) {
+ goto exit_func;
+ }
+
+ cursor->up_match = match;
+
+ if (match >= n_unique) {
+ success = TRUE;
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_LE) {
+ if (cmp == -1) {
+ goto exit_func;
+ }
+
+ cursor->low_match = match;
+
+ } else if (mode == PAGE_CUR_G) {
+ if (cmp != -1) {
+ goto exit_func;
+ }
+ } else if (mode == PAGE_CUR_L) {
+ if (cmp != 1) {
+ goto exit_func;
+ }
+ }
+
+ if (can_only_compare_to_cursor_rec) {
+ /* Since we could not determine if our guess is right just by
+ looking at the record under the cursor, return FALSE */
+ goto exit_func;
+ }
+
+ match = 0;
+ bytes = 0;
+
+ if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) {
+ rec_t* prev_rec;
+
+ ut_ad(!page_rec_is_infimum(rec));
+
+ prev_rec = page_rec_get_prev(rec);
+
+ if (page_rec_is_infimum(prev_rec)) {
+ success = btr_page_get_prev(page_align(prev_rec), mtr)
+ == FIL_NULL;
+
+ goto exit_func;
+ }
+
+ offsets = rec_get_offsets(prev_rec, cursor->index, offsets,
+ n_unique, &heap);
+ cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec,
+ offsets, &match, &bytes);
+ if (mode == PAGE_CUR_GE) {
+ success = cmp == 1;
+ } else {
+ success = cmp != -1;
+ }
+
+ goto exit_func;
+ } else {
+ rec_t* next_rec;
+
+ ut_ad(!page_rec_is_supremum(rec));
+
+ next_rec = page_rec_get_next(rec);
+
+ if (page_rec_is_supremum(next_rec)) {
+ if (btr_page_get_next(page_align(next_rec), mtr)
+ == FIL_NULL) {
+
+ cursor->up_match = 0;
+ success = TRUE;
+ }
+
+ goto exit_func;
+ }
+
+ offsets = rec_get_offsets(next_rec, cursor->index, offsets,
+ n_unique, &heap);
+ cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec,
+ offsets, &match, &bytes);
+ if (mode == PAGE_CUR_LE) {
+ success = cmp == -1;
+ cursor->up_match = match;
+ } else {
+ success = cmp != 1;
+ }
+ }
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ btr_search_t* info, /*!< in: index search info */
+ const dtuple_t* tuple, /*!< in: logical record */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that only if has_search_latch
+ is 0, we will have a latch set on
+ the cursor page, otherwise we assume
+ the caller uses his search latch
+ to protect the record! */
+ btr_cur_t* cursor, /*!< out: tree cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, RW_X_LATCH, or 0 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ rec_t* rec;
+ ulint fold;
+ dulint index_id;
+#ifdef notdefined
+ btr_cur_t cursor2;
+ btr_pcur_t pcur;
+#endif
+ ut_ad(index && info && tuple && cursor && mtr);
+ ut_ad((latch_mode == BTR_SEARCH_LEAF)
+ || (latch_mode == BTR_MODIFY_LEAF));
+
+ /* Note that, for efficiency, the struct info may not be protected by
+ any latch here! */
+
+ if (UNIV_UNLIKELY(info->n_hash_potential == 0)) {
+
+ return(FALSE);
+ }
+
+ cursor->n_fields = info->n_fields;
+ cursor->n_bytes = info->n_bytes;
+
+ if (UNIV_UNLIKELY(dtuple_get_n_fields(tuple)
+ < cursor->n_fields + (cursor->n_bytes > 0))) {
+
+ return(FALSE);
+ }
+
+ index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_succ++;
+#endif
+ fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+ cursor->fold = fold;
+ cursor->flag = BTR_CUR_HASH;
+
+ if (UNIV_LIKELY(!has_search_latch)) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ if (UNIV_UNLIKELY(!btr_search_enabled)) {
+ goto failure_unlock;
+ }
+ }
+
+ ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX);
+ ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0);
+
+ rec = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+
+ if (UNIV_UNLIKELY(!rec)) {
+ goto failure_unlock;
+ }
+
+ block = buf_block_align(rec);
+
+ if (UNIV_LIKELY(!has_search_latch)) {
+
+ if (UNIV_UNLIKELY(
+ !buf_page_get_known_nowait(latch_mode, block,
+ BUF_MAKE_YOUNG,
+ __FILE__, __LINE__,
+ mtr))) {
+ goto failure_unlock;
+ }
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+ }
+
+ if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
+
+ if (UNIV_LIKELY(!has_search_latch)) {
+
+ btr_leaf_page_release(block, latch_mode, mtr);
+ }
+
+ goto failure;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ btr_cur_position(index, rec, block, cursor);
+
+ /* Check the validity of the guess within the page */
+
+ /* If we only have the latch on btr_search_latch, not on the
+ page, it only protects the columns of the record the cursor
+	is positioned on. We cannot look at the next or the previous
+ record to determine if our guess for the cursor position is
+ right. */
+ if (UNIV_EXPECT
+ (ut_dulint_cmp(index_id, btr_page_get_index_id(block->frame)), 0)
+ || !btr_search_check_guess(cursor,
+ has_search_latch,
+ tuple, mode, mtr)) {
+ if (UNIV_LIKELY(!has_search_latch)) {
+ btr_leaf_page_release(block, latch_mode, mtr);
+ }
+
+ goto failure;
+ }
+
+ if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) {
+
+ info->n_hash_potential++;
+ }
+
+#ifdef notdefined
+ /* These lines of code can be used in a debug version to check
+ the correctness of the searched cursor position: */
+
+ info->last_hash_succ = FALSE;
+
+ /* Currently, does not work if the following fails: */
+ ut_ad(!has_search_latch);
+
+ btr_leaf_page_release(block, latch_mode, mtr);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ &cursor2, 0, mtr);
+ if (mode == PAGE_CUR_GE
+ && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) {
+
+ /* If mode is PAGE_CUR_GE, then the binary search
+ in the index tree may actually take us to the supremum
+ of the previous page */
+
+ info->last_hash_succ = FALSE;
+
+ btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode,
+ &pcur, mtr);
+ ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
+ } else {
+ ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
+ }
+
+ /* NOTE that it is theoretically possible that the above assertions
+ fail if the page of the cursor gets removed from the buffer pool
+ meanwhile! Thus it might not be a bug. */
+#endif
+ info->last_hash_succ = TRUE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ btr_search_n_succ++;
+#endif
+ if (UNIV_LIKELY(!has_search_latch)
+ && buf_page_peek_if_too_old(&block->page)) {
+
+ buf_page_make_young(&block->page);
+ }
+
+ /* Increment the page get statistics though we did not really
+ fix the page: for user info only */
+
+ buf_pool->stat.n_page_gets++;
+
+ return(TRUE);
+
+ /*-------------------------------------------*/
+failure_unlock:
+ if (UNIV_LIKELY(!has_search_latch)) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+failure:
+ cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ info->n_hash_fail++;
+
+ if (info->n_hash_succ > 0) {
+ info->n_hash_succ--;
+ }
+#endif
+ info->last_hash_succ = FALSE;
+
+ return(FALSE);
+}
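+
+/* Illustrative sketch -- not part of this patch.  The guess above boils
+down to three steps: fold the first n_fields full fields plus n_bytes of
+the next field of the search tuple together with the index id, probe
+btr_search_sys->hash_index with that fold, and verify the guessed record
+within its page.  The toy function below only stands in for
+dtuple_fold()/rec_fold() to show the shape of the fold computation; it is
+not the real folding function. */
+static unsigned long
+sketch_prefix_fold(
+/*===============*/
+	const unsigned char*	prefix,	/* concatenated prefix bytes of the
+					tuple (an assumption of the sketch) */
+	unsigned long		len,	/* number of prefix bytes hashed */
+	unsigned long		index_id)
+{
+	unsigned long	fold = index_id;
+	unsigned long	i;
+
+	for (i = 0; i < len; i++) {
+		fold = fold * 31 + prefix[i];	/* toy mixing step */
+	}
+
+	return(fold);
+}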
+
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+ buf_block_t* block) /*!< in: block containing index page,
+ s- or x-latched, or an index page
+ for which we know that
+ block->buf_fix_count == 0 */
+{
+ hash_table_t* table;
+ ulint n_fields;
+ ulint n_bytes;
+ const page_t* page;
+ const rec_t* rec;
+ ulint fold;
+ ulint prev_fold;
+ dulint index_id;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ ulint i;
+ mem_heap_t* heap;
+ const dict_index_t* index;
+ ulint* offsets;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+retry:
+ rw_lock_s_lock(&btr_search_latch);
+ page = block->frame;
+
+ if (UNIV_LIKELY(!block->is_hashed)) {
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ return;
+ }
+
+ table = btr_search_sys->hash_index;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EX)
+ || (block->page.buf_fix_count == 0));
+#endif /* UNIV_SYNC_DEBUG */
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ index = block->index;
+ ut_a(!dict_index_is_ibuf(index));
+
+ /* NOTE: The fields of block must not be accessed after
+ releasing btr_search_latch, as the index page might only
+ be s-latched! */
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ ut_a(n_fields + n_bytes > 0);
+
+ n_recs = page_get_n_recs(page);
+
+ /* Calculate and cache fold values into an array for fast deletion
+ from the hash index */
+
+ folds = mem_alloc(n_recs * sizeof(ulint));
+
+ n_cached = 0;
+
+ rec = page_get_infimum_rec(page);
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+
+ index_id = btr_page_get_index_id(page);
+
+ ut_a(0 == ut_dulint_cmp(index_id, index->id));
+
+ prev_fold = 0;
+
+ heap = NULL;
+ offsets = NULL;
+
+ while (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+ if (fold == prev_fold && prev_fold != 0) {
+
+ goto next_rec;
+ }
+
+ /* Remove all hash nodes pointing to this page from the
+ hash chain */
+
+ folds[n_cached] = fold;
+ n_cached++;
+next_rec:
+ rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+ prev_fold = fold;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (UNIV_UNLIKELY(!block->is_hashed)) {
+ /* Someone else has meanwhile dropped the hash index */
+
+ goto cleanup;
+ }
+
+ ut_a(block->index == index);
+
+ if (UNIV_UNLIKELY(block->curr_n_fields != n_fields)
+ || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) {
+
+ /* Someone else has meanwhile built a new hash index on the
+ page, with different parameters */
+
+ rw_lock_x_unlock(&btr_search_latch);
+
+ mem_free(folds);
+ goto retry;
+ }
+
+ for (i = 0; i < n_cached; i++) {
+
+ ha_remove_all_nodes_to_page(table, folds[i], page);
+ }
+
+ ut_a(index->search_info->ref_count > 0);
+ index->search_info->ref_count--;
+
+ block->is_hashed = FALSE;
+ block->index = NULL;
+
+cleanup:
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (UNIV_UNLIKELY(block->n_pointers)) {
+ /* Corruption */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Corruption of adaptive hash index."
+ " After dropping\n"
+ "InnoDB: the hash index to a page of %s,"
+ " still %lu hash nodes remain.\n",
+ index->name, (ulong) block->n_pointers);
+ rw_lock_x_unlock(&btr_search_latch);
+
+ btr_search_validate();
+ } else {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ rw_lock_x_unlock(&btr_search_latch);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ mem_free(folds);
+}
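+
+/* Illustrative sketch -- not part of this patch.  Because the records on
+a page are stored in key order, records sharing the hashed prefix are
+adjacent and produce equal fold values, so the collection loop above only
+has to compare each fold with the previous one to avoid caching
+duplicates.  The helper below shows the same deduplication on a plain
+array of folds; names are invented. */
+static unsigned long
+sketch_collect_unique_folds(
+/*========================*/
+	const unsigned long*	folds_in,	/* folds in page order */
+	unsigned long		n,		/* number of input folds */
+	unsigned long*		folds_out)	/* out: at least n slots */
+{
+	unsigned long	n_cached = 0;
+	unsigned long	prev_fold = 0;
+	unsigned long	i;
+
+	for (i = 0; i < n; i++) {
+		if (folds_in[i] != prev_fold || prev_fold == 0) {
+			folds_out[n_cached++] = folds_in[i];
+		}
+
+		prev_fold = folds_in[i];
+	}
+
+	return(n_cached);
+}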
+
+/********************************************************************//**
+Drops the adaptive hash index entries for all buffer pool pages that belong
+to the given index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index_on_index(
+/*=====================================*/
+	dict_index_t*	index)	/*!< in: index whose pages are to be
+				dropped from the adaptive hash index */
+{
+ buf_page_t* bpage;
+ hash_table_t* table;
+ buf_block_t* block;
+ ulint n_fields;
+ ulint n_bytes;
+ const page_t* page;
+ const rec_t* rec;
+ ulint fold;
+ ulint prev_fold;
+ dulint index_id;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint* offsets;
+
+ rw_lock_x_lock(&btr_search_latch);
+ mutex_enter(&LRU_list_mutex);
+
+ table = btr_search_sys->hash_index;
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+ block = (buf_block_t*) bpage;
+ if (block->index == index && block->is_hashed) {
+ page = block->frame;
+
+ /* from btr_search_drop_page_hash_index() */
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+
+ ut_a(n_fields + n_bytes > 0);
+
+ n_recs = page_get_n_recs(page);
+
+ /* Calculate and cache fold values into an array for fast deletion
+ from the hash index */
+
+ folds = mem_alloc(n_recs * sizeof(ulint));
+
+ n_cached = 0;
+
+ rec = page_get_infimum_rec(page);
+ rec = page_rec_get_next_low(rec, page_is_comp(page));
+
+ index_id = btr_page_get_index_id(page);
+
+ ut_a(0 == ut_dulint_cmp(index_id, index->id));
+
+ prev_fold = 0;
+
+ offsets = NULL;
+
+ while (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+ if (fold == prev_fold && prev_fold != 0) {
+
+ goto next_rec;
+ }
+
+ /* Remove all hash nodes pointing to this page from the
+ hash chain */
+
+ folds[n_cached] = fold;
+ n_cached++;
+next_rec:
+ rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+ prev_fold = fold;
+ }
+
+ for (i = 0; i < n_cached; i++) {
+
+ ha_remove_all_nodes_to_page(table, folds[i], page);
+ }
+
+ ut_a(index->search_info->ref_count > 0);
+ index->search_info->ref_count--;
+
+ block->is_hashed = FALSE;
+ block->index = NULL;
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ if (UNIV_UNLIKELY(block->n_pointers)) {
+ /* Corruption */
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+" InnoDB: Corruption of adaptive hash index. After dropping\n"
+"InnoDB: the hash index to a page of %s, still %lu hash nodes remain.\n",
+ index->name, (ulong) block->n_pointers);
+ }
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ mem_free(folds);
+ }
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&btr_search_latch);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/********************************************************************//**
+Drops the possible page hash index when a page is freed from a fseg to the
+file system, if the page happens to be in the buffer pool. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no) /*!< in: page number */
+{
+ buf_block_t* block;
+ mtr_t mtr;
+
+ if (!buf_page_peek_if_search_hashed(space, page_no)) {
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ /* We assume that if the caller has a latch on the page, then the
+ caller has already dropped the hash index for the page, and we never
+ get here. Therefore we can acquire the s-latch to the page without
+ having to fear a deadlock. */
+
+ block = buf_page_get_gen(space, zip_size, page_no, RW_S_LATCH, NULL,
+ BUF_GET_IF_IN_POOL, __FILE__, __LINE__,
+ &mtr);
+ /* Because the buffer pool mutex was released by
+ buf_page_peek_if_search_hashed(), it is possible that the
+ block was removed from the buffer pool by another thread
+ before buf_page_get_gen() got a chance to acquire the buffer
+ pool mutex again. Thus, we must check for a NULL return. */
+
+ if (UNIV_LIKELY(block != NULL)) {
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+
+ btr_search_drop_page_hash_index(block);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+ dict_index_t* index, /*!< in: index for which to build */
+ buf_block_t* block, /*!< in: index page, s- or x-latched */
+ ulint n_fields,/*!< in: hash this many full fields */
+ ulint n_bytes,/*!< in: hash this many bytes from the next
+ field */
+ ibool left_side)/*!< in: hash for searches from left side? */
+{
+ hash_table_t* table;
+ page_t* page;
+ rec_t* rec;
+ rec_t* next_rec;
+ ulint fold;
+ ulint next_fold;
+ dulint index_id;
+ ulint n_cached;
+ ulint n_recs;
+ ulint* folds;
+ rec_t** recs;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index);
+ ut_a(!dict_index_is_ibuf(index));
+
+ table = btr_search_sys->hash_index;
+ page = buf_block_get_frame(block);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ rw_lock_s_lock(&btr_search_latch);
+
+ if (block->is_hashed && ((block->curr_n_fields != n_fields)
+ || (block->curr_n_bytes != n_bytes)
+ || (block->curr_left_side != left_side))) {
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ btr_search_drop_page_hash_index(block);
+ } else {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+
+ n_recs = page_get_n_recs(page);
+
+ if (n_recs == 0) {
+
+ return;
+ }
+
+ /* Check that the values for hash index build are sensible */
+
+ if (n_fields + n_bytes == 0) {
+
+ return;
+ }
+
+ if (dict_index_get_n_unique_in_tree(index) < n_fields
+ || (dict_index_get_n_unique_in_tree(index) == n_fields
+ && n_bytes > 0)) {
+ return;
+ }
+
+ /* Calculate and cache fold values and corresponding records into
+ an array for fast insertion to the hash index */
+
+ folds = mem_alloc(n_recs * sizeof(ulint));
+ recs = mem_alloc(n_recs * sizeof(rec_t*));
+
+ n_cached = 0;
+
+ index_id = btr_page_get_index_id(page);
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+
+ if (!page_rec_is_supremum(rec)) {
+ ut_a(n_fields <= rec_offs_n_fields(offsets));
+
+ if (n_bytes > 0) {
+ ut_a(n_fields < rec_offs_n_fields(offsets));
+ }
+ }
+
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+ if (left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ for (;;) {
+ next_rec = page_rec_get_next(rec);
+
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+
+ break;
+ }
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index_id);
+
+ if (fold != next_fold) {
+ /* Insert an entry into the hash index */
+
+ if (left_side) {
+
+ folds[n_cached] = next_fold;
+ recs[n_cached] = next_rec;
+ n_cached++;
+ } else {
+ folds[n_cached] = fold;
+ recs[n_cached] = rec;
+ n_cached++;
+ }
+ }
+
+ rec = next_rec;
+ fold = next_fold;
+ }
+
+ btr_search_check_free_space_in_heap();
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if (UNIV_UNLIKELY(btr_search_fully_disabled)) {
+ goto exit_func;
+ }
+
+ if (block->is_hashed && ((block->curr_n_fields != n_fields)
+ || (block->curr_n_bytes != n_bytes)
+ || (block->curr_left_side != left_side))) {
+ goto exit_func;
+ }
+
+ /* This counter is decremented every time we drop page
+ hash index entries and is incremented here. Since we can
+ rebuild hash index for a page that is already hashed, we
+ have to take care not to increment the counter in that
+ case. */
+ if (!block->is_hashed) {
+ index->search_info->ref_count++;
+ }
+
+ block->is_hashed = TRUE;
+ block->n_hash_helps = 0;
+
+ block->curr_n_fields = n_fields;
+ block->curr_n_bytes = n_bytes;
+ block->curr_left_side = left_side;
+ block->index = index;
+
+ for (i = 0; i < n_cached; i++) {
+
+ ha_insert_for_fold(table, folds[i], block, recs[i]);
+ }
+
+exit_func:
+ rw_lock_x_unlock(&btr_search_latch);
+
+ mem_free(folds);
+ mem_free(recs);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
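+
+/* Illustrative sketch -- not part of this patch.  For every group of
+consecutive records with an equal fold, the loop above inserts exactly one
+hash pointer: the first record of the group when left_side == TRUE (a
+search from the left lands on the first record with the prefix), and the
+last record of the group otherwise.  The helper below picks that
+representative for a plain array of folds; names are invented. */
+static unsigned long
+sketch_group_representative(
+/*========================*/
+	const unsigned long*	folds,	/* folds in page order */
+	unsigned long		n,	/* number of folds, n > 0 */
+	unsigned long		i,	/* index of any member of the group */
+	int			left_side)
+{
+	if (left_side) {
+		while (i > 0 && folds[i - 1] == folds[i]) {
+			i--;	/* first record with this fold */
+		}
+	} else {
+		while (i + 1 < n && folds[i + 1] == folds[i]) {
+			i++;	/* last record with this fold */
+		}
+	}
+
+	return(i);
+}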
+
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_block is already
+hashed, then the hash index for block, if any, is dropped. If new_block is
+not hashed, and block is hashed, then a new hash index is built on new_block
+with the same parameters as on block (this often happens when a page is
+split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+ buf_block_t* new_block, /*!< in: records are copied
+ to this page */
+ buf_block_t* block, /*!< in: index page from which
+ records were copied, and the
+ copied records will be deleted
+ from this page */
+ dict_index_t* index) /*!< in: record descriptor */
+{
+ ulint n_fields;
+ ulint n_bytes;
+ ibool left_side;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+ ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_a(!new_block->is_hashed || new_block->index == index);
+ ut_a(!block->is_hashed || block->index == index);
+ ut_a(!(new_block->is_hashed || block->is_hashed)
+ || !dict_index_is_ibuf(index));
+
+ rw_lock_s_lock(&btr_search_latch);
+
+ if (new_block->is_hashed) {
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ btr_search_drop_page_hash_index(block);
+
+ return;
+ }
+
+ if (block->is_hashed) {
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ left_side = block->curr_left_side;
+
+ new_block->n_fields = block->curr_n_fields;
+ new_block->n_bytes = block->curr_n_bytes;
+ new_block->left_side = left_side;
+
+ rw_lock_s_unlock(&btr_search_latch);
+
+ ut_a(n_fields + n_bytes > 0);
+
+ btr_search_build_page_hash_index(index, new_block, n_fields,
+ n_bytes, left_side);
+ ut_ad(n_fields == block->curr_n_fields);
+ ut_ad(n_bytes == block->curr_n_bytes);
+ ut_ad(left_side == block->curr_left_side);
+ return;
+ }
+
+ rw_lock_s_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+ btr_cur_t* cursor) /*!< in: cursor which was positioned on the
+ record to delete using btr_cur_search_...,
+ the record is not yet deleted */
+{
+ hash_table_t* table;
+ buf_block_t* block;
+ rec_t* rec;
+ ulint fold;
+ dulint index_id;
+ ibool found;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ mem_heap_t* heap = NULL;
+ rec_offs_init(offsets_);
+
+ rec = btr_cur_get_rec(cursor);
+
+ block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!block->is_hashed) {
+
+ return;
+ }
+
+ ut_a(block->index == cursor->index);
+ ut_a(block->curr_n_fields + block->curr_n_bytes > 0);
+ ut_a(!dict_index_is_ibuf(cursor->index));
+
+ table = btr_search_sys->hash_index;
+
+ index_id = cursor->index->id;
+ fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ block->curr_n_fields, block->curr_n_bytes, index_id);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ rw_lock_x_lock(&btr_search_latch);
+
+ found = ha_search_and_delete_if_found(table, fold, rec);
+
+ rw_lock_x_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates an existing page hash index node when a single record is inserted on
+a page: if the cursor was positioned using the hash index with the page's
+current hashing parameters and the page is hashed for searches from the
+right side, the node is simply moved to point to the inserted record;
+otherwise the work is delegated to btr_search_update_hash_on_insert(). */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+ btr_cur_t* cursor) /*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+{
+ hash_table_t* table;
+ buf_block_t* block;
+ rec_t* rec;
+
+ rec = btr_cur_get_rec(cursor);
+
+ block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!block->is_hashed) {
+
+ return;
+ }
+
+ ut_a(block->index == cursor->index);
+ ut_a(!dict_index_is_ibuf(cursor->index));
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ if ((cursor->flag == BTR_CUR_HASH)
+ && (cursor->n_fields == block->curr_n_fields)
+ && (cursor->n_bytes == block->curr_n_bytes)
+ && !block->curr_left_side) {
+
+ table = btr_search_sys->hash_index;
+
+ ha_search_and_update_if_found(table, cursor->fold, rec,
+ block, page_rec_get_next(rec));
+
+ rw_lock_x_unlock(&btr_search_latch);
+ } else {
+ rw_lock_x_unlock(&btr_search_latch);
+
+ btr_search_update_hash_on_insert(cursor);
+ }
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+ btr_cur_t* cursor) /*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+{
+ hash_table_t* table;
+ buf_block_t* block;
+ rec_t* rec;
+ rec_t* ins_rec;
+ rec_t* next_rec;
+ dulint index_id;
+ ulint fold;
+ ulint ins_fold;
+	ulint		next_fold = 0;	/* only to silence a compiler
+					warning; read only after being
+					assigned below */
+ ulint n_fields;
+ ulint n_bytes;
+ ibool left_side;
+ ibool locked = FALSE;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ table = btr_search_sys->hash_index;
+
+ btr_search_check_free_space_in_heap();
+
+ rec = btr_cur_get_rec(cursor);
+
+ block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!block->is_hashed) {
+
+ return;
+ }
+
+ ut_a(block->index == cursor->index);
+ ut_a(!dict_index_is_ibuf(cursor->index));
+
+ index_id = cursor->index->id;
+
+ n_fields = block->curr_n_fields;
+ n_bytes = block->curr_n_bytes;
+ left_side = block->curr_left_side;
+
+ ins_rec = page_rec_get_next(rec);
+ next_rec = page_rec_get_next(ins_rec);
+
+ offsets = rec_get_offsets(ins_rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index_id);
+
+ if (!page_rec_is_supremum(next_rec)) {
+ offsets = rec_get_offsets(next_rec, cursor->index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ next_fold = rec_fold(next_rec, offsets, n_fields,
+ n_bytes, index_id);
+ }
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ n_fields + (n_bytes > 0), &heap);
+ fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+ } else {
+ if (left_side) {
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ }
+
+ goto check_next_rec;
+ }
+
+ if (fold != ins_fold) {
+
+ if (!locked) {
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+ }
+
+ if (!left_side) {
+ ha_insert_for_fold(table, fold, block, rec);
+ } else {
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ }
+ }
+
+check_next_rec:
+ if (page_rec_is_supremum(next_rec)) {
+
+ if (!left_side) {
+
+ if (!locked) {
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+ }
+
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ }
+
+ goto function_exit;
+ }
+
+ if (ins_fold != next_fold) {
+
+ if (!locked) {
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ locked = TRUE;
+ }
+
+ if (!left_side) {
+
+ ha_insert_for_fold(table, ins_fold, block, ins_rec);
+ /*
+ fputs("Hash insert for ", stderr);
+ dict_index_name_print(stderr, cursor->index);
+ fprintf(stderr, " fold %lu\n", ins_fold);
+ */
+ } else {
+ ha_insert_for_fold(table, next_fold, block, next_rec);
+ }
+ }
+
+function_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ if (locked) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+}
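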
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/********************************************************************//**
+Validates the search system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void)
+/*=====================*/
+{
+ ha_node_t* node;
+ ulint n_page_dumps = 0;
+ ibool ok = TRUE;
+ ulint i;
+ ulint cell_count;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ /* How many cells to check before temporarily releasing
+ btr_search_latch. */
+ ulint chunk_size = 10000;
+
+ rec_offs_init(offsets_);
+
+ rw_lock_x_lock(&btr_search_latch);
+ //buf_pool_mutex_enter();
+ rw_lock_x_lock(&page_hash_latch);
+
+ cell_count = hash_get_n_cells(btr_search_sys->hash_index);
+
+ for (i = 0; i < cell_count; i++) {
+ /* We release btr_search_latch every once in a while to
+ give other queries a chance to run. */
+ if ((i != 0) && ((i % chunk_size) == 0)) {
+ //buf_pool_mutex_exit();
+ rw_lock_x_unlock(&page_hash_latch);
+ rw_lock_x_unlock(&btr_search_latch);
+ os_thread_yield();
+ rw_lock_x_lock(&btr_search_latch);
+ //buf_pool_mutex_enter();
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
+ node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
+
+ for (; node != NULL; node = node->next) {
+ const buf_block_t* block
+ = buf_block_align(node->data);
+ const buf_block_t* hash_block;
+
+ if (UNIV_LIKELY(buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE)) {
+
+ /* The space and offset are only valid
+ for file blocks. It is possible that
+ the block is being freed
+ (BUF_BLOCK_REMOVE_HASH, see the
+ assertion and the comment below) */
+ hash_block = buf_block_hash_get(
+ buf_block_get_space(block),
+ buf_block_get_page_no(block));
+ } else {
+ hash_block = NULL;
+ }
+
+ if (hash_block) {
+ ut_a(hash_block == block);
+ } else {
+ /* When a block is being freed,
+ buf_LRU_search_and_free_block() first
+ removes the block from
+ buf_pool->page_hash by calling
+ buf_LRU_block_remove_hashed_page().
+ After that, it invokes
+ btr_search_drop_page_hash_index() to
+ remove the block from
+ btr_search_sys->hash_index. */
+
+ ut_a(buf_block_get_state(block)
+ == BUF_BLOCK_REMOVE_HASH);
+ }
+
+ ut_a(!dict_index_is_ibuf(block->index));
+
+ offsets = rec_get_offsets((const rec_t*) node->data,
+ block->index, offsets,
+ block->curr_n_fields
+ + (block->curr_n_bytes > 0),
+ &heap);
+
+ if (!block->is_hashed || node->fold
+ != rec_fold((rec_t*)(node->data),
+ offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ btr_page_get_index_id(block->frame))) {
+ const page_t* page = block->frame;
+
+ ok = FALSE;
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error in an adaptive hash"
+ " index pointer to page %lu\n"
+ "InnoDB: ptr mem address %p"
+ " index id %lu %lu,"
+ " node fold %lu, rec fold %lu\n",
+ (ulong) page_get_page_no(page),
+ node->data,
+ (ulong) ut_dulint_get_high(
+ btr_page_get_index_id(page)),
+ (ulong) ut_dulint_get_low(
+ btr_page_get_index_id(page)),
+ (ulong) node->fold,
+ (ulong) rec_fold((rec_t*)(node->data),
+ offsets,
+ block->curr_n_fields,
+ block->curr_n_bytes,
+ btr_page_get_index_id(
+ page)));
+
+ fputs("InnoDB: Record ", stderr);
+ rec_print_new(stderr, (rec_t*)node->data,
+ offsets);
+ fprintf(stderr, "\nInnoDB: on that page."
+ " Page mem address %p, is hashed %lu,"
+ " n fields %lu, n bytes %lu\n"
+ "InnoDB: side %lu\n",
+ (void*) page, (ulong) block->is_hashed,
+ (ulong) block->curr_n_fields,
+ (ulong) block->curr_n_bytes,
+ (ulong) block->curr_left_side);
+
+ if (n_page_dumps < 20) {
+ buf_page_print(page, 0);
+ n_page_dumps++;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < cell_count; i += chunk_size) {
+ ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+ /* We release btr_search_latch every once in a while to
+ give other queries a chance to run. */
+ if (i != 0) {
+ //buf_pool_mutex_exit();
+ rw_lock_x_unlock(&page_hash_latch);
+ rw_lock_x_unlock(&btr_search_latch);
+ os_thread_yield();
+ rw_lock_x_lock(&btr_search_latch);
+ //buf_pool_mutex_enter();
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
+ if (!ha_validate(btr_search_sys->hash_index, i, end_index)) {
+ ok = FALSE;
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_x_unlock(&page_hash_latch);
+ rw_lock_x_unlock(&btr_search_latch);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c
new file mode 100644
index 00000000000..e6b80bcda55
--- /dev/null
+++ b/storage/xtradb/buf/buf0buddy.c
@@ -0,0 +1,804 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.c
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "buf0buddy.h"
+#ifdef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+#undef THIS_MODULE
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+
+/* Statistic counters */
+
+#ifdef UNIV_DEBUG
+/** Number of frames allocated from the buffer pool to the buddy system.
+Protected by buf_pool_mutex. */
+static ulint buf_buddy_n_frames;
+#endif /* UNIV_DEBUG */
+/** Statistics of the buddy system, indexed by block size.
+Protected by buf_pool_mutex. */
+UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+
+/**********************************************************************//**
+Get the offset of the buddy of a compressed page frame.
+@return the buddy of page: the other block of the same size within the
+2*size-aligned pair */
+UNIV_INLINE
+byte*
+buf_buddy_get(
+/*==========*/
+ byte* page, /*!< in: compressed page */
+ ulint size) /*!< in: page size in bytes */
+{
+ ut_ad(ut_is_2pow(size));
+ ut_ad(size >= BUF_BUDDY_LOW);
+ ut_ad(size < BUF_BUDDY_HIGH);
+ ut_ad(!ut_align_offset(page, size));
+
+ if (((ulint) page) & size) {
+ return(page - size);
+ } else {
+ return(page + size);
+ }
+}
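+
+/* Illustrative sketch -- not part of this patch.  Since size is a power
+of two and page is size-aligned, the branch above is equivalent to
+toggling the size bit of the address: the buddy of a block at offset off
+within its 2*size-aligned pair is off ^ size.  For example, with
+size == 1024, offsets 0 and 1024 are each other's buddies. */
+static unsigned long
+sketch_buddy_offset(
+/*================*/
+	unsigned long	off,	/* size-aligned offset of the block */
+	unsigned long	size)	/* block size, a power of two */
+{
+	return(off ^ size);
+}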
+
+/**********************************************************************//**
+Add a block to the head of the appropriate buddy free list. */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(
+/*==================*/
+ buf_page_t* bpage, /*!< in,own: block to be freed */
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG_VALGRIND
+ buf_page_t* b = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+
+ if (b) UNIV_MEM_VALID(b, BUF_BUDDY_LOW << i);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&zip_free_mutex));
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+ ut_ad(buf_pool->zip_free[i].start != bpage);
+ UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ if (b) UNIV_MEM_FREE(b, BUF_BUDDY_LOW << i);
+ UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
+#endif /* UNIV_DEBUG_VALGRIND */
+}
+
+/**********************************************************************//**
+Remove a block from the appropriate buddy free list. */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(
+/*=======================*/
+ buf_page_t* bpage, /*!< in: block to be removed */
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG_VALGRIND
+ buf_page_t* prev = UT_LIST_GET_PREV(zip_list, bpage);
+ buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage);
+
+ if (prev) UNIV_MEM_VALID(prev, BUF_BUDDY_LOW << i);
+ if (next) UNIV_MEM_VALID(next, BUF_BUDDY_LOW << i);
+
+ ut_ad(!prev || buf_page_get_state(prev) == BUF_BLOCK_ZIP_FREE);
+ ut_ad(!next || buf_page_get_state(next) == BUF_BLOCK_ZIP_FREE);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&zip_free_mutex));
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+ UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ if (prev) UNIV_MEM_FREE(prev, BUF_BUDDY_LOW << i);
+ if (next) UNIV_MEM_FREE(next, BUF_BUDDY_LOW << i);
+#endif /* UNIV_DEBUG_VALGRIND */
+}
+
+/**********************************************************************//**
+Try to allocate a block from buf_pool->zip_free[].
+@return allocated block, or NULL if buf_pool->zip_free[] was empty */
+static
+void*
+buf_buddy_alloc_zip(
+/*================*/
+ ulint i) /*!< in: index of buf_pool->zip_free[] */
+{
+ buf_page_t* bpage;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&zip_free_mutex));
+ ut_a(i < BUF_BUDDY_SIZES);
+
+#ifndef UNIV_DEBUG_VALGRIND
+ /* Valgrind would complain about accessing free memory. */
+ ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+ ut_ad(buf_page_get_state(ut_list_node_313)
+ == BUF_BLOCK_ZIP_FREE)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+ bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
+
+ if (bpage) {
+ UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+
+ buf_buddy_remove_from_free(bpage, i);
+ } else if (i + 1 < BUF_BUDDY_SIZES) {
+ /* Attempt to split. */
+ bpage = buf_buddy_alloc_zip(i + 1);
+
+ if (bpage) {
+ buf_page_t* buddy = (buf_page_t*)
+ (((char*) bpage) + (BUF_BUDDY_LOW << i));
+
+ ut_ad(!buf_pool_contains_zip(buddy));
+ ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
+ buddy->state = BUF_BLOCK_ZIP_FREE;
+ buf_buddy_add_to_free(buddy, i);
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ if (bpage) {
+ memset(bpage, ~i, BUF_BUDDY_LOW << i);
+ }
+#endif /* UNIV_DEBUG */
+
+ UNIV_MEM_ALLOC(bpage, BUF_BUDDY_SIZES << i);
+
+ return(bpage);
+}
+
+/**********************************************************************//**
+Deallocate a buffer frame of UNIV_PAGE_SIZE. */
+static
+void
+buf_buddy_block_free(
+/*=================*/
+ void* buf, /*!< in: buffer frame to deallocate */
+ ibool have_page_hash_mutex)
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD_PTR(buf);
+ buf_page_t* bpage;
+ buf_block_t* block;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(!mutex_own(&buf_pool_zip_mutex));
+ ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
+
+ mutex_enter(&zip_hash_mutex);
+
+ HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
+ && bpage->in_zip_hash && !bpage->in_page_hash),
+ ((buf_block_t*) bpage)->frame == buf);
+ ut_a(bpage);
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(bpage->in_zip_hash);
+ ut_d(bpage->in_zip_hash = FALSE);
+ HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
+
+ mutex_exit(&zip_hash_mutex);
+
+ ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
+ UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
+
+ block = (buf_block_t*) bpage;
+ mutex_enter(&block->mutex);
+ buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
+ mutex_exit(&block->mutex);
+
+ ut_ad(buf_buddy_n_frames > 0);
+ ut_d(buf_buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator. */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+ buf_block_t* block) /*!< in: buffer frame to allocate */
+{
+ const ulint fold = BUF_POOL_ZIP_FOLD(block);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(!mutex_own(&buf_pool_zip_mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ ut_a(block->frame);
+ ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
+
+ ut_ad(!block->page.in_page_hash);
+ ut_ad(!block->page.in_zip_hash);
+ ut_d(block->page.in_zip_hash = TRUE);
+
+ mutex_enter(&zip_hash_mutex);
+ HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
+ mutex_exit(&zip_hash_mutex);
+
+ ut_d(buf_buddy_n_frames++);
+}
+
+/**********************************************************************//**
+Allocate a block from a bigger object.
+@return allocated block */
+static
+void*
+buf_buddy_alloc_from(
+/*=================*/
+ void* buf, /*!< in: a block that is free to use */
+ ulint i, /*!< in: index of buf_pool->zip_free[] */
+ ulint j) /*!< in: size of buf as an index
+ of buf_pool->zip_free[] */
+{
+ ulint offs = BUF_BUDDY_LOW << j;
+ ut_ad(j <= BUF_BUDDY_SIZES);
+ ut_ad(j >= i);
+ ut_ad(!ut_align_offset(buf, offs));
+
+ /* Add the unused parts of the block to the free lists. */
+ while (j > i) {
+ buf_page_t* bpage;
+
+ offs >>= 1;
+ j--;
+
+ bpage = (buf_page_t*) ((byte*) buf + offs);
+ ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
+ bpage->state = BUF_BLOCK_ZIP_FREE;
+#ifndef UNIV_DEBUG_VALGRIND
+ /* Valgrind would complain about accessing free memory. */
+ ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+ ut_ad(buf_page_get_state(
+ ut_list_node_313)
+ == BUF_BLOCK_ZIP_FREE)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+ buf_buddy_add_to_free(bpage, j);
+ }
+
+ return(buf);
+}
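+
+/* Illustrative sketch -- not part of this patch.  The split above halves
+the block repeatedly and frees the upper half at each step, so carving a
+1 KiB block out of a 16 KiB frame leaves free buddies of 8 KiB, 4 KiB,
+2 KiB and 1 KiB.  The helper below sums those fragments to show that
+nothing is lost; names are invented. */
+static unsigned long
+sketch_split_leftover(
+/*==================*/
+	unsigned long	alloc_size,	/* requested size, a power of two */
+	unsigned long	from_size)	/* size being split, >= alloc_size */
+{
+	unsigned long	leftover = 0;
+
+	while (from_size > alloc_size) {
+		from_size >>= 1;	/* split in half */
+		leftover += from_size;	/* the upper half becomes free */
+	}
+
+	return(leftover);	/* == original from_size - alloc_size */
+}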
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
+The buf_pool_mutex may only be released and reacquired if lru != NULL.
+@return allocated block, possibly NULL if lru==NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool* lru, /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+ ibool have_page_hash_mutex)
+{
+ buf_block_t* block;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(!mutex_own(&buf_pool_zip_mutex));
+
+ if (i < BUF_BUDDY_SIZES) {
+ /* Try to allocate from the buddy system. */
+ mutex_enter(&zip_free_mutex);
+ block = buf_buddy_alloc_zip(i);
+
+ if (block) {
+
+ goto func_exit;
+ }
+
+ mutex_exit(&zip_free_mutex);
+ }
+
+ /* Try allocating from the buf_pool->free list. */
+ block = buf_LRU_get_free_only();
+
+ if (block) {
+
+ goto alloc_big;
+ }
+
+ if (!lru) {
+
+ return(NULL);
+ }
+
+ /* Try replacing an uncompressed page in the buffer pool. */
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ if (have_page_hash_mutex) {
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+ block = buf_LRU_get_free_block(0);
+ *lru = TRUE;
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ if (have_page_hash_mutex) {
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
+alloc_big:
+ buf_buddy_block_register(block);
+
+ mutex_enter(&zip_free_mutex);
+ block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES);
+
+func_exit:
+ buf_buddy_stat[i].used++;
+ mutex_exit(&zip_free_mutex);
+
+ return(block);
+}
+
+/**********************************************************************//**
+Try to relocate the control block of a compressed page.
+@return TRUE if relocated */
+static
+ibool
+buf_buddy_relocate_block(
+/*=====================*/
+ buf_page_t* bpage, /*!< in: block to relocate */
+ buf_page_t* dpage) /*!< in: free block to relocate to */
+{
+ buf_page_t* b;
+
+ //ut_ad(buf_pool_mutex_own());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* ut_error; */ /* optimistic */
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* Cannot relocate dirty pages. */
+ return(FALSE);
+
+ case BUF_BLOCK_ZIP_PAGE:
+ break;
+ }
+
+ mutex_enter(&buf_pool_zip_mutex);
+ mutex_enter(&zip_free_mutex);
+
+ if (!buf_page_can_relocate(bpage)) {
+ mutex_exit(&buf_pool_zip_mutex);
+ mutex_exit(&zip_free_mutex);
+ return(FALSE);
+ }
+
+ if (bpage != buf_page_hash_get(bpage->space, bpage->offset)) {
+ mutex_exit(&buf_pool_zip_mutex);
+ mutex_exit(&zip_free_mutex);
+ return(FALSE);
+ }
+
+ buf_relocate(bpage, dpage);
+ ut_d(bpage->state = BUF_BLOCK_ZIP_FREE);
+
+ /* relocate buf_pool->zip_clean */
+ mutex_enter(&flush_list_mutex);
+ b = UT_LIST_GET_PREV(zip_list, dpage);
+ UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, dpage);
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, dpage);
+ }
+ mutex_exit(&flush_list_mutex);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
+ mutex_exit(&buf_pool_zip_mutex);
+ mutex_exit(&zip_free_mutex);
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Try to relocate a block.
+@return TRUE if relocated */
+static
+ibool
+buf_buddy_relocate(
+/*===============*/
+ void* src, /*!< in: block to relocate */
+ void* dst, /*!< in: free block to relocate to */
+ ulint i, /*!< in: index of buf_pool->zip_free[] */
+ ibool have_page_hash_mutex)
+{
+ buf_page_t* bpage;
+ const ulint size = BUF_BUDDY_LOW << i;
+ ullint usec = ut_time_us(NULL);
+ ulint space;
+ ulint page_no;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&zip_free_mutex));
+ ut_ad(!mutex_own(&buf_pool_zip_mutex));
+ ut_ad(!ut_align_offset(src, size));
+ ut_ad(!ut_align_offset(dst, size));
+ UNIV_MEM_ASSERT_W(dst, size);
+
+ /* We assume that all memory from buf_buddy_alloc()
+ is used for either compressed pages or buf_page_t
+ objects covering compressed pages. */
+
+ /* We look inside the allocated objects returned by
+ buf_buddy_alloc() and assume that anything of
+ PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains
+ a valid space_id and page_no in the page header. Should the
+ fields be invalid, we will be unable to relocate the block.
+ We also assume that anything that fits sizeof(buf_page_t)
+ actually is a properly initialized buf_page_t object. */
+
+ if (size >= PAGE_ZIP_MIN_SIZE) {
+ /* This is a compressed page. */
+ mutex_t* mutex;
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&zip_free_mutex);
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
+ /* The src block may be split into smaller blocks,
+ some of which may be free. Thus, the
+ mach_read_from_4() calls below may attempt to read
+ from free memory. The memory is "owned" by the buddy
+ allocator (and it has been allocated from the buffer
+ pool), so there is nothing wrong about this. The
+ mach_read_from_4() calls here will only trigger bogus
+ Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
+ space = mach_read_from_4(
+ (const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ page_no = mach_read_from_4(
+ (const byte*) src + FIL_PAGE_OFFSET);
+ /* Suppress Valgrind warnings about conditional jump
+ on uninitialized value. */
+ UNIV_MEM_VALID(&space, sizeof space);
+ UNIV_MEM_VALID(&page_no, sizeof page_no);
+ bpage = buf_page_hash_get(space, page_no);
+
+ if (!bpage || bpage->zip.data != src) {
+ /* The block has probably been freshly
+ allocated by buf_LRU_get_free_block() but not
+ added to buf_pool->page_hash yet. Obviously,
+ it cannot be relocated. */
+
+ if (!have_page_hash_mutex) {
+ mutex_enter(&zip_free_mutex);
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+ return(FALSE);
+ }
+
+ if (page_zip_get_size(&bpage->zip) != size) {
+ /* The block is of different size. We would
+ have to relocate all blocks covered by src.
+ For the sake of simplicity, give up. */
+ ut_ad(page_zip_get_size(&bpage->zip) < size);
+
+ if (!have_page_hash_mutex) {
+ mutex_enter(&zip_free_mutex);
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+ return(FALSE);
+ }
+
+ /* To keep latch order */
+ if (have_page_hash_mutex)
+ mutex_exit(&zip_free_mutex);
+
+ /* The block must have been allocated, but it may
+ contain uninitialized data. */
+ UNIV_MEM_ASSERT_W(src, size);
+
+ mutex = buf_page_get_mutex_enter(bpage);
+
+ mutex_enter(&zip_free_mutex);
+
+ if (mutex && buf_page_can_relocate(bpage)) {
+ /* Relocate the compressed page. */
+ ut_a(bpage->zip.data == src);
+ memcpy(dst, src, size);
+ bpage->zip.data = dst;
+ mutex_exit(mutex);
+success:
+ UNIV_MEM_INVALID(src, size);
+ {
+ buf_buddy_stat_t* buddy_stat
+ = &buf_buddy_stat[i];
+ buddy_stat->relocated++;
+ buddy_stat->relocated_usec
+ += ut_time_us(NULL) - usec;
+ }
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+ return(TRUE);
+ }
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+
+ if (mutex) {
+ mutex_exit(mutex);
+ }
+ } else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) {
+ /* This must be a buf_page_t object. */
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in
+ buf_page_t. On other systems, Valgrind could complain
+ about uninitialized pad bytes. */
+ UNIV_MEM_ASSERT_RW(src, size);
+#endif
+
+ mutex_exit(&zip_free_mutex);
+
+ if (!have_page_hash_mutex) {
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
+ if (buf_buddy_relocate_block(src, dst)) {
+ mutex_enter(&zip_free_mutex);
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+
+ goto success;
+ }
+
+ mutex_enter(&zip_free_mutex);
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool have_page_hash_mutex)
+{
+ buf_page_t* bpage;
+ buf_page_t* buddy;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&zip_free_mutex));
+ ut_ad(!mutex_own(&buf_pool_zip_mutex));
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ ut_ad(buf_buddy_stat[i].used > 0);
+
+ buf_buddy_stat[i].used--;
+recombine:
+ UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
+ ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE);
+
+ if (i == BUF_BUDDY_SIZES) {
+ mutex_exit(&zip_free_mutex);
+ buf_buddy_block_free(buf, have_page_hash_mutex);
+ mutex_enter(&zip_free_mutex);
+ return;
+ }
+
+ ut_ad(i < BUF_BUDDY_SIZES);
+ ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+ ut_ad(!buf_pool_contains_zip(buf));
+
+ /* Try to combine adjacent blocks. */
+
+ buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
+
+#ifndef UNIV_DEBUG_VALGRIND
+ /* Valgrind would complain about accessing free memory. */
+
+ if (buddy->state != BUF_BLOCK_ZIP_FREE) {
+
+ goto buddy_nonfree;
+ }
+
+ /* The field buddy->state can only be trusted for free blocks.
+ If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if
+ it is in the free list. */
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
+ UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+
+ if (bpage == buddy) {
+buddy_free:
+ /* The buddy is free: recombine */
+ buf_buddy_remove_from_free(bpage, i);
+buddy_free2:
+ ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
+ ut_ad(!buf_pool_contains_zip(buddy));
+ i++;
+ buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+ goto recombine;
+ }
+
+ ut_a(bpage != buf);
+
+ {
+ buf_page_t* next = UT_LIST_GET_NEXT(zip_list, bpage);
+ UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
+ bpage = next;
+ }
+ }
+
+#ifndef UNIV_DEBUG_VALGRIND
+buddy_nonfree:
+ /* Valgrind would complain about accessing free memory. */
+ ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+ ut_ad(buf_page_get_state(ut_list_node_313)
+ == BUF_BLOCK_ZIP_FREE)));
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ /* The buddy is not free. Is there a free block of this size? */
+ bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
+
+ if (bpage) {
+ /* Remove the block from the free list, because a successful
+ buf_buddy_relocate() will overwrite bpage->list. */
+
+ UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+ buf_buddy_remove_from_free(bpage, i);
+
+ /* Try to relocate the buddy of buf to the free block. */
+ if (buf_buddy_relocate(buddy, bpage, i, have_page_hash_mutex)) {
+
+ ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
+ goto buddy_free2;
+ }
+
+ buf_buddy_add_to_free(bpage, i);
+
+ /* Try to relocate the buddy of the free block to buf. */
+ buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage),
+ BUF_BUDDY_LOW << i);
+
+#ifndef UNIV_DEBUG_VALGRIND
+ /* Valgrind would complain about accessing free memory. */
+
+ /* The buddy must not be (completely) free, because we
+ always recombine adjacent free blocks.
+
+ (Parts of the buddy can be free in
+ buf_pool->zip_free[j] with j < i.) */
+ ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+ ut_ad(buf_page_get_state(
+ ut_list_node_313)
+ == BUF_BLOCK_ZIP_FREE
+ && ut_list_node_313 != buddy)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+ if (buf_buddy_relocate(buddy, buf, i, have_page_hash_mutex)) {
+
+ buf = bpage;
+ UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+ ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
+ goto buddy_free;
+ }
+ }
+
+ /* Free the block to the buddy list. */
+ bpage = buf;
+#ifdef UNIV_DEBUG
+ if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) {
+ /* This area has most likely been allocated for at
+ least one compressed-only block descriptor. Check
+ that there are no live objects in the area. This is
+ not a complete check: it may yield false positives as
+ well as false negatives. Also, due to buddy blocks
+ being recombined, it is possible (although unlikely)
+ that this branch is never reached. */
+
+ char* c;
+
+# ifndef UNIV_DEBUG_VALGRIND
+ /* Valgrind would complain about accessing
+ uninitialized memory. Besides, Valgrind performs a
+ more exhaustive check, at every memory access. */
+ const buf_page_t* b = buf;
+ const buf_page_t* const b_end = (buf_page_t*)
+ ((char*) b + (BUF_BUDDY_LOW << i));
+
+ for (; b < b_end; b++) {
+ /* Avoid false positives (and cause false
+ negatives) by checking for b->space < 1000. */
+
+ if ((b->state == BUF_BLOCK_ZIP_PAGE
+ || b->state == BUF_BLOCK_ZIP_DIRTY)
+ && b->space > 0 && b->space < 1000) {
+ fprintf(stderr,
+ "buddy dirty %p %u (%u,%u) %p,%lu\n",
+ (void*) b,
+ b->state, b->space, b->offset,
+ buf, i);
+ }
+ }
+# endif /* !UNIV_DEBUG_VALGRIND */
+
+ /* Scramble the block. This should make any pointers
+ invalid and trigger a segmentation violation. Because
+ the scrambling can be reversed, it may be possible to
+ track down the object pointing to the freed data by
+ dereferencing the unscrambled bpage->LRU or
+ bpage->list pointers. */
+ for (c = (char*) buf + (BUF_BUDDY_LOW << i);
+ c-- > (char*) buf; ) {
+ *c = ~*c ^ i;
+ }
+ } else {
+ /* Fill large blocks with a constant pattern. */
+ memset(bpage, i, BUF_BUDDY_LOW << i);
+ }
+#endif /* UNIV_DEBUG */
+ bpage->state = BUF_BLOCK_ZIP_FREE;
+ buf_buddy_add_to_free(bpage, i);
+}
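+
+/* Example (illustration only): a worked instance of the recombination
+above, under the assumption that buf_buddy_get() computes the buddy of a
+block by flipping the size bit of the block's offset within its frame:
+
+    buddy_offset = offset ^ (BUF_BUDDY_LOW << i);
+
+For a 512-byte block at byte offset 1024, the buddy is the 512-byte block
+at offset 1536 (1024 ^ 512); once both are free they are recombined, via
+ut_align_down(), into a single 1024-byte block at offset 1024 that moves
+to the next larger free list. The concrete numbers are only an example;
+the real computation is done by buf_buddy_get(). */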
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c
new file mode 100644
index 00000000000..1c08bd6d0bf
--- /dev/null
+++ b/storage/xtradb/buf/buf0buf.c
@@ -0,0 +1,4901 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.c
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0buddy.h"
+#include "lock0lock.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "log0log.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0zip.h"
+#include "trx0trx.h"
+#include "srv0start.h"
+#include "que0que.h"
+#include "read0read.h"
+#include "row0row.h"
+#include "ha_prototypes.h"
+
+/* prototypes for new functions added to ha_innodb.cc */
+trx_t* innobase_get_trx();
+
+inline void _increment_page_get_statistics(buf_block_t* block, trx_t* trx)
+{
+ ulint block_hash;
+ ulint block_hash_byte;
+ byte block_hash_offset;
+
+ ut_ad(block);
+
+ if (!innobase_get_slow_log() || !trx || !trx->take_stats)
+ return;
+
+ if (!trx->distinct_page_access_hash) {
+ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+ }
+
+ block_hash = ut_hash_ulint((block->page.space << 20) + block->page.space +
+ block->page.offset, DPAH_SIZE << 3);
+ block_hash_byte = block_hash >> 3;
+ block_hash_offset = (byte) block_hash & 0x07;
+ if (block_hash_byte >= DPAH_SIZE)
+ fprintf(stderr, "!!! block_hash_byte = %lu block_hash_offset = %d !!!\n", block_hash_byte, block_hash_offset);
+ if (block_hash_offset > 7)
+ fprintf(stderr, "!!! block_hash_byte = %lu block_hash_offset = %d !!!\n", block_hash_byte, block_hash_offset);
+ if ((trx->distinct_page_access_hash[block_hash_byte] & ((byte) 0x01 << block_hash_offset)) == 0)
+ trx->distinct_page_access++;
+ trx->distinct_page_access_hash[block_hash_byte] |= (byte) 0x01 << block_hash_offset;
+ return;
+}
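+
+/* Example (illustration only): the function above packs one bit per
+distinct page into the DPAH_SIZE-byte bitmap, mapping a hash value h to
+byte (h >> 3) and bit (h & 0x07) of that byte. Assuming a hash value of
+43, the access is recorded as
+
+    trx->distinct_page_access_hash[5] |= (byte) 0x01 << 3;
+
+and trx->distinct_page_access is incremented only if that bit was still
+clear. */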
+
+/*
+ IMPLEMENTATION OF THE BUFFER POOL
+ =================================
+
+Performance improvement:
+------------------------
+Thread scheduling in NT may be so slow that the OS wait mechanism should
+not be used even in waiting for disk reads to complete.
+Rather, we should put waiting query threads to the queue of
+waiting jobs, and let the OS thread do something useful while the i/o
+is processed. In this way we could remove most OS thread switches in
+an i/o-intensive benchmark like TPC-C.
+
+A possibility is to put a user space thread library between the database
+and NT. User space thread libraries might be very fast.
+
+SQL Server 7.0 can be configured to use 'fibers' which are lightweight
+threads in NT. These should be studied.
+
+ Buffer frames and blocks
+ ------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+ Buffer pool struct
+ ------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool mutex.
+
+The buf_pool mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool mutex hold time.
+
+ Control blocks
+ --------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+We intend to make the buffer buf_pool size on-line reconfigurable,
+that is, the buf_pool size can be changed without closing the database.
+Then the database administrator may adjust it to be bigger
+at night, for example. The control block array must
+contain enough control blocks for the maximum buffer buf_pool size
+which is used in the particular database.
+If the buf_pool size is cut, we exploit the virtual memory mechanism of
+the OS, and just refrain from using frames at high addresses. Then the OS
+can swap them to disk.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+ Lists of blocks
+ ---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool->free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used by the read-ahead mechanism
+for pages, and it can also be used when there is a scan of a full
+table which cannot fit in memory. By putting such pages near the end
+of the LRU list, we make sure that most of the buf_pool stays in
+main memory, undisturbed.
+
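+As a sketch for illustration only (not the actual code), a normal
+insertion and an "artificially aged" insertion differ roughly as
+follows, using the list macros employed elsewhere in this file:
+
+    UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage);
+    UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old, bpage);
+
+The first form puts bpage at the most-recently-used end; the second
+puts it right after the old-pages pointer so that it is evicted sooner.
+The real insertion logic, with its handling of the LRU_old boundary,
+lives in buf0lru.c.
+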
+The unzip_LRU list contains a subset of the common LRU list. The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame. A block is in unzip_LRU if and
+only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
+holds. The blocks in unzip_LRU will be in same order as they are in
+the common LRU list. That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool->flush_list) contains the blocks
+holding file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+
+The chain of unmodified compressed blocks (buf_pool->zip_clean)
+contains the control blocks (buf_page_t) of those compressed pages
+that are not in buf_pool->flush_list and for which no uncompressed
+page has been allocated in the buffer pool. The control blocks for
+uncompressed pages are accessible via buf_block_t objects that are
+reachable via buf_pool->chunks[].
+
+The chains of free memory blocks (buf_pool->zip_free[]) are used by
+the buddy allocator (buf0buddy.c) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2. These
+blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool. The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
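+
+As a sketch only (siz is a hypothetical variable standing for the
+requested allocation size), the smallest size class i that can serve a
+request is found as
+
+    ulint i;
+    for (i = 0; (BUF_BUDDY_LOW << i) < siz; i++) {
+    }
+
+so that zip_free[i] holds free blocks of exactly BUF_BUDDY_LOW << i
+bytes; the mapping used by the code is buf_buddy_get_slot() in
+buf0buddy.c.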
+
+ Loading a file page
+ -------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
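+
+From a caller's point of view all of this is hidden behind
+buf_page_get(); a simplified usage sketch (error handling and the mtr
+set-up omitted) is
+
+    block = buf_page_get(space, zip_size, offset, RW_S_LATCH, &mtr);
+
+which waits for any pending read, buffer-fixes the block and acquires
+the requested latch before returning.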
+
+ Read-ahead
+ ----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leaves
+of a B-tree are scanned in ascending or descending order.
+When a page is referenced in the buf_pool for the first time,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
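+
+/* Example (illustration only): the "border of a linear read-ahead area"
+test described above reduces to checking the page offset against the
+area size. A minimal sketch, where READ_AHEAD_AREA and
+try_linear_read_ahead() are hypothetical names used only for this
+illustration:
+
+    if (offset % READ_AHEAD_AREA == 0
+        || (offset + 1) % READ_AHEAD_AREA == 0) {
+        try_linear_read_ahead(space, offset);
+    }
+
+The real logic, including the check that the other pages of the area
+were accessed in ascending or descending order, is in buf0rea.c. */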
+
+#ifndef UNIV_HOTBACKUP
+/** Value in microseconds */
+static const int WAIT_FOR_READ = 5000;
+/** Number of attempts made to read in a page in the buffer pool */
+static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
+
+/** The buffer buf_pool of the database */
+UNIV_INTERN buf_pool_t* buf_pool = NULL;
+
+/** mutex protecting the buffer pool struct and control blocks, except the
+read-write lock in them */
+UNIV_INTERN mutex_t buf_pool_mutex;
+UNIV_INTERN mutex_t LRU_list_mutex;
+UNIV_INTERN mutex_t flush_list_mutex;
+UNIV_INTERN rw_lock_t page_hash_latch;
+UNIV_INTERN mutex_t free_list_mutex;
+UNIV_INTERN mutex_t zip_free_mutex;
+UNIV_INTERN mutex_t zip_hash_mutex;
+/** mutex protecting the control blocks of compressed-only pages
+(of type buf_page_t, not buf_block_t) */
+UNIV_INTERN mutex_t buf_pool_zip_mutex;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+static ulint buf_dbg_counter = 0; /*!< This is used to insert validation
+ operations in execution in the
+ debug version */
+/** Flag to forbid the release of the buffer pool mutex.
+Protected by buf_pool_mutex. */
+UNIV_INTERN ulint buf_pool_mutex_exit_forbidden = 0;
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/** If this is set TRUE, the program prints info whenever
+read-ahead or flush occurs */
+UNIV_INTERN ibool buf_debug_prints = FALSE;
+#endif /* UNIV_DEBUG */
+
+/* Buffer pool shared memory segment information */
+typedef struct buf_shm_info_struct buf_shm_info_t;
+
+struct buf_shm_info_struct {
+ char head_str[8];
+ ulint binary_id;
+ ibool is_new; /* during initializing */
+ ibool clean; /* cleanly shut down and freed */
+ ibool reusable; /* reusable */
+ ulint buf_pool_size; /* backup value */
+ ulint page_size; /* backup value */
+ ulint frame_offset; /* offset of the first frame based on chunk->mem */
+ ulint zip_hash_offset;
+ ulint zip_hash_n;
+
+ ulint checksum;
+
+ buf_pool_t buf_pool_backup;
+ buf_chunk_t chunk_backup;
+
+ ib_uint64_t dummy;
+};
+
+#define BUF_SHM_INFO_HEAD "XTRA_SHM"
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value on
+32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x
+ ..._ARCH_LOG_NO, are written outside the buffer pool to the first
+ pages of data files, we have to skip them in the page checksum
+ calculation.
+ We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the
+ checksum is stored, and also the last 8 bytes of page because
+ there we store the old formula checksum. */
+
+ checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
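+/********************************************************************//**
+Calculates a page checksum like buf_calc_page_new_checksum(), but folds
+the bulk of the page starting at FIL_PAGE_DATA_ALIGN_32 with the 32-bit
+aligned routine ut_fold_binary_32(). The same fields are skipped as in
+buf_calc_page_new_checksum(). When srv_fast_checksum is set,
+buf_page_is_corrupted() accepts this value as well as the regular
+new-formula checksum.
+@return checksum */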
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum_32(
+/*==========================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ checksum = ut_fold_binary(page + FIL_PAGE_OFFSET,
+ FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET)
+ + ut_fold_binary(page + FIL_PAGE_DATA,
+ FIL_PAGE_DATA_ALIGN_32 - FIL_PAGE_DATA)
+ + ut_fold_binary_32(page + FIL_PAGE_DATA_ALIGN_32,
+ UNIV_PAGE_SIZE - FIL_PAGE_DATA_ALIGN_32
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page) /*!< in: buffer page */
+{
+ ulint checksum;
+
+ checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
+
+ checksum = checksum & 0xFFFFFFFFUL;
+
+ return(checksum);
+}
+
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size) /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+{
+ ulint checksum_field;
+ ulint old_checksum_field;
+
+ if (UNIV_LIKELY(!zip_size)
+ && memcmp(read_buf + FIL_PAGE_LSN + 4,
+ read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+
+ /* Stored log sequence numbers at the start and the end
+ of page do not match */
+
+ return(TRUE);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (recv_lsn_checks_on) {
+ ib_uint64_t current_lsn;
+
+ if (log_peek_lsn(&current_lsn)
+ && current_lsn < mach_read_ull(read_buf + FIL_PAGE_LSN)) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: page %lu log sequence number"
+ " %llu\n"
+ "InnoDB: is in the future! Current system "
+ "log sequence number %llu.\n"
+ "InnoDB: Your database may be corrupt or "
+ "you may have copied the InnoDB\n"
+ "InnoDB: tablespace but not the InnoDB "
+ "log files. See\n"
+ "InnoDB: " REFMAN "forcing-recovery.html\n"
+ "InnoDB: for more information.\n",
+ (ulong) mach_read_from_4(read_buf
+ + FIL_PAGE_OFFSET),
+ mach_read_ull(read_buf + FIL_PAGE_LSN),
+ current_lsn);
+ }
+ }
+#endif
+
+ /* If checksum validation is in use, make an additional check before
+ returning TRUE to ensure that the checksum is not equal to
+ BUF_NO_CHECKSUM_MAGIC, which might be stored by InnoDB with checksums
+ disabled. Otherwise, skip the checksum calculation and return FALSE */
+
+ if (UNIV_LIKELY(srv_use_checksums)) {
+ checksum_field = mach_read_from_4(read_buf
+ + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ return(checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != page_zip_calc_checksum(read_buf, zip_size));
+ }
+
+ old_checksum_field = mach_read_from_4(
+ read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ /* There are 2 valid formulas for old_checksum_field:
+
+ 1. Very old versions of InnoDB only stored 8 byte lsn to the
+ start and the end of the page.
+
+ 2. Newer InnoDB versions store the old formula checksum
+ there. */
+
+ if (old_checksum_field != mach_read_from_4(read_buf
+ + FIL_PAGE_LSN)
+ && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && old_checksum_field
+ != buf_calc_page_old_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+
+ /* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+ (always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
+
+ if (!srv_fast_checksum
+ && checksum_field != 0
+ && checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+
+ if (srv_fast_checksum
+ && checksum_field != 0
+ && checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != buf_calc_page_new_checksum_32(read_buf)
+ && checksum_field
+ != buf_calc_page_new_checksum(read_buf)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
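+
+/* Example (illustration only): a minimal sketch of how an I/O completion
+path might use the function above, assuming frame points to the page just
+read and zip_size is its compressed size, or 0 for uncompressed pages:
+
+    if (buf_page_is_corrupted(frame, zip_size)) {
+        buf_page_print(frame, zip_size);
+    }
+
+after which the caller would typically refuse to use the page or force
+crash recovery, depending on srv_force_recovery. buf_page_print() is
+defined next. */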
+
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size) /*!< in: compressed page size, or
+ 0 for uncompressed pages */
+{
+#ifndef UNIV_HOTBACKUP
+ dict_index_t* index;
+#endif /* !UNIV_HOTBACKUP */
+ ulint checksum;
+ ulint checksum_32;
+ ulint old_checksum;
+ ulint size = zip_size;
+
+ if (!size) {
+ size = UNIV_PAGE_SIZE;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+ (ulong) size);
+ ut_print_buf(stderr, read_buf, size);
+ fputs("\nInnoDB: End of page dump\n", stderr);
+
+ if (zip_size) {
+ /* Print compressed page. */
+
+ switch (fil_page_get_type(read_buf)) {
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ checksum = srv_use_checksums
+ ? page_zip_calc_checksum(read_buf, zip_size)
+ : BUF_NO_CHECKSUM_MAGIC;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Compressed BLOB page"
+ " checksum %lu, stored %lu\n"
+ "InnoDB: Page lsn %lu %lu\n"
+ "InnoDB: Page number (if stored"
+ " to page already) %lu,\n"
+ "InnoDB: space id (if stored"
+ " to page already) %lu\n",
+ (ulong) checksum,
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_LSN),
+ (ulong) mach_read_from_4(
+ read_buf + (FIL_PAGE_LSN + 4)),
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET),
+ (ulong) mach_read_from_4(
+ read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ return;
+ default:
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unknown page type %lu,"
+ " assuming FIL_PAGE_INDEX\n",
+ fil_page_get_type(read_buf));
+ /* fall through */
+ case FIL_PAGE_INDEX:
+ checksum = srv_use_checksums
+ ? page_zip_calc_checksum(read_buf, zip_size)
+ : BUF_NO_CHECKSUM_MAGIC;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Compressed page checksum %lu,"
+ " stored %lu\n"
+ "InnoDB: Page lsn %lu %lu\n"
+ "InnoDB: Page number (if stored"
+ " to page already) %lu,\n"
+ "InnoDB: space id (if stored"
+ " to page already) %lu\n",
+ (ulong) checksum,
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_LSN),
+ (ulong) mach_read_from_4(
+ read_buf + (FIL_PAGE_LSN + 4)),
+ (ulong) mach_read_from_4(
+ read_buf + FIL_PAGE_OFFSET),
+ (ulong) mach_read_from_4(
+ read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+ return;
+ case FIL_PAGE_TYPE_XDES:
+ /* This is an uncompressed page. */
+ break;
+ }
+ }
+
+ checksum = srv_use_checksums
+ ? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
+ checksum_32 = srv_use_checksums
+ ? buf_calc_page_new_checksum_32(read_buf) : BUF_NO_CHECKSUM_MAGIC;
+ old_checksum = srv_use_checksums
+ ? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Page checksum %lu (32bit_calc: %lu), prior-to-4.0.14-form"
+ " checksum %lu\n"
+ "InnoDB: stored checksum %lu, prior-to-4.0.14-form"
+ " stored checksum %lu\n"
+ "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
+ " at page end %lu\n"
+ "InnoDB: Page number (if stored to page already) %lu,\n"
+ "InnoDB: space id (if created with >= MySQL-4.1.1"
+ " and stored already) %lu\n",
+ (ulong) checksum, (ulong) checksum_32, (ulong) old_checksum,
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+ (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM),
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+ (ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ (ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+ (ulong) mach_read_from_4(read_buf
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+
+#ifndef UNIV_HOTBACKUP
+ if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT) {
+ fprintf(stderr,
+ "InnoDB: Page may be an insert undo log page\n");
+ } else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_UPDATE) {
+ fprintf(stderr,
+ "InnoDB: Page may be an update undo log page\n");
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ switch (fil_page_get_type(read_buf)) {
+ case FIL_PAGE_INDEX:
+ fprintf(stderr,
+ "InnoDB: Page may be an index page where"
+ " index id is %lu %lu\n",
+ (ulong) ut_dulint_get_high(
+ btr_page_get_index_id(read_buf)),
+ (ulong) ut_dulint_get_low(
+ btr_page_get_index_id(read_buf)));
+#ifndef UNIV_HOTBACKUP
+ index = dict_index_find_on_id_low(
+ btr_page_get_index_id(read_buf));
+ if (index) {
+ fputs("InnoDB: (", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs(")\n", stderr);
+ }
+#endif /* !UNIV_HOTBACKUP */
+ break;
+ case FIL_PAGE_INODE:
+ fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+ break;
+ case FIL_PAGE_IBUF_FREE_LIST:
+ fputs("InnoDB: Page may be an insert buffer free list page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ fputs("InnoDB: Page may be a freshly allocated page\n",
+ stderr);
+ break;
+ case FIL_PAGE_IBUF_BITMAP:
+ fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_SYS:
+ fputs("InnoDB: Page may be a system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_TRX_SYS:
+ fputs("InnoDB: Page may be a transaction system page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ fputs("InnoDB: Page may be a file space header page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_XDES:
+ fputs("InnoDB: Page may be an extent descriptor page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_BLOB:
+ fputs("InnoDB: Page may be a BLOB page\n",
+ stderr);
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ fputs("InnoDB: Page may be a compressed BLOB page\n",
+ stderr);
+ break;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Initializes a buffer control block when the buf_pool is created. */
+static
+void
+buf_block_init(
+/*===========*/
+ buf_block_t* block, /*!< in: pointer to control block */
+ byte* frame) /*!< in: pointer to buffer frame */
+{
+ UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);
+
+ block->frame = frame;
+
+ block->page.state = BUF_BLOCK_NOT_USED;
+ block->page.buf_fix_count = 0;
+ block->page.io_fix = BUF_IO_NONE;
+
+ block->modify_clock = 0;
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+
+ block->check_index_page_at_flush = FALSE;
+ block->index = NULL;
+
+#ifdef UNIV_DEBUG
+ block->page.in_page_hash = FALSE;
+ block->page.in_zip_hash = FALSE;
+ block->page.in_flush_list = FALSE;
+ block->page.in_free_list = FALSE;
+#endif /* UNIV_DEBUG */
+ block->page.in_LRU_list = FALSE;
+ block->in_unzip_LRU_list = FALSE;
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ page_zip_des_init(&block->page.zip);
+
+ mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+
+ rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
+ ut_ad(rw_lock_validate(&(block->lock)));
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+static
+void
+buf_block_reuse(
+/*============*/
+ buf_block_t* block,
+ ptrdiff_t frame_offset)
+{
+ /* block_init */
+ block->frame += frame_offset;
+
+ UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block);
+
+ block->index = NULL;
+
+#ifdef UNIV_DEBUG
+ /* recreate later */
+ block->page.in_page_hash = FALSE;
+ block->page.in_zip_hash = FALSE;
+#endif /* UNIV_DEBUG */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ block->n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ if (block->page.zip.data)
+ block->page.zip.data += frame_offset;
+
+ block->is_hashed = FALSE;
+
+ mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+
+ rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
+ ut_ad(rw_lock_validate(&(block->lock)));
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/********************************************************************//**
+Allocates a chunk of buffer frames.
+@return chunk, or NULL on failure */
+static
+buf_chunk_t*
+buf_chunk_init(
+/*===========*/
+ buf_chunk_t* chunk, /*!< out: chunk of buffers */
+ ulint mem_size) /*!< in: requested size in bytes */
+{
+ buf_block_t* block;
+ byte* frame;
+ ulint zip_hash_n = 0;
+ ulint zip_hash_mem_size = 0;
+ hash_table_t* zip_hash_tmp = NULL;
+ ulint i;
+ buf_shm_info_t* shm_info = NULL;
+
+ /* Round down to a multiple of page size,
+ although it already should be. */
+ mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
+
+ srv_buffer_pool_shm_is_reused = FALSE;
+
+ if (srv_buffer_pool_shm_key) {
+ /* zip_hash size */
+ zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2;
+ zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n)
+ + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+ }
+
+ /* Reserve space for the block descriptors. */
+ mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
+ + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+ if (srv_buffer_pool_shm_key) {
+ mem_size += ut_2pow_round(sizeof(buf_shm_info_t)
+ + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+ mem_size += zip_hash_mem_size;
+ }
+
+ chunk->mem_size = mem_size;
+
+ if (srv_buffer_pool_shm_key) {
+ ulint binary_id;
+ ibool is_new;
+
+ ut_a(buf_pool->n_chunks == 1);
+
+ fprintf(stderr,
+ "InnoDB: Warning: The innodb_buffer_pool_shm_key option has been specified.\n"
+ "InnoDB: Do not change the following between restarts of the server while this option is being used:\n"
+ "InnoDB: * the mysqld executable between restarts of the server.\n"
+ "InnoDB: * the value of innodb_buffer_pool_size.\n"
+ "InnoDB: * the value of innodb_page_size.\n"
+ "InnoDB: * datafiles created by InnoDB during this session.\n"
+ "InnoDB: Otherwise, data corruption in datafiles may result.\n");
+
+ /* FIXME: This is still a vague id */
+ binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get)
+ + (ulint) ((byte*)os_get_os_version - (byte*)buf_calc_page_new_checksum)
+ + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal)
+ + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str)
+ + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version)
+ + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low)
+ + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func)
+ + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread)
+ + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside)
+ + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity)
+ + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup);
+
+ chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new);
+
+ if (UNIV_UNLIKELY(chunk->mem == NULL)) {
+ return(NULL);
+ }
+init_again:
+#ifdef UNIV_SET_MEM_TO_ZERO
+ if (is_new) {
+ memset(chunk->mem, '\0', chunk->mem_size);
+ }
+#endif
+ /* for ut_fold_binary_32(), these values should be 32-bit aligned */
+ ut_a(sizeof(buf_shm_info_t) % 4 == 0);
+ ut_a((ulint)chunk->mem % 4 == 0);
+ ut_a(chunk->mem_size % 4 == 0);
+
+ shm_info = chunk->mem;
+
+ zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size);
+
+ if (is_new) {
+ strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8);
+ shm_info->binary_id = binary_id;
+ shm_info->is_new = TRUE; /* changed to FALSE when the initialization is finished */
+ shm_info->clean = FALSE; /* changed to TRUE when the segment is freed. */
+ shm_info->reusable = FALSE; /* changed to TRUE when validation is finished. */
+ shm_info->buf_pool_size = srv_buf_pool_size;
+ shm_info->page_size = srv_page_size;
+ shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size;
+ shm_info->zip_hash_n = zip_hash_n;
+ } else {
+ ulint checksum;
+
+ if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) {
+ fprintf(stderr,
+ "InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n");
+ return(NULL);
+ }
+ if (shm_info->binary_id != binary_id) {
+ fprintf(stderr,
+ "InnoDB: Error: The shared memory segment seems not to be for this binary.\n");
+ return(NULL);
+ }
+ if (shm_info->is_new) {
+ fprintf(stderr,
+ "InnoDB: Error: The shared memory was not initialized yet.\n");
+ return(NULL);
+ }
+ if (shm_info->buf_pool_size != srv_buf_pool_size) {
+ fprintf(stderr,
+ "InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n",
+ shm_info->buf_pool_size, srv_buf_pool_size);
+ return(NULL);
+ }
+ if (shm_info->page_size != srv_page_size) {
+ fprintf(stderr,
+ "InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n",
+ shm_info->page_size, srv_page_size);
+ return(NULL);
+ }
+ if (!shm_info->reusable) {
+ fprintf(stderr,
+ "InnoDB: Warning: The shared memory has unrecoverable contents.\n"
+ "InnoDB: The shared memory segment is initialized.\n");
+ is_new = TRUE;
+ goto init_again;
+ }
+ if (!shm_info->clean) {
+ fprintf(stderr,
+ "InnoDB: Warning: The shared memory was not shut down cleanly.\n"
+ "InnoDB: The shared memory segment is initialized.\n");
+ is_new = TRUE;
+ goto init_again;
+ }
+
+ ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size);
+ ut_a(shm_info->zip_hash_n == zip_hash_n);
+
+ /* check checksum */
+ if (srv_buffer_pool_shm_checksum) {
+ checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
+ chunk->mem_size - sizeof(buf_shm_info_t));
+ } else {
+ checksum = BUF_NO_CHECKSUM_MAGIC;
+ }
+
+ if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC
+ && shm_info->checksum != checksum) {
+ fprintf(stderr,
+ "InnoDB: Error: checksum of the shared memory is not match. "
+ "(stored=%lu calculated=%lu)\n",
+ shm_info->checksum, checksum);
+ return(NULL);
+ }
+
+ /* flag to use the segment. */
+ shm_info->clean = FALSE; /* changed to TRUE when the segment is freed. */
+ }
+
+ /* init zip_hash contents */
+ if (is_new) {
+ hash_create_init(zip_hash_tmp, zip_hash_n);
+ } else {
+ /* adjust offset is done later */
+ hash_create_reuse(zip_hash_tmp);
+
+ srv_buffer_pool_shm_is_reused = TRUE;
+ }
+ } else {
+ chunk->mem = os_mem_alloc_large(&chunk->mem_size);
+
+ if (UNIV_UNLIKELY(chunk->mem == NULL)) {
+
+ return(NULL);
+ }
+ }
+
+ /* Allocate the block descriptors from
+ the start of the memory block. */
+ if (srv_buffer_pool_shm_key) {
+ chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t));
+ } else {
+ chunk->blocks = chunk->mem;
+ }
+
+ /* Align a pointer to the first frame. Note that when
+ os_large_page_size is smaller than UNIV_PAGE_SIZE,
+ we may allocate one fewer block than requested. When
+ it is bigger, we may allocate more blocks than requested. */
+
+ frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
+ if (srv_buffer_pool_shm_key) {
+ /* reserve zip_hash space and always subtract 1 for reproducibility */
+ chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1;
+ } else {
+ chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
+ - (frame != chunk->mem);
+ }
+
+ /* Subtract the space needed for block descriptors. */
+ {
+ ulint size = chunk->size;
+
+ while (frame < (byte*) (chunk->blocks + size)) {
+ frame += UNIV_PAGE_SIZE;
+ size--;
+ }
+
+ chunk->size = size;
+ }
+
+ if (shm_info && !(shm_info->is_new)) {
+ /* convert the shared memory segment for reuse */
+ ptrdiff_t phys_offset;
+ ptrdiff_t logi_offset;
+ ptrdiff_t blocks_offset;
+ byte* previous_frame_address;
+
+ if (chunk->size < shm_info->chunk_backup.size) {
+ fprintf(stderr,
+ "InnoDB: Error: The buffer pool became smaller because of allocated address.\n"
+ "InnoDB: Retrying may avoid this situation.\n");
+ shm_info->clean = TRUE; /* release the flag for retrying */
+ return(NULL);
+ }
+
+ chunk->size = shm_info->chunk_backup.size;
+ phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset);
+ logi_offset = frame - chunk->blocks[0].frame;
+ previous_frame_address = chunk->blocks[0].frame;
+ blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks;
+
+ if (phys_offset || logi_offset || blocks_offset) {
+ fprintf(stderr,
+ "InnoDB: Buffer pool in the shared memory segment should be converted.\n"
+ "InnoDB: Previous frames in address : %p\n"
+ "InnoDB: Previous frames were located : %p\n"
+ "InnoDB: Current frames should be located: %p\n"
+ "InnoDB: Pysical offset : %ld (%#lx)\n"
+ "InnoDB: Logical offset (frames) : %ld (%#lx)\n"
+ "InnoDB: Logical offset (blocks) : %ld (%#lx)\n",
+ (byte*)chunk->mem + shm_info->frame_offset,
+ chunk->blocks[0].frame, frame,
+ (long) phys_offset, (ulong) phys_offset, (long) logi_offset, (ulong) logi_offset,
+ (long) blocks_offset, (ulong) blocks_offset);
+ } else {
+ fprintf(stderr,
+ "InnoDB: Buffer pool in the shared memory segment can be used as it is.\n");
+ }
+
+ if (phys_offset) {
+ fprintf(stderr,
+ "InnoDB: Aligning physical offset...");
+
+ memmove(frame, (byte*)chunk->mem + shm_info->frame_offset,
+ chunk->size * UNIV_PAGE_SIZE);
+
+ fprintf(stderr,
+ " Done.\n");
+ }
+
+ /* buf_block_t */
+ block = chunk->blocks;
+ for (i = chunk->size; i--; ) {
+ buf_block_reuse(block, logi_offset);
+ block++;
+ }
+
+ if (logi_offset || blocks_offset) {
+ fprintf(stderr,
+ "InnoDB: Aligning logical offset...");
+
+
+ /* buf_pool_t buf_pool_backup */
+ UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list,
+ previous_frame_address, logi_offset, blocks_offset);
+ UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free,
+ previous_frame_address, logi_offset, blocks_offset);
+ UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU,
+ previous_frame_address, logi_offset, blocks_offset);
+ if (shm_info->buf_pool_backup.LRU_old)
+ shm_info->buf_pool_backup.LRU_old =
+ (buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old)
+ + (((byte*)shm_info->buf_pool_backup.LRU_old > previous_frame_address)
+ ? logi_offset : blocks_offset));
+
+ UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU,
+ previous_frame_address, logi_offset, blocks_offset);
+
+ UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean,
+ previous_frame_address, logi_offset, blocks_offset);
+ for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
+ UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i],
+ previous_frame_address, logi_offset, blocks_offset);
+ }
+
+ HASH_OFFSET(zip_hash_tmp, buf_page_t, hash,
+ previous_frame_address, logi_offset, blocks_offset);
+
+ fprintf(stderr,
+ " Done.\n");
+ }
+ } else {
+ /* Init block structs and assign frames for them. Then we
+ assign the frames to the first blocks (we already mapped the
+ memory above). */
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; ) {
+
+ buf_block_init(block, frame);
+
+#ifdef HAVE_valgrind
+ /* Wipe contents of frame to eliminate a Valgrind warning */
+ memset(block->frame, '\0', UNIV_PAGE_SIZE);
+#endif
+ /* Add the block to the free list */
+ mutex_enter(&free_list_mutex);
+ UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page));
+ ut_d(block->page.in_free_list = TRUE);
+ mutex_exit(&free_list_mutex);
+
+ block++;
+ frame += UNIV_PAGE_SIZE;
+ }
+ }
+
+ if (shm_info) {
+ shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem;
+ }
+
+ return(chunk);
+}
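+
+/* Example (illustration only): when srv_buffer_pool_shm_key is set, the
+single chunk initialized above is laid out in the shared memory segment
+roughly as
+
+    [buf_shm_info_t][block descriptors][aligned page frames][zip_hash]
+
+with chunk->blocks starting right after the buf_shm_info_t header, the
+first frame aligned up to UNIV_PAGE_SIZE, and the zip_hash table
+occupying zip_hash_mem_size bytes at shm_info->zip_hash_offset, i.e. at
+the very end of the segment. */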
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the given buffer chunk that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+static
+buf_block_t*
+buf_chunk_contains_zip(
+/*===================*/
+ buf_chunk_t* chunk, /*!< in: chunk being checked */
+ const void* data) /*!< in: pointer to compressed page */
+{
+ buf_block_t* block;
+ ulint i;
+
+ ut_ad(buf_pool);
+ //ut_ad(buf_pool_mutex_own());
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+ if (block->page.zip.data == data) {
+
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+ const void* data) /*!< in: pointer to compressed page */
+{
+ ulint n;
+ buf_chunk_t* chunk = buf_pool->chunks;
+
+ for (n = buf_pool->n_chunks; n--; chunk++) {
+ buf_block_t* block = buf_chunk_contains_zip(chunk, data);
+
+ if (block) {
+ return(block);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks that all file pages in the buffer chunk are in a replaceable state.
+@return address of a non-free block, or NULL if all freed */
+static
+const buf_block_t*
+buf_chunk_not_freed(
+/*================*/
+ buf_chunk_t* chunk) /*!< in: chunk being checked */
+{
+ buf_block_t* block;
+ ulint i;
+
+ ut_ad(buf_pool);
+ //ut_ad(buf_pool_mutex_own()); /*optimistic...*/
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+ ibool ready;
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* The uncompressed buffer pool should never
+ contain compressed block descriptors. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* Skip blocks that are not being used for
+ file pages. */
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ mutex_enter(&block->mutex);
+ ready = buf_flush_ready_for_replace(&block->page);
+ mutex_exit(&block->mutex);
+
+ if (block->page.is_corrupt) {
+ /* corrupt page may remain, it can be skipped */
+ break;
+ }
+
+ if (!ready) {
+
+ return(block);
+ }
+
+ break;
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks that all blocks in the buffer chunk are in BUF_BLOCK_NOT_USED state.
+@return TRUE if all freed */
+static
+ibool
+buf_chunk_all_free(
+/*===============*/
+ const buf_chunk_t* chunk) /*!< in: chunk being checked */
+{
+ const buf_block_t* block;
+ ulint i;
+
+ ut_ad(buf_pool);
+ ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */
+
+ block = chunk->blocks;
+
+ for (i = chunk->size; i--; block++) {
+
+ if (buf_block_get_state(block) != BUF_BLOCK_NOT_USED) {
+
+ return(FALSE);
+ }
+ }
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Frees a chunk of buffer frames. */
+static
+void
+buf_chunk_free(
+/*===========*/
+ buf_chunk_t* chunk) /*!< out: chunk of buffers */
+{
+ buf_block_t* block;
+ const buf_block_t* block_end;
+
+ ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */
+
+ block_end = chunk->blocks + chunk->size;
+
+ for (block = chunk->blocks; block < block_end; block++) {
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED);
+ ut_a(!block->page.zip.data);
+
+ ut_ad(!block->page.in_LRU_list);
+ ut_ad(!block->in_unzip_LRU_list);
+ ut_ad(!block->page.in_flush_list);
+ /* Remove the block from the free list. */
+ mutex_enter(&free_list_mutex);
+ ut_ad(block->page.in_free_list);
+ UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
+ mutex_exit(&free_list_mutex);
+
+ /* Free the latches. */
+ mutex_free(&block->mutex);
+ rw_lock_free(&block->lock);
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_free(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+ UNIV_MEM_UNDESC(block);
+ }
+
+ ut_a(!srv_buffer_pool_shm_key);
+
+ os_mem_free_large(chunk->mem, chunk->mem_size);
+}
+
+/********************************************************************//**
+Creates the buffer pool.
+@return own: buf_pool object, NULL if not enough memory or error */
+UNIV_INTERN
+buf_pool_t*
+buf_pool_init(void)
+/*===============*/
+{
+ buf_chunk_t* chunk;
+ ulint i;
+
+ buf_pool = mem_zalloc(sizeof(buf_pool_t));
+
+ /* 1. Initialize general fields
+ ------------------------------- */
+ mutex_create(&buf_pool_mutex, SYNC_BUF_POOL);
+ mutex_create(&LRU_list_mutex, SYNC_BUF_LRU_LIST);
+ mutex_create(&flush_list_mutex, SYNC_BUF_FLUSH_LIST);
+ rw_lock_create(&page_hash_latch, SYNC_BUF_PAGE_HASH);
+ mutex_create(&free_list_mutex, SYNC_BUF_FREE_LIST);
+ mutex_create(&zip_free_mutex, SYNC_BUF_ZIP_FREE);
+ mutex_create(&zip_hash_mutex, SYNC_BUF_ZIP_HASH);
+
+ mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK);
+
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ buf_pool_mutex_enter();
+
+ buf_pool->n_chunks = 1;
+ buf_pool->chunks = chunk = mem_alloc(sizeof *chunk);
+
+ UT_LIST_INIT(buf_pool->free);
+
+ if (!buf_chunk_init(chunk, srv_buf_pool_size)) {
+ mem_free(chunk);
+ mem_free(buf_pool);
+ buf_pool = NULL;
+ return(NULL);
+ }
+
+ srv_buf_pool_old_size = srv_buf_pool_size;
+ buf_pool->curr_size = chunk->size;
+ srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+
+ buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
+ /* zip_hash is allocated to shm when srv_buffer_pool_shm_key is enabled */
+ if (!srv_buffer_pool_shm_key) {
+ buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
+ }
+
+ buf_pool->last_printout_time = time(NULL);
+
+ /* 2. Initialize flushing fields
+ -------------------------------- */
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+ buf_pool->no_flush[i] = os_event_create(NULL);
+ }
+
+ /* 3. Initialize LRU fields
+ --------------------------- */
+ /* All fields are initialized by mem_zalloc(). */
+
+ if (srv_buffer_pool_shm_key) {
+ buf_shm_info_t* shm_info;
+
+ ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
+ shm_info = chunk->mem;
+
+ buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset);
+
+ if(shm_info->is_new) {
+ shm_info->is_new = FALSE; /* initialization was finished */
+ } else {
+ buf_block_t* block = chunk->blocks;
+ buf_page_t* b;
+
+ /* shm_info->buf_pool_backup has already been converted */
+ /* at buf_chunk_init(), so it can simply be copied. */
+ buf_pool->flush_list = shm_info->buf_pool_backup.flush_list;
+ buf_pool->freed_page_clock = shm_info->buf_pool_backup.freed_page_clock;
+ buf_pool->free = shm_info->buf_pool_backup.free;
+ buf_pool->LRU = shm_info->buf_pool_backup.LRU;
+ buf_pool->LRU_old = shm_info->buf_pool_backup.LRU_old;
+ buf_pool->LRU_old_len = shm_info->buf_pool_backup.LRU_old_len;
+ buf_pool->unzip_LRU = shm_info->buf_pool_backup.unzip_LRU;
+ buf_pool->zip_clean = shm_info->buf_pool_backup.zip_clean;
+ for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
+ buf_pool->zip_free[i] = shm_info->buf_pool_backup.zip_free[i];
+ }
+
+ for (i = 0; i < chunk->size; i++, block++) {
+ if (buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE) {
+ ut_d(block->page.in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ buf_page_address_fold(
+ block->page.space,
+ block->page.offset),
+ &block->page);
+ }
+ }
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(zip_list, b)) {
+ ut_ad(!b->in_flush_list);
+ ut_ad(b->in_LRU_list);
+
+ ut_d(b->in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ buf_page_address_fold(b->space, b->offset), b);
+ }
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(flush_list, b)) {
+ ut_ad(b->in_flush_list);
+ ut_ad(b->in_LRU_list);
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_d(b->in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ buf_page_address_fold(b->space,
+ b->offset), b);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ }
+
+ }
+ }
+
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ buf_pool_mutex_exit();
+
+ btr_search_sys_create(buf_pool->curr_size
+ * UNIV_PAGE_SIZE / sizeof(void*) / 64);
+
+ /* 4. Initialize the buddy allocator fields */
+ /* All fields are initialized by mem_zalloc(). */
+
+ return(buf_pool);
+}
+
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(void)
+/*===============*/
+{
+ buf_chunk_t* chunk;
+ buf_chunk_t* chunks;
+
+ if (srv_buffer_pool_shm_key) {
+ buf_shm_info_t* shm_info;
+
+ ut_a(buf_pool->n_chunks == 1);
+
+ chunk = buf_pool->chunks;
+ shm_info = chunk->mem;
+ ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
+
+ /* Validate that the shared memory segment does not have unrecoverable contents. */
+ /* Currently, this validation is no longer needed. */
+ shm_info->reusable = TRUE;
+
+ memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t));
+ memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t));
+
+ if (srv_fast_shutdown < 2) {
+ if (srv_buffer_pool_shm_checksum) {
+ shm_info->checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
+ chunk->mem_size - sizeof(buf_shm_info_t));
+ } else {
+ shm_info->checksum = BUF_NO_CHECKSUM_MAGIC;
+ }
+ shm_info->clean = TRUE;
+ }
+
+ os_shm_free(chunk->mem, chunk->mem_size);
+ } else {
+ chunks = buf_pool->chunks;
+ chunk = chunks + buf_pool->n_chunks;
+
+ while (--chunk >= chunks) {
+ /* Bypass the checks of buf_chunk_free(), since they
+ would fail at shutdown. */
+ os_mem_free_large(chunk->mem, chunk->mem_size);
+ }
+ }
+
+ mem_free(buf_pool->chunks);
+ hash_table_free(buf_pool->page_hash);
+ if (!srv_buffer_pool_shm_key) {
+ hash_table_free(buf_pool->zip_hash);
+ }
+ mem_free(buf_pool);
+ buf_pool = NULL;
+}
+
+/********************************************************************//**
+Drops the adaptive hash index. To prevent a livelock, this function
+is only to be called while holding btr_search_latch and while
+btr_search_enabled == FALSE. */
+UNIV_INTERN
+void
+buf_pool_drop_hash_index(void)
+/*==========================*/
+{
+ ibool released_search_latch;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(!btr_search_enabled);
+
+ do {
+ buf_chunk_t* chunks = buf_pool->chunks;
+ buf_chunk_t* chunk = chunks + buf_pool->n_chunks;
+
+ released_search_latch = FALSE;
+
+ while (--chunk >= chunks) {
+ buf_block_t* block = chunk->blocks;
+ ulint i = chunk->size;
+
+ for (; i--; block++) {
+ /* block->is_hashed cannot be modified
+ when we have an x-latch on btr_search_latch;
+ see the comment in buf0buf.h */
+
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE
+ || !block->is_hashed) {
+ continue;
+ }
+
+ /* To follow the latching order, we
+ have to release btr_search_latch
+ before acquiring block->latch. */
+ rw_lock_x_unlock(&btr_search_latch);
+ /* When we release the search latch,
+ we must rescan all blocks, because
+ some may become hashed again. */
+ released_search_latch = TRUE;
+
+ rw_lock_x_lock(&block->lock);
+
+ /* This should be guaranteed by the
+ callers, which will be holding
+ btr_search_enabled_mutex. */
+ ut_ad(!btr_search_enabled);
+
+ /* Because we did not buffer-fix the
+ block by calling buf_block_get_gen(),
+ it is possible that the block has been
+ allocated for some other use after
+ btr_search_latch was released above.
+ We do not care which file page the
+ block is mapped to. All we want to do
+ is to drop any hash entries referring
+ to the page. */
+
+ /* It is possible that
+ block->page.state != BUF_FILE_PAGE.
+ Even that does not matter, because
+ btr_search_drop_page_hash_index() will
+ check block->is_hashed before doing
+ anything. block->is_hashed can only
+ be set on uncompressed file pages. */
+
+ btr_search_drop_page_hash_index(block);
+
+ rw_lock_x_unlock(&block->lock);
+
+ rw_lock_x_lock(&btr_search_latch);
+
+ ut_ad(!btr_search_enabled);
+ }
+ }
+ } while (released_search_latch);
+}
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+ buf_page_t* bpage, /*!< in/out: control block being relocated;
+ buf_page_get_state(bpage) must be
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+ buf_page_t* dpage) /*!< in/out: destination control block */
+{
+ buf_page_t* b;
+ ulint fold;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+ ut_a(bpage->buf_fix_count == 0);
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_ad(bpage == buf_page_hash_get(bpage->space, bpage->offset));
+#ifdef UNIV_DEBUG
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_ZIP_PAGE:
+ break;
+ }
+#endif /* UNIV_DEBUG */
+
+ memcpy(dpage, bpage, sizeof *dpage);
+
+ bpage->in_LRU_list = FALSE;
+ ut_d(bpage->in_page_hash = FALSE);
+
+ /* relocate buf_pool->LRU */
+ b = UT_LIST_GET_PREV(LRU, bpage);
+ UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);
+ }
+
+ if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
+ buf_pool->LRU_old = dpage;
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+ } else {
+ /* Check that the "old" flag is consistent in
+ the block and its neighbours. */
+ buf_page_set_old(dpage, buf_page_is_old(dpage));
+#endif /* UNIV_LRU_DEBUG */
+ }
+
+ ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
+ ut_ad(ut_list_node_313->in_LRU_list)));
+
+ /* relocate buf_pool->page_hash */
+ fold = buf_page_address_fold(bpage->space, bpage->offset);
+
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
+}
+
+/********************************************************************//**
+Shrinks the buffer pool. */
+static
+void
+buf_pool_shrink(
+/*============*/
+ ulint chunk_size) /*!< in: number of pages to remove */
+{
+ buf_chunk_t* chunks;
+ buf_chunk_t* chunk;
+ ulint max_size;
+ ulint max_free_size;
+ buf_chunk_t* max_chunk;
+ buf_chunk_t* max_free_chunk;
+
+ ut_ad(!buf_pool_mutex_own());
+
+try_again:
+ btr_search_disable(); /* Empty the adaptive hash index again */
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ if (srv_buffer_pool_shm_key) {
+ /* Shrinking is not supported when the buffer pool is in shared memory */
+ goto func_done;
+ }
+
+shrink_again:
+ if (buf_pool->n_chunks <= 1) {
+
+ /* Cannot shrink if there is only one chunk */
+ goto func_done;
+ }
+
+ /* Search for the largest free chunk
+ not larger than the size difference */
+ chunks = buf_pool->chunks;
+ chunk = chunks + buf_pool->n_chunks;
+ max_size = max_free_size = 0;
+ max_chunk = max_free_chunk = NULL;
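+ /* max_chunk is the largest candidate chunk regardless of whether
+ its blocks are free; max_free_chunk is the largest candidate whose
+ blocks are all free and which can therefore be released at once. */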
+
+ while (--chunk >= chunks) {
+ if (chunk->size <= chunk_size
+ && chunk->size > max_free_size) {
+ if (chunk->size > max_size) {
+ max_size = chunk->size;
+ max_chunk = chunk;
+ }
+
+ if (buf_chunk_all_free(chunk)) {
+ max_free_size = chunk->size;
+ max_free_chunk = chunk;
+ }
+ }
+ }
+
+ if (!max_free_size) {
+
+ ulint dirty = 0;
+ ulint nonfree = 0;
+ buf_block_t* block;
+ buf_block_t* bend;
+
+ /* Cannot shrink: try again later
+ (do not assign srv_buf_pool_old_size) */
+ if (!max_chunk) {
+
+ goto func_exit;
+ }
+
+ block = max_chunk->blocks;
+ bend = block + max_chunk->size;
+
+ /* Move the blocks of chunk to the end of the
+ LRU list and try to flush them. */
+ for (; block < bend; block++) {
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_NOT_USED:
+ continue;
+ case BUF_BLOCK_FILE_PAGE:
+ break;
+ default:
+ nonfree++;
+ continue;
+ }
+
+ mutex_enter(&block->mutex);
+ /* The following calls will temporarily
+ release block->mutex and buf_pool_mutex.
+ Therefore, we have to always retry,
+ even if !dirty && !nonfree. */
+
+ if (!buf_flush_ready_for_replace(&block->page)) {
+
+ buf_LRU_make_block_old(&block->page);
+ dirty++;
+ } else if (buf_LRU_free_block(&block->page, TRUE, NULL, FALSE)
+ != BUF_LRU_FREED) {
+ nonfree++;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+
+ /* Request for a flush of the chunk if it helps.
+ Do not flush if there are non-free blocks, since
+ flushing will not make the chunk freeable. */
+ if (nonfree) {
+ /* Avoid busy-waiting. */
+ os_thread_sleep(100000);
+ } else if (dirty
+ && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0)
+ == ULINT_UNDEFINED) {
+
+ buf_flush_wait_batch_end(BUF_FLUSH_LRU);
+ }
+
+ goto try_again;
+ }
+
+ max_size = max_free_size;
+ max_chunk = max_free_chunk;
+
+ srv_buf_pool_old_size = srv_buf_pool_size;
+
+ /* Rewrite buf_pool->chunks. Copy everything but max_chunk. */
+ chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks);
+ memcpy(chunks, buf_pool->chunks,
+ (max_chunk - buf_pool->chunks) * sizeof *chunks);
+ memcpy(chunks + (max_chunk - buf_pool->chunks),
+ max_chunk + 1,
+ (buf_pool->chunks + buf_pool->n_chunks
+ - (max_chunk + 1)) * sizeof *chunks);
+ ut_a(buf_pool->curr_size > max_chunk->size);
+ buf_pool->curr_size -= max_chunk->size;
+ srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+ chunk_size -= max_chunk->size;
+ buf_chunk_free(max_chunk);
+ mem_free(buf_pool->chunks);
+ buf_pool->chunks = chunks;
+ buf_pool->n_chunks--;
+
+ /* Allow a slack of one megabyte. */
+ if (chunk_size > 1048576 / UNIV_PAGE_SIZE) {
+
+ goto shrink_again;
+ }
+
+func_done:
+ srv_buf_pool_old_size = srv_buf_pool_size;
+func_exit:
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ btr_search_enable();
+}
+
+/********************************************************************//**
+Rebuild buf_pool->page_hash. */
+static
+void
+buf_pool_page_hash_rebuild(void)
+/*============================*/
+{
+ ulint i;
+ ulint n_chunks;
+ buf_chunk_t* chunk;
+ hash_table_t* page_hash;
+ hash_table_t* zip_hash;
+ buf_page_t* b;
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ mutex_enter(&flush_list_mutex);
+
+
+ /* Free, create, and populate the hash table. */
+ hash_table_free(buf_pool->page_hash);
+ buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size);
+ zip_hash = hash_create(2 * buf_pool->curr_size);
+
+ HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash,
+ BUF_POOL_ZIP_FOLD_BPAGE);
+
+ hash_table_free(buf_pool->zip_hash);
+ buf_pool->zip_hash = zip_hash;
+
+ /* Insert the uncompressed file pages to buf_pool->page_hash. */
+
+ chunk = buf_pool->chunks;
+ n_chunks = buf_pool->n_chunks;
+
+ for (i = 0; i < n_chunks; i++, chunk++) {
+ ulint j;
+ buf_block_t* block = chunk->blocks;
+
+ for (j = 0; j < chunk->size; j++, block++) {
+ if (buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE) {
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(block->page.in_page_hash);
+
+ HASH_INSERT(buf_page_t, hash, page_hash,
+ buf_page_address_fold(
+ block->page.space,
+ block->page.offset),
+ &block->page);
+ }
+ }
+ }
+
+ /* Insert the compressed-only pages to buf_pool->page_hash.
+ All such blocks are either in buf_pool->zip_clean or
+ in buf_pool->flush_list. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(zip_list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ ut_ad(!b->in_flush_list);
+ ut_ad(b->in_LRU_list);
+ ut_ad(b->in_page_hash);
+ ut_ad(!b->in_zip_hash);
+
+ HASH_INSERT(buf_page_t, hash, page_hash,
+ buf_page_address_fold(b->space, b->offset), b);
+ }
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(flush_list, b)) {
+ ut_ad(b->in_flush_list);
+ ut_ad(b->in_LRU_list);
+ ut_ad(b->in_page_hash);
+ ut_ad(!b->in_zip_hash);
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ HASH_INSERT(buf_page_t, hash, page_hash,
+ buf_page_address_fold(b->space,
+ b->offset), b);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Resizes the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_resize(void)
+/*=================*/
+{
+ if (srv_buffer_pool_shm_key) {
+ /* Resizing is not supported when the buffer pool is in shared memory */
+ return;
+ }
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ if (srv_buf_pool_old_size == srv_buf_pool_size) {
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ return;
+ }
+
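+ /* A slack of one megabyte is allowed: the pool is enlarged only
+ when the target size exceeds the current size by more than 1 MB;
+ otherwise an attempt is made to shrink towards the target. */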
+ if (srv_buf_pool_curr_size + 1048576 > srv_buf_pool_size) {
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+
+ /* Disable adaptive hash indexes and empty the index
+ in order to free up memory in the buffer pool chunks. */
+ buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size)
+ / UNIV_PAGE_SIZE);
+ } else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) {
+
+ /* Enlarge the buffer pool by at least one megabyte */
+
+ ulint mem_size
+ = srv_buf_pool_size - srv_buf_pool_curr_size;
+ buf_chunk_t* chunks;
+ buf_chunk_t* chunk;
+
+ chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks);
+
+ memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks
+ * sizeof *chunks);
+
+ chunk = &chunks[buf_pool->n_chunks];
+
+ if (!buf_chunk_init(chunk, mem_size)) {
+ mem_free(chunks);
+ } else {
+ buf_pool->curr_size += chunk->size;
+ srv_buf_pool_curr_size = buf_pool->curr_size
+ * UNIV_PAGE_SIZE;
+ mem_free(buf_pool->chunks);
+ buf_pool->chunks = chunks;
+ buf_pool->n_chunks++;
+ }
+
+ srv_buf_pool_old_size = srv_buf_pool_size;
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ }
+
+ buf_pool_page_hash_rebuild();
+}
+
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+ buf_page_t* bpage) /*!< in: buffer block of a file page */
+{
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ ut_a(buf_page_in_file(bpage));
+
+ buf_LRU_make_block_young(bpage);
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+}
+
+/********************************************************************//**
+Sets the time of the first access of a page and moves a page to the
+start of the buffer pool LRU list if it is too old. This high-level
+function can be used to prevent an important page from slipping
+out of the buffer pool. */
+static
+void
+buf_page_set_accessed_make_young(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: buffer block of a
+ file page */
+ unsigned access_time) /*!< in: bpage->access_time
+ read under mutex protection,
+ or 0 if unknown */
+{
+ ut_ad(!buf_pool_mutex_own());
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_page_peek_if_too_old(bpage)) {
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ buf_LRU_make_block_young(bpage);
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ } else if (!access_time) {
+ ulint time_ms = ut_time_ms();
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+ //buf_pool_mutex_enter();
+ if (block_mutex) {
+ buf_page_set_accessed(bpage, time_ms);
+ mutex_exit(block_mutex);
+ }
+ //buf_pool_mutex_exit();
+ }
+}
+
+/********************************************************************//**
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+
+ block = (buf_block_t*) buf_page_hash_get(space, offset);
+
+ if (block && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE) {
+ block->check_index_page_at_flush = FALSE;
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+}
+
+/********************************************************************//**
+Returns the current state of is_hashed of a page. FALSE if the page is
+not in the pool. NOTE that this operation does not fix the page in the
+pool if it is found there.
+@return TRUE if page hash index is built in search system */
+UNIV_INTERN
+ibool
+buf_page_peek_if_search_hashed(
+/*===========================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+ ibool is_hashed;
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+
+ block = (buf_block_t*) buf_page_hash_get(space, offset);
+
+ if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+ is_hashed = FALSE;
+ } else {
+ is_hashed = block->is_hashed;
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ return(is_hashed);
+}
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+
+ bpage = buf_page_hash_get(space, offset);
+
+ if (bpage) {
+ bpage->file_page_was_freed = TRUE;
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ return(bpage);
+}
+
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when a previously freed file page is
+reallocated, so that the debug version stops flagging accesses to it.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+
+ bpage = buf_page_hash_get(space, offset);
+
+ if (bpage) {
+ bpage->file_page_was_freed = FALSE;
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ return(bpage);
+}
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return pointer to the block */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size */
+ ulint offset) /*!< in: page number */
+{
+ buf_page_t* bpage;
+ mutex_t* block_mutex;
+ ibool must_read;
+ unsigned access_time;
+ trx_t* trx = NULL;
+ ulint sec;
+ ulint ms;
+ ib_uint64_t start_time;
+ ib_uint64_t finish_time;
+
+#ifndef UNIV_LOG_DEBUG
+ ut_ad(!ibuf_inside());
+#endif
+ if (innobase_get_slow_log()) {
+ trx = innobase_get_trx();
+ }
+ buf_pool->stat.n_page_gets++;
+
+ for (;;) {
+ //buf_pool_mutex_enter();
+lookup:
+ rw_lock_s_lock(&page_hash_latch);
+ bpage = buf_page_hash_get(space, offset);
+ if (bpage) {
+ break;
+ }
+
+ /* Page not in buf_pool: needs to be read from file */
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ buf_read_page(space, zip_size, offset, trx);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 37 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ }
+
+ if (UNIV_UNLIKELY(!bpage->zip.data)) {
+ /* There is no compressed page. */
+err_exit:
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+ return(NULL);
+ }
+
+ if (srv_pass_corrupt_table) {
+ if (bpage->is_corrupt) {
+ rw_lock_s_unlock(&page_hash_latch);
+ return(NULL);
+ }
+ }
+ ut_a(!(bpage->is_corrupt));
+
+ block_mutex = buf_page_get_mutex_enter(bpage);
+
+ rw_lock_s_unlock(&page_hash_latch);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ case BUF_BLOCK_ZIP_FREE:
+ if (block_mutex) {
+ mutex_exit(block_mutex);
+ }
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_a(block_mutex == &buf_pool_zip_mutex);
+ bpage->buf_fix_count++;
+ goto got_block;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_a(block_mutex == &((buf_block_t*) bpage)->mutex);
+
+ /* Discard the uncompressed page frame if possible. */
+ if (buf_LRU_free_block(bpage, FALSE, NULL, FALSE)
+ == BUF_LRU_FREED) {
+
+ mutex_exit(block_mutex);
+ goto lookup;
+ }
+
+ buf_block_buf_fix_inc((buf_block_t*) bpage,
+ __FILE__, __LINE__);
+ goto got_block;
+ }
+
+ ut_error;
+ goto err_exit;
+
+got_block:
+ must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;
+ access_time = buf_page_is_accessed(bpage);
+
+ //buf_pool_mutex_exit();
+
+ mutex_exit(block_mutex);
+
+ buf_page_set_accessed_make_young(bpage, access_time);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ ut_a(!bpage->file_page_was_freed);
+#endif
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(bpage->buf_fix_count > 0);
+ ut_a(buf_page_in_file(bpage));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ if (must_read) {
+ /* Let us wait until the read operation
+ completes */
+
+ if (innobase_get_slow_log() && trx && trx->take_stats)
+ {
+ ut_usectime(&sec, &ms);
+ start_time = (ib_uint64_t)sec * 1000000 + ms;
+ } else {
+ start_time = 0;
+ }
+ for (;;) {
+ enum buf_io_fix io_fix;
+
+ mutex_enter(block_mutex);
+ io_fix = buf_page_get_io_fix(bpage);
+ mutex_exit(block_mutex);
+
+ if (io_fix == BUF_IO_READ) {
+
+ os_thread_sleep(WAIT_FOR_READ);
+ } else {
+ break;
+ }
+ }
+ if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+ {
+ ut_usectime(&sec, &ms);
+ finish_time = (ib_uint64_t)sec * 1000000 + ms;
+ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+ }
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_page_get_space(bpage),
+ buf_page_get_page_no(bpage)) == 0);
+#endif
+ return(bpage);
+}
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+ buf_block_t* block) /*!< in: block to init */
+{
+ block->check_index_page_at_flush = FALSE;
+ block->index = NULL;
+
+ block->n_hash_helps = 0;
+ block->is_hashed = FALSE;
+ block->n_fields = 1;
+ block->n_bytes = 0;
+ block->left_side = TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check) /*!< in: TRUE=verify the page checksum */
+{
+ const byte* frame = block->page.zip.data;
+ ulint stamp_checksum = mach_read_from_4(
+ frame + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ ut_ad(buf_block_get_zip_size(block));
+ ut_a(buf_block_get_space(block) != 0);
+
+ if (UNIV_LIKELY(check && stamp_checksum != BUF_NO_CHECKSUM_MAGIC)) {
+ ulint calc_checksum = page_zip_calc_checksum(
+ frame, page_zip_get_size(&block->page.zip));
+
+ if (UNIV_UNLIKELY(stamp_checksum != calc_checksum)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: compressed page checksum mismatch"
+ " (space %u page %u): %lu != %lu\n",
+ block->page.space, block->page.offset,
+ stamp_checksum, calc_checksum);
+ return(FALSE);
+ }
+ }
+
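+ /* Only B-tree index pages (FIL_PAGE_INDEX) are stored in
+ compressed form; the other known page types are kept as-is in the
+ compressed frame and are simply copied below. */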
+ switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ if (page_zip_decompress(&block->page.zip,
+ block->frame, TRUE)) {
+ return(TRUE);
+ }
+
+ fprintf(stderr,
+ "InnoDB: unable to decompress space %lu page %lu\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ return(FALSE);
+
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ /* Copy to uncompressed storage. */
+ memcpy(block->frame, frame,
+ buf_block_get_zip_size(block));
+ return(TRUE);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: unknown compressed page"
+ " type %lu\n",
+ fil_page_get_type(frame));
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block to whose frame the pointer is pointing to.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+ const byte* ptr) /*!< in: pointer to a frame */
+{
+ buf_chunk_t* chunk;
+ ulint i;
+
+ /* TODO: protect buf_pool->chunks with a mutex (it will
+ currently remain constant after buf_pool_init()) */
+ for (chunk = buf_pool->chunks, i = buf_pool->n_chunks; i--; chunk++) {
+ lint offs = ptr - chunk->blocks->frame;
+
+ if (UNIV_UNLIKELY(offs < 0)) {
+
+ continue;
+ }
+
+ offs >>= UNIV_PAGE_SIZE_SHIFT;
+
+ if (UNIV_LIKELY((ulint) offs < chunk->size)) {
+ buf_block_t* block = &chunk->blocks[offs];
+
+ /* The function buf_chunk_init() invokes
+ buf_block_init() so that block[n].frame ==
+ block->frame + n * UNIV_PAGE_SIZE. Check it. */
+ ut_ad(block->frame == page_align(ptr));
+#ifdef UNIV_DEBUG
+ /* A thread that updates these fields must
+ hold buf_pool_mutex and block->mutex. Acquire
+ only the latter. */
+ mutex_enter(&block->mutex);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* These types should only be used in
+ the compressed buffer pool, whose
+ memory is allocated from
+ buf_pool->chunks, in UNIV_PAGE_SIZE
+ blocks flagged as BUF_BLOCK_MEMORY. */
+ ut_error;
+ break;
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ /* Some data structures contain
+ "guess" pointers to file pages. The
+ file pages may have been freed and
+ reused. Do not complain. */
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ /* buf_LRU_block_remove_hashed_page()
+ will overwrite the FIL_PAGE_OFFSET and
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with
+ 0xff and set the state to
+ BUF_BLOCK_REMOVE_HASH. */
+ ut_ad(page_get_space_id(page_align(ptr))
+ == 0xffffffff);
+ ut_ad(page_get_page_no(page_align(ptr))
+ == 0xffffffff);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(block->page.space
+ == page_get_space_id(page_align(ptr)));
+ ut_ad(block->page.offset
+ == page_get_page_no(page_align(ptr)));
+ break;
+ }
+
+ mutex_exit(&block->mutex);
+#endif /* UNIV_DEBUG */
+
+ return(block);
+ }
+ }
+
+ /* The block should always be found. */
+ ut_error;
+ return(NULL);
+}
+
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it
+@return TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+ const void* ptr) /*!< in: pointer not
+ dereferenced */
+{
+ const buf_chunk_t* chunk = buf_pool->chunks;
+ const buf_chunk_t* const echunk = chunk + buf_pool->n_chunks;
+
+ /* TODO: protect buf_pool->chunks with a mutex (it will
+ currently remain constant after buf_pool_init()) */
+ while (chunk < echunk) {
+ if (ptr >= (void *)chunk->blocks
+ && ptr < (void *)(chunk->blocks + chunk->size)) {
+
+ return(TRUE);
+ }
+
+ chunk++;
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Find out if a buffer block was created by buf_chunk_init().
+@return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
+static
+ibool
+buf_block_is_uncompressed(
+/*======================*/
+ const buf_block_t* block) /*!< in: pointer to block,
+ not dereferenced */
+{
+ //ut_ad(buf_pool_mutex_own());
+
+ if (UNIV_UNLIKELY((((ulint) block) % sizeof *block) != 0)) {
+ /* The pointer should be aligned. */
+ return(FALSE);
+ }
+
+ return(buf_pointer_is_block_field((void *)block));
+}
+
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_block_t* guess, /*!< in: guessed block or NULL */
+ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+ BUF_GET_NO_LATCH */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_block_t* block;
+ unsigned access_time;
+ ulint fix_type;
+ ibool must_read;
+ ulint retries = 0;
+ mutex_t* block_mutex;
+ trx_t* trx = NULL;
+ ulint sec;
+ ulint ms;
+ ib_uint64_t start_time;
+ ib_uint64_t finish_time;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH)
+ || (rw_latch == RW_X_LATCH)
+ || (rw_latch == RW_NO_LATCH));
+ ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
+ ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
+ || (mode == BUF_GET_NO_LATCH));
+ ut_ad(zip_size == fil_space_get_zip_size(space));
+ ut_ad(ut_is_2pow(zip_size));
+#ifndef UNIV_LOG_DEBUG
+ ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset, NULL));
+#endif
+ if (innobase_get_slow_log()) {
+ trx = innobase_get_trx();
+ }
+ buf_pool->stat.n_page_gets++;
+loop:
+ block = guess;
+ //buf_pool_mutex_enter();
+
+ if (block) {
+ block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
+
+ /* If the guess is a compressed page descriptor that
+ has been allocated by buf_buddy_alloc(), it may have
+ been invalidated by buf_buddy_relocate(). In that
+ case, block could point to something that happens to
+ contain the expected bits in block->page. Similarly,
+ the guess may be pointing to a buffer pool chunk that
+ has been released when resizing the buffer pool. */
+
+ if (!block_mutex) {
+ block = guess = NULL;
+ } else if (!buf_block_is_uncompressed(block)
+ || offset != block->page.offset
+ || space != block->page.space
+ || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+
+ mutex_exit(block_mutex);
+
+ block = guess = NULL;
+ } else {
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(block->page.in_page_hash);
+ }
+ }
+
+ if (block == NULL) {
+ rw_lock_s_lock(&page_hash_latch);
+ block = (buf_block_t*) buf_page_hash_get(space, offset);
+ if (block) {
+ block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
+ ut_a(block_mutex);
+ }
+ rw_lock_s_unlock(&page_hash_latch);
+ }
+
+loop2:
+ if (block == NULL) {
+ /* Page not in buf_pool: needs to be read from file */
+
+ //buf_pool_mutex_exit();
+
+ if (mode == BUF_GET_IF_IN_POOL) {
+
+ return(NULL);
+ }
+
+ if (buf_read_page(space, zip_size, offset, trx)) {
+ retries = 0;
+ } else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+ ++retries;
+ } else {
+ fprintf(stderr, "InnoDB: Error: Unable"
+ " to read tablespace %lu page no"
+ " %lu into the buffer pool after"
+ " %lu attempts\n"
+ "InnoDB: The most probable cause"
+ " of this error may be that the"
+ " table has been corrupted.\n"
+ "InnoDB: You can try to fix this"
+ " problem by using"
+ " innodb_force_recovery.\n"
+ "InnoDB: Please see reference manual"
+ " for more details.\n"
+ "InnoDB: Aborting...\n",
+ space, offset,
+ BUF_PAGE_READ_MAX_RETRIES);
+
+ ut_error;
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 37 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ goto loop;
+ }
+
+ ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+
+ must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
+
+ if (must_read && mode == BUF_GET_IF_IN_POOL) {
+ /* The page is only being read to buffer */
+ //buf_pool_mutex_exit();
+ mutex_exit(block_mutex);
+
+ return(NULL);
+ }
+
+ if (srv_pass_corrupt_table) {
+ if (block->page.is_corrupt) {
+ mutex_exit(block_mutex);
+ return(NULL);
+ }
+ }
+ ut_a(!(block->page.is_corrupt));
+
+ switch (buf_block_get_state(block)) {
+ buf_page_t* bpage;
+ ibool success;
+
+ case BUF_BLOCK_FILE_PAGE:
+ if (block_mutex == &buf_pool_zip_mutex) {
+ /* We acquired the wrong mutex (the zip mutex); retry. */
+ mutex_exit(block_mutex);
+ goto loop;
+ }
+ break;
+
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_ad(block_mutex == &buf_pool_zip_mutex);
+ bpage = &block->page;
+ /* Protect bpage->buf_fix_count. */
+ /* Already protected here. */
+ //mutex_enter(&buf_pool_zip_mutex);
+
+ if (bpage->buf_fix_count
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* This condition often occurs when the buffer
+ is not buffer-fixed, but I/O-fixed by
+ buf_page_init_for_read(). */
+ //mutex_exit(&buf_pool_zip_mutex);
+wait_until_unfixed:
+ /* The block is buffer-fixed or I/O-fixed.
+ Try again later. */
+ //buf_pool_mutex_exit();
+ mutex_exit(block_mutex);
+ os_thread_sleep(WAIT_FOR_READ);
+
+ goto loop;
+ }
+
+ /* Allocate an uncompressed page. */
+ //buf_pool_mutex_exit();
+ //mutex_exit(&buf_pool_zip_mutex);
+ mutex_exit(block_mutex);
+
+ block = buf_LRU_get_free_block(0);
+ ut_a(block);
+ block_mutex = &block->mutex;
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ mutex_enter(block_mutex);
+
+ {
+ buf_page_t* hash_bpage
+ = buf_page_hash_get(space, offset);
+
+ if (UNIV_UNLIKELY(bpage != hash_bpage)) {
+ /* The buf_pool->page_hash was modified
+ while buf_pool_mutex was released.
+ Free the block that was allocated. */
+
+ buf_LRU_block_free_non_file_page(block, TRUE);
+ mutex_exit(block_mutex);
+
+ block = (buf_block_t*) hash_bpage;
+ if (block) {
+ block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
+ ut_a(block_mutex);
+ }
+ rw_lock_x_unlock(&page_hash_latch);
+ mutex_exit(&LRU_list_mutex);
+ goto loop2;
+ }
+ }
+
+ mutex_enter(&buf_pool_zip_mutex);
+
+ if (UNIV_UNLIKELY
+ (bpage->buf_fix_count
+ || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
+
+ mutex_exit(&buf_pool_zip_mutex);
+ /* The block was buffer-fixed or I/O-fixed
+ while buf_pool_mutex was not held by this thread.
+ Free the block that was allocated and try again.
+ This should be extremely unlikely. */
+
+ buf_LRU_block_free_non_file_page(block, TRUE);
+ //mutex_exit(&block->mutex);
+
+ rw_lock_x_unlock(&page_hash_latch);
+ mutex_exit(&LRU_list_mutex);
+ goto wait_until_unfixed;
+ }
+
+ /* Move the compressed page from bpage to block,
+ and uncompress it. */
+
+ mutex_enter(&flush_list_mutex);
+
+ buf_relocate(bpage, &block->page);
+
+ rw_lock_x_unlock(&page_hash_latch);
+
+ buf_block_init_low(block);
+ block->lock_hash_val = lock_rec_hash(space, offset);
+
+ UNIV_MEM_DESC(&block->page.zip.data,
+ page_zip_get_size(&block->page.zip), block);
+
+ if (buf_page_get_state(&block->page)
+ == BUF_BLOCK_ZIP_PAGE) {
+ UT_LIST_REMOVE(zip_list, buf_pool->zip_clean,
+ &block->page);
+ ut_ad(!block->page.in_flush_list);
+ } else {
+ /* Relocate buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage,
+ &block->page);
+ }
+
+ mutex_exit(&flush_list_mutex);
+
+ /* Buffer-fix, I/O-fix, and X-latch the block
+ for the duration of the decompression.
+ Also add the block to the unzip_LRU list. */
+ block->page.state = BUF_BLOCK_FILE_PAGE;
+
+ /* Insert at the front of unzip_LRU list */
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ mutex_exit(&LRU_list_mutex);
+
+ block->page.buf_fix_count = 1;
+ buf_block_set_io_fix(block, BUF_IO_READ);
+ rw_lock_x_lock_func(&block->lock, 0, file, line);
+
+ UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
+ mutex_exit(block_mutex);
+ mutex_exit(&buf_pool_zip_mutex);
+
+ mutex_enter(&buf_pool_mutex);
+ buf_pool->n_pend_unzip++;
+ mutex_exit(&buf_pool_mutex);
+
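+ /* The compressed-only control block has been relocated into
+ block->page; return the old descriptor to the buddy allocator. */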
+ buf_buddy_free(bpage, sizeof *bpage, FALSE);
+
+ //buf_pool_mutex_exit();
+
+ /* Decompress the page and apply buffered operations
+ while not holding buf_pool_mutex or block->mutex. */
+ success = buf_zip_decompress(block, srv_use_checksums);
+ ut_a(success);
+
+ if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
+ ibuf_merge_or_delete_for_page(block, space, offset,
+ zip_size, TRUE);
+ }
+
+ /* Unfix and unlatch the block. */
+ //buf_pool_mutex_enter();
+ block_mutex = &block->mutex;
+ mutex_enter(block_mutex);
+ block->page.buf_fix_count--;
+ buf_block_set_io_fix(block, BUF_IO_NONE);
+
+ mutex_enter(&buf_pool_mutex);
+ buf_pool->n_pend_unzip--;
+ mutex_exit(&buf_pool_mutex);
+ rw_lock_x_unlock(&block->lock);
+ break;
+
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ //mutex_enter(&block->mutex);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in buf_page_t. On
+ other systems, Valgrind could complain about uninitialized pad
+ bytes. */
+ UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
+#endif
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ //mutex_exit(&block->mutex);
+
+ /* Check if this is the first access to the page */
+
+ access_time = buf_page_is_accessed(&block->page);
+
+ //buf_pool_mutex_exit();
+ mutex_exit(block_mutex);
+
+ buf_page_set_accessed_make_young(&block->page, access_time);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ ut_a(!block->page.file_page_was_freed);
+#endif
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ switch (rw_latch) {
+ case RW_NO_LATCH:
+ if (must_read) {
+ /* Let us wait until the read operation
+ completes */
+
+ if (innobase_get_slow_log() && trx && trx->take_stats)
+ {
+ ut_usectime(&sec, &ms);
+ start_time = (ib_uint64_t)sec * 1000000 + ms;
+ } else {
+ start_time = 0;
+ }
+ for (;;) {
+ enum buf_io_fix io_fix;
+
+ mutex_enter(&block->mutex);
+ io_fix = buf_block_get_io_fix(block);
+ mutex_exit(&block->mutex);
+
+ if (io_fix == BUF_IO_READ) {
+
+ os_thread_sleep(WAIT_FOR_READ);
+ } else {
+ break;
+ }
+ }
+ if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+ {
+ ut_usectime(&sec, &ms);
+ finish_time = (ib_uint64_t)sec * 1000000 + ms;
+ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+ }
+ }
+
+ fix_type = MTR_MEMO_BUF_FIX;
+ break;
+
+ case RW_S_LATCH:
+ rw_lock_s_lock_func(&(block->lock), 0, file, line);
+
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ break;
+
+ default:
+ ut_ad(rw_latch == RW_X_LATCH);
+ rw_lock_x_lock_func(&(block->lock), 0, file, line);
+
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ break;
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+ if (!access_time) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(space, zip_size, offset, trx);
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ if (innobase_get_slow_log()) {
+ _increment_page_get_statistics(block, trx);
+ }
+
+ return(block);
+}
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed buffer block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
+ ..._GUESS_ON_CLOCK */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ unsigned access_time;
+ ibool success;
+ ulint fix_type;
+ trx_t* trx = NULL;
+
+ ut_ad(block);
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ mutex_enter(&block->mutex);
+
+ if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ mutex_exit(&block->mutex);
+
+ /* Check if this is the first access to the page.
+ We do a dirty read on purpose, to avoid mutex contention.
+ This field is only used for heuristic purposes; it does not
+ affect correctness. */
+
+ access_time = buf_page_is_accessed(&block->page);
+ buf_page_set_accessed_make_young(&block->page, access_time);
+
+ ut_ad(!ibuf_inside()
+ || ibuf_page(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block), NULL));
+
+ if (rw_latch == RW_S_LATCH) {
+ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ success = rw_lock_x_lock_func_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+
+ if (UNIV_UNLIKELY(!success)) {
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
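+ /* If the block has been modified (or evicted and reused) since
+ the caller read modify_clock, the guess is stale: release the
+ latch and the buffer fix and report failure. */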
+ if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) {
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else {
+ rw_lock_x_unlock(&(block->lock));
+ }
+
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ ut_a(block->page.file_page_was_freed == FALSE);
+#endif
+ if (innobase_get_slow_log()) {
+ trx = innobase_get_trx();
+ }
+
+ if (UNIV_UNLIKELY(!access_time)) {
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+ buf_read_ahead_linear(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block), trx);
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ buf_pool->stat.n_page_gets++;
+
+ if (innobase_get_slow_log()) {
+ _increment_page_get_statistics(block, trx);
+ }
+ return(TRUE);
+}
+
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done. For example, if a search in an adaptive hash index leads us to this
+frame.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: the known page */
+ ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ibool success;
+ ulint fix_type;
+ trx_t* trx = NULL;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ mutex_enter(&block->mutex);
+
+ if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
+ /* Another thread is just freeing the block from the LRU list
+ of the buffer pool: do not try to access this page; this
+ attempt to access the page can only come through the hash
+ index because when the buffer block state is ..._REMOVE_HASH,
+ we have already removed it from the page address hash table
+ of the buffer pool. */
+
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ buf_block_buf_fix_inc(block, file, line);
+
+ mutex_exit(&block->mutex);
+
+ if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) {
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ buf_LRU_make_block_young(&block->page);
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ } else if (!buf_page_is_accessed(&block->page)) {
+ /* Above, we do a dirty read on purpose, to avoid
+ mutex contention. The field buf_page_t::access_time
+ is only used for heuristic purposes. Writes to the
+ field must be protected by mutex, however. */
+ ulint time_ms = ut_time_ms();
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&block->mutex);
+ buf_page_set_accessed(&block->page, time_ms);
+ //buf_pool_mutex_exit();
+ mutex_exit(&block->mutex);
+ }
+
+ ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
+
+ if (rw_latch == RW_S_LATCH) {
+ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+ success = rw_lock_x_lock_func_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ }
+
+ if (!success) {
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ ut_a(block->page.file_page_was_freed == FALSE);
+#endif
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((mode == BUF_KEEP_OLD)
+ || (ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0));
+#endif
+ buf_pool->stat.n_page_gets++;
+
+ if (innobase_get_slow_log()) {
+ trx = innobase_get_trx();
+ _increment_page_get_statistics(block, trx);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Given a tablespace id and page number tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for use when holding the kernel mutex.
+@return pointer to a page or NULL */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+ ulint space_id,/*!< in: tablespace id */
+ ulint page_no,/*!< in: page number */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ buf_block_t* block;
+ ibool success;
+ ulint fix_type;
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+ block = buf_block_hash_get(space_id, page_no);
+
+ if (!block) {
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+ return(NULL);
+ }
+
+ mutex_enter(&block->mutex);
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_a(buf_block_get_space(block) == space_id);
+ ut_a(buf_block_get_page_no(block) == page_no);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ buf_block_buf_fix_inc(block, file, line);
+ mutex_exit(&block->mutex);
+
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ success = rw_lock_s_lock_nowait(&block->lock, file, line);
+
+ if (!success) {
+ /* Let us try to get an X-latch. If the current thread
+ is holding an X-latch on the page, we cannot get an
+ S-latch. */
+
+ fix_type = MTR_MEMO_PAGE_X_FIX;
+ success = rw_lock_x_lock_func_nowait(&block->lock,
+ file, line);
+ }
+
+ if (!success) {
+ mutex_enter(&block->mutex);
+ buf_block_buf_fix_dec(block);
+ mutex_exit(&block->mutex);
+
+ return(NULL);
+ }
+
+ mtr_memo_push(mtr, block, fix_type);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 5771 || buf_validate());
+ ut_a(block->page.buf_fix_count > 0);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ ut_a(block->page.file_page_was_freed == FALSE);
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ buf_pool->stat.n_page_gets++;
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+
+ return(block);
+}
+
+/********************************************************************//**
+Initialize some fields of a control block. */
+UNIV_INLINE
+void
+buf_page_init_low(
+/*==============*/
+ buf_page_t* bpage) /*!< in: block to init */
+{
+ bpage->flush_type = BUF_FLUSH_LRU;
+ bpage->io_fix = BUF_IO_NONE;
+ bpage->buf_fix_count = 0;
+ bpage->freed_page_clock = 0;
+ bpage->access_time = 0;
+ bpage->newest_modification = 0;
+ bpage->oldest_modification = 0;
+ HASH_INVALIDATE(bpage, hash);
+ bpage->is_corrupt = FALSE;
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ bpage->file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+}
+
+/********************************************************************//**
+Initializes a page in the buffer pool. */
+static
+void
+buf_page_init(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ buf_block_t* block) /*!< in: block to init */
+{
+ buf_page_t* hash_page;
+
+ //ut_ad(buf_pool_mutex_own());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+ ut_ad(mutex_own(&(block->mutex)));
+ ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+ /* Set the state of the block */
+ buf_block_set_file_page(block, space, offset);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ if (!space) {
+ /* Silence valid Valgrind warnings about uninitialized
+ data being written to data files. There are some unused
+ bytes on some pages that InnoDB does not initialize. */
+ UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ buf_block_init_low(block);
+
+ block->lock_hash_val = lock_rec_hash(space, offset);
+
+ /* Insert into the hash table of file pages */
+
+ hash_page = buf_page_hash_get(space, offset);
+
+ if (UNIV_LIKELY_NULL(hash_page)) {
+ fprintf(stderr,
+ "InnoDB: Error: page %lu %lu already found"
+ " in the hash table: %p, %p\n",
+ (ulong) space,
+ (ulong) offset,
+ (const void*) hash_page, (const void*) block);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ mutex_exit(&block->mutex);
+ //buf_pool_mutex_exit();
+ rw_lock_x_unlock(&page_hash_latch);
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_error;
+ }
+
+ buf_page_init_low(&block->page);
+
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(!block->page.in_page_hash);
+ ut_d(block->page.in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ buf_page_address_fold(space, offset), &block->page);
+}
+
+/********************************************************************//**
+Function which initializes a page for reading into the buffer pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset) /*!< in: page number */
+{
+ buf_block_t* block;
+ buf_page_t* bpage;
+ mtr_t mtr;
+ ibool lru = FALSE;
+ void* data;
+
+ ut_ad(buf_pool);
+
+ *err = DB_SUCCESS;
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+ /* It is a read-ahead within an ibuf routine */
+
+ ut_ad(!ibuf_bitmap_page(zip_size, offset));
+ ut_ad(ibuf_inside());
+
+ mtr_start(&mtr);
+
+ if (!recv_no_ibuf_operations
+ && !ibuf_page(space, zip_size, offset, &mtr)) {
+
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+ } else {
+ ut_ad(mode == BUF_READ_ANY_PAGE);
+ }
+
+ if (zip_size && UNIV_LIKELY(!unzip)
+ && UNIV_LIKELY(!recv_recovery_is_on())) {
+ block = NULL;
+ } else {
+ block = buf_LRU_get_free_block(0);
+ ut_ad(block);
+ }
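+ /* block != NULL: the page will be read into an uncompressed frame.
+ block == NULL: only a compressed page descriptor and a zip frame
+ will be allocated (compressed-only read: zip_size != 0, no
+ uncompressed copy requested, not in crash recovery). */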
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+
+ if (buf_page_hash_get(space, offset)) {
+ /* The page is already in the buffer pool. */
+err_exit:
+ if (block) {
+ mutex_enter(&block->mutex);
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ buf_LRU_block_free_non_file_page(block, FALSE);
+ mutex_exit(&block->mutex);
+ }
+ else {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+
+ bpage = NULL;
+ goto func_exit;
+ }
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(
+ space, tablespace_version)) {
+ /* The page belongs to a space which has been
+ deleted or is being deleted. */
+ *err = DB_TABLESPACE_DELETED;
+
+ goto err_exit;
+ }
+
+ if (block) {
+ bpage = &block->page;
+ mutex_enter(&block->mutex);
+ buf_page_init(space, offset, block);
+
+ rw_lock_x_unlock(&page_hash_latch);
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+
+ /* We set a pass-type x-lock on the frame because then
+ the same thread which called for the read operation
+ (and is running now at this point of code) can wait
+ for the read to complete by waiting for the x-lock on
+ the frame; if the x-lock were recursive, the same
+ thread would illegally get the x-lock before the page
+ read is completed. The x-lock is cleared by the
+ io-handler thread. */
+
+ rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ page_zip_set_size(&block->page.zip, zip_size);
+
+ /* buf_pool_mutex may be released and
+ reacquired by buf_buddy_alloc(). Thus, we
+ must release block->mutex in order not to
+ break the latching order in the reacquisition
+ of buf_pool_mutex. We also must defer this
+ operation until after the block descriptor has
+ been added to buf_pool->LRU and
+ buf_pool->page_hash. */
+ mutex_exit(&block->mutex);
+ data = buf_buddy_alloc(zip_size, &lru, FALSE);
+ mutex_enter(&block->mutex);
+ block->page.zip.data = data;
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(&block->page)
+ we have to add this block to unzip_LRU
+ after block->page.zip.data is set. */
+ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+ buf_unzip_LRU_add_block(block, TRUE);
+ }
+
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(&block->mutex);
+ } else {
+ /* Defer buf_buddy_alloc() until after the block has
+ been found not to exist. The buf_buddy_alloc() and
+ buf_buddy_free() calls may be expensive because of
+ buf_buddy_relocate(). */
+
+ /* The compressed page must be allocated before the
+ control block (bpage), in order to avoid the
+ invocation of buf_buddy_relocate_block() on
+ uninitialized data. */
+ data = buf_buddy_alloc(zip_size, &lru, TRUE);
+ bpage = buf_buddy_alloc(sizeof *bpage, &lru, TRUE);
+
+ /* If buf_buddy_alloc() allocated storage from the LRU list,
+ it released and reacquired buf_pool_mutex. Thus, we must
+ check the page_hash again, as it may have been modified. */
+ if (UNIV_UNLIKELY(lru)
+ && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
+
+ /* The block was added by some other thread. */
+ buf_buddy_free(bpage, sizeof *bpage, TRUE);
+ buf_buddy_free(data, zip_size, TRUE);
+
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+
+ bpage = NULL;
+ goto func_exit;
+ }
+
+ page_zip_des_init(&bpage->zip);
+ page_zip_set_size(&bpage->zip, zip_size);
+ bpage->zip.data = data;
+
+ mutex_enter(&buf_pool_zip_mutex);
+ UNIV_MEM_DESC(bpage->zip.data,
+ page_zip_get_size(&bpage->zip), bpage);
+ buf_page_init_low(bpage);
+ bpage->state = BUF_BLOCK_ZIP_PAGE;
+ bpage->space = space;
+ bpage->offset = offset;
+
+#ifdef UNIV_DEBUG
+ bpage->in_page_hash = FALSE;
+ bpage->in_zip_hash = FALSE;
+ bpage->in_flush_list = FALSE;
+ bpage->in_free_list = FALSE;
+#endif /* UNIV_DEBUG */
+ bpage->in_LRU_list = FALSE;
+
+ ut_d(bpage->in_page_hash = TRUE);
+ HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+ buf_page_address_fold(space, offset), bpage);
+
+ rw_lock_x_unlock(&page_hash_latch);
+
+ /* The block must be put to the LRU list, to the old blocks */
+ buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+ mutex_enter(&flush_list_mutex);
+ buf_LRU_insert_zip_clean(bpage);
+ mutex_exit(&flush_list_mutex);
+
+ mutex_exit(&LRU_list_mutex);
+
+ buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+ mutex_exit(&buf_pool_zip_mutex);
+ }
+
+ mutex_enter(&buf_pool_mutex);
+ buf_pool->n_pend_reads++;
+ mutex_exit(&buf_pool_mutex);
+func_exit:
+ //buf_pool_mutex_exit();
+
+ if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+
+ mtr_commit(&mtr);
+ }
+
+ ut_ad(!bpage || buf_page_in_file(bpage));
+ return(bpage);
+}
+
+/********************************************************************//**
+Initializes a page in the buffer pool. The page is usually not read
+from a file, even if it cannot be found in the buffer pool. This is one
+of the functions which perform the state transition NOT_USED =>
+FILE_PAGE on a block (the other is buf_page_get_gen).
+@return pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ buf_frame_t* frame;
+ buf_block_t* block;
+ buf_block_t* free_block = NULL;
+ ulint time_ms = ut_time_ms();
+
+ ut_ad(mtr);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(space || !zip_size);
+
+ free_block = buf_LRU_get_free_block(0);
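+ /* Allocate a free block before acquiring the LRU list mutex and
+ the page_hash latch; if the page turns out to be already in the
+ buffer pool, the block is simply freed again below. */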
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+
+ block = (buf_block_t*) buf_page_hash_get(space, offset);
+
+ if (block && buf_page_in_file(&block->page)) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, offset) == 0);
+#endif
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+
+ /* Page can be found in buf_pool */
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+
+ buf_block_free(free_block);
+
+ return(buf_page_get_with_no_latch(space, zip_size,
+ offset, mtr));
+ }
+
+ /* If we get here, the page was not in buf_pool: init it there */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Creating space %lu page %lu to buffer\n",
+ (ulong) space, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ block = free_block;
+
+ mutex_enter(&block->mutex);
+
+ buf_page_init(space, offset, block);
+ rw_lock_x_unlock(&page_hash_latch);
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(&block->page, FALSE);
+
+ buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+ buf_pool->stat.n_pages_created++;
+
+ if (zip_size) {
+ void* data;
+ ibool lru;
+
+ /* Prevent race conditions during buf_buddy_alloc(),
+ which may release and reacquire buf_pool_mutex,
+ by IO-fixing and X-latching the block. */
+
+ buf_page_set_io_fix(&block->page, BUF_IO_READ);
+ rw_lock_x_lock(&block->lock);
+
+ page_zip_set_size(&block->page.zip, zip_size);
+ mutex_exit(&block->mutex);
+ /* buf_pool_mutex may be released and reacquired by
+ buf_buddy_alloc(). Thus, we must release block->mutex
+ in order not to break the latching order in
+ the reacquisition of buf_pool_mutex. We also must
+ defer this operation until after the block descriptor
+ has been added to buf_pool->LRU and buf_pool->page_hash. */
+ data = buf_buddy_alloc(zip_size, &lru, FALSE);
+ mutex_enter(&block->mutex);
+ block->page.zip.data = data;
+
+ /* To maintain the invariant
+ block->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(&block->page)
+ we have to add this block to unzip_LRU after
+ block->page.zip.data is set. */
+ ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+ buf_unzip_LRU_add_block(block, FALSE);
+
+ buf_page_set_io_fix(&block->page, BUF_IO_NONE);
+ rw_lock_x_unlock(&block->lock);
+ }
+
+ buf_page_set_accessed(&block->page, time_ms);
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+
+ mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
+
+ mutex_exit(&block->mutex);
+
+ /* Delete possible entries for the page from the insert buffer:
+ such can exist if the page belonged to an index which was dropped */
+
+ ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
+
+ /* Flush pages from the end of the LRU list if necessary */
+ buf_flush_free_margin(FALSE);
+
+ frame = block->frame;
+
+ memset(frame + FIL_PAGE_PREV, 0xff, 4);
+ memset(frame + FIL_PAGE_NEXT, 0xff, 4);
+ mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+ /* Reset to zero the file flush lsn field in the page; if the first
+ page of an ibdata file is 'created' in this function into the buffer
+ pool then we lose the original contents of the file flush lsn stamp.
+ Then InnoDB could in a crash recovery print a big, false, corruption
+ warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
+
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(++buf_dbg_counter % 357 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(buf_block_get_space(block),
+ buf_block_get_page_no(block)) == 0);
+#endif
+ return(block);
+}
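+/* Illustrative usage sketch (not part of the original source): a typical
+caller allocates a fresh page inside a mini-transaction roughly as below.
+The space_id, page_no and zip_size values here are hypothetical.
+
+	mtr_t		mtr;
+	buf_block_t*	new_block;
+
+	mtr_start(&mtr);
+	new_block = buf_page_create(space_id, page_no, 0, &mtr);
+	buf_page_get(space_id, 0, page_no, RW_X_LATCH, &mtr);
+	... use new_block->frame ...
+	mtr_commit(&mtr);
+*/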
+
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage, /*!< in: pointer to the block in question */
+ trx_t* trx)
+{
+ enum buf_io_fix io_type;
+ const ibool uncompressed = (buf_page_get_state(bpage)
+ == BUF_BLOCK_FILE_PAGE);
+ enum buf_flush flush_type;
+ mutex_t* block_mutex;
+
+ ut_a(buf_page_in_file(bpage));
+
+	/* We do not need to protect io_fix here with a mutex in order to read
+	it, because this is the only function where we can change the value
+ from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+ ensures that this is the only thread that handles the i/o for this
+ block. */
+
+ io_type = buf_page_get_io_fix(bpage);
+ ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+ if (io_type == BUF_IO_READ) {
+ ulint read_page_no;
+ ulint read_space_id;
+ byte* frame;
+
+ if (buf_page_get_zip_size(bpage)) {
+ frame = bpage->zip.data;
+ buf_pool->n_pend_unzip++;
+ if (uncompressed
+ && !buf_zip_decompress((buf_block_t*) bpage,
+ FALSE)) {
+
+ buf_pool->n_pend_unzip--;
+ goto corrupt;
+ }
+ buf_pool->n_pend_unzip--;
+ } else {
+ ut_a(uncompressed);
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ /* If this page is not uninitialized and not in the
+ doublewrite buffer, then the page number and space id
+ should be the same as in block. */
+ read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
+ read_space_id = mach_read_from_4(
+ frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if ((bpage->space == TRX_SYS_SPACE
+ || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE))
+ && trx_doublewrite_page_inside(bpage->offset)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: reading page %lu\n"
+ "InnoDB: which is in the"
+ " doublewrite buffer!\n",
+ (ulong) bpage->offset);
+ } else if (!read_space_id && !read_page_no) {
+ /* This is likely an uninitialized page. */
+ } else if ((bpage->space
+ && bpage->space != read_space_id)
+ || bpage->offset != read_page_no) {
+ /* We did not compare space_id to read_space_id
+ if bpage->space == 0, because the field on the
+ page may contain garbage in MySQL < 4.1.1,
+ which only supported bpage->space == 0. */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: space id and page n:o"
+ " stored in the page\n"
+ "InnoDB: read in are %lu:%lu,"
+ " should be %lu:%lu!\n",
+ (ulong) read_space_id, (ulong) read_page_no,
+ (ulong) bpage->space,
+ (ulong) bpage->offset);
+ }
+
+ if (!srv_pass_corrupt_table || !bpage->is_corrupt) {
+ /* From version 3.23.38 up we store the page checksum
+ to the 4 first bytes of the page end lsn field */
+
+ if (buf_page_is_corrupted(frame,
+ buf_page_get_zip_size(bpage))) {
+corrupt:
+ fprintf(stderr,
+ "InnoDB: Database page corruption on disk"
+ " or a failed\n"
+ "InnoDB: file read of page %lu.\n"
+ "InnoDB: You may have to recover"
+ " from a backup.\n",
+ (ulong) bpage->offset);
+ buf_page_print(frame, buf_page_get_zip_size(bpage));
+ fprintf(stderr,
+ "InnoDB: Database page corruption on disk"
+ " or a failed\n"
+ "InnoDB: file read of page %lu.\n"
+ "InnoDB: You may have to recover"
+ " from a backup.\n",
+ (ulong) bpage->offset);
+ fputs("InnoDB: It is also possible that"
+ " your operating\n"
+ "InnoDB: system has corrupted its"
+ " own file cache\n"
+ "InnoDB: and rebooting your computer"
+ " removes the\n"
+ "InnoDB: error.\n"
+ "InnoDB: If the corrupt page is an index page\n"
+ "InnoDB: you can also try to"
+ " fix the corruption\n"
+ "InnoDB: by dumping, dropping,"
+ " and reimporting\n"
+ "InnoDB: the corrupt table."
+ " You can use CHECK\n"
+ "InnoDB: TABLE to scan your"
+ " table for corruption.\n"
+ "InnoDB: See also "
+ REFMAN "forcing-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+
+ if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space)
+ && bpage->space < SRV_LOG_SPACE_FIRST_ID) {
+ fprintf(stderr,
+ "InnoDB: space %u will be treated as corrupt.\n",
+ bpage->space);
+ fil_space_set_corrupt(bpage->space);
+ if (trx && trx->dict_operation_lock_mode == 0) {
+ dict_table_set_corrupt_by_space(bpage->space, TRUE);
+ } else {
+ dict_table_set_corrupt_by_space(bpage->space, FALSE);
+ }
+ bpage->is_corrupt = TRUE;
+ } else
+ if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
+ fputs("InnoDB: Ending processing because of"
+ " a corrupt database page.\n",
+ stderr);
+ exit(1);
+ }
+ }
+ } /**/
+
+ if (recv_recovery_is_on()) {
+ /* Pages must be uncompressed for crash recovery. */
+ ut_a(uncompressed);
+ recv_recover_page(TRUE, (buf_block_t*) bpage);
+ }
+
+ if (uncompressed && !recv_no_ibuf_operations) {
+ ibuf_merge_or_delete_for_page(
+ /* Delete possible entries, if bpage is_corrupt */
+ (srv_pass_corrupt_table && bpage->is_corrupt) ? NULL :
+ (buf_block_t*) bpage, bpage->space,
+ bpage->offset, buf_page_get_zip_size(bpage),
+ (srv_pass_corrupt_table && bpage->is_corrupt) ? FALSE :
+ TRUE);
+ }
+ }
+
+ //buf_pool_mutex_enter();
+ if (io_type == BUF_IO_WRITE) {
+ flush_type = buf_page_get_flush_type(bpage);
+ /* to keep consistency at buf_LRU_insert_zip_clean() */
+ //if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */
+ mutex_enter(&LRU_list_mutex);
+ //}
+ }
+ block_mutex = buf_page_get_mutex_enter(bpage);
+ ut_a(block_mutex);
+ mutex_enter(&buf_pool_mutex);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ if (io_type == BUF_IO_WRITE || uncompressed) {
+ /* For BUF_IO_READ of compressed-only blocks, the
+ buffered operations will be merged by buf_page_get_gen()
+ after the block has been uncompressed. */
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+ }
+#endif
+ /* Because this thread which does the unlocking is not the same that
+ did the locking, we use a pass value != 0 in unlock, which simply
+ removes the newest lock debug record, without checking the thread
+ id. */
+
+ buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+ switch (io_type) {
+ case BUF_IO_READ:
+ /* NOTE that the call to ibuf may have moved the ownership of
+ the x-latch to this OS thread: do not let this confuse you in
+ debugging! */
+
+ ut_ad(buf_pool->n_pend_reads > 0);
+ buf_pool->n_pend_reads--;
+ buf_pool->stat.n_pages_read++;
+
+ if (uncompressed) {
+ rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_READ);
+ }
+
+ break;
+
+ case BUF_IO_WRITE:
+ /* Write means a flush operation: call the completion
+ routine in the flush system */
+
+ buf_flush_write_complete(bpage);
+
+ /* to keep consistency at buf_LRU_insert_zip_clean() */
+ //if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */
+ mutex_exit(&LRU_list_mutex);
+ //}
+
+ if (uncompressed) {
+ rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ buf_pool->stat.n_pages_written++;
+
+ break;
+
+ default:
+ ut_error;
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Has %s page space %lu page no %lu\n",
+ io_type == BUF_IO_READ ? "read" : "written",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif /* UNIV_DEBUG */
+
+ mutex_exit(&buf_pool_mutex);
+ mutex_exit(block_mutex);
+ //buf_pool_mutex_exit();
+}
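+/* Illustrative note (not part of the original source): when the Percona
+option srv_pass_corrupt_table is enabled and the corrupt page does not
+belong to the system or log spaces, the read path above marks the whole
+tablespace corrupt via fil_space_set_corrupt() and sets
+bpage->is_corrupt, instead of terminating the server, which the stock
+handling does whenever srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT. */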
+
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void)
+/*=====================*/
+{
+ ibool freed;
+ enum buf_flush i;
+
+ buf_pool_mutex_enter();
+
+ for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+
+ /* As this function is called during startup and
+		during the redo application phase of recovery, InnoDB
+		is single threaded (apart from IO helper threads) at
+		this stage. No new write batch can be in the
+		initialization stage at this point. */
+ ut_ad(buf_pool->init_flush[i] == FALSE);
+
+ /* However, it is possible that a write batch that has
+ been posted earlier is still not complete. For buffer
+ pool invalidation to proceed we must ensure there is NO
+ write activity happening. */
+ if (buf_pool->n_flush[i] > 0) {
+ buf_pool_mutex_exit();
+ buf_flush_wait_batch_end(i);
+ buf_pool_mutex_enter();
+ }
+ }
+
+ buf_pool_mutex_exit();
+
+ ut_ad(buf_all_freed());
+
+ freed = TRUE;
+
+ while (freed) {
+ freed = buf_LRU_search_and_free_block(100);
+ }
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
+ ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
+
+ buf_pool->freed_page_clock = 0;
+ buf_pool->LRU_old = NULL;
+ buf_pool->LRU_old_len = 0;
+ buf_pool->LRU_flush_ended = 0;
+
+ memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
+ buf_refresh_io_stats();
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates the buffer buf_pool data structure.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void)
+/*==============*/
+{
+ buf_page_t* b;
+ buf_chunk_t* chunk;
+ ulint i;
+ ulint n_single_flush = 0;
+ ulint n_lru_flush = 0;
+ ulint n_list_flush = 0;
+ ulint n_lru = 0;
+ ulint n_flush = 0;
+ ulint n_free = 0;
+ ulint n_zip = 0;
+
+ ut_ad(buf_pool);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+	/* Because of the new latch order, this function cannot validate everything correctly... */
+
+ chunk = buf_pool->chunks;
+
+ /* Check the uncompressed blocks. */
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ ulint j;
+ buf_block_t* block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+
+ mutex_enter(&block->mutex);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* These should only occur on
+ zip_clean, zip_free[], or flush_list. */
+ ut_error;
+ break;
+
+ case BUF_BLOCK_FILE_PAGE:
+ ut_a(buf_page_hash_get(buf_block_get_space(
+ block),
+ buf_block_get_page_no(
+ block))
+ == &block->page);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(buf_page_get_io_fix(&block->page)
+ == BUF_IO_READ
+ || !ibuf_count_get(buf_block_get_space(
+ block),
+ buf_block_get_page_no(
+ block)));
+#endif
+ switch (buf_page_get_io_fix(&block->page)) {
+ case BUF_IO_NONE:
+ break;
+
+ case BUF_IO_WRITE:
+ switch (buf_page_get_flush_type(
+ &block->page)) {
+ case BUF_FLUSH_LRU:
+ n_lru_flush++;
+ ut_a(rw_lock_is_locked(
+ &block->lock,
+ RW_LOCK_SHARED));
+ break;
+ case BUF_FLUSH_LIST:
+ n_list_flush++;
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_single_flush++;
+ break;
+ default:
+ ut_error;
+ }
+
+ break;
+
+ case BUF_IO_READ:
+
+ ut_a(rw_lock_is_locked(&block->lock,
+ RW_LOCK_EX));
+ break;
+ }
+
+ n_lru++;
+
+ if (block->page.oldest_modification > 0) {
+ n_flush++;
+ }
+
+ break;
+
+ case BUF_BLOCK_NOT_USED:
+ n_free++;
+ break;
+
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ /* do nothing */
+ break;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+ }
+
+ mutex_enter(&buf_pool_zip_mutex);
+
+ /* Check clean compressed-only blocks. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(zip_list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ switch (buf_page_get_io_fix(b)) {
+ case BUF_IO_NONE:
+ /* All clean blocks should be I/O-unfixed. */
+ break;
+ case BUF_IO_READ:
+ /* In buf_LRU_free_block(), we temporarily set
+ b->io_fix = BUF_IO_READ for a newly allocated
+ control block in order to prevent
+ buf_page_get_gen() from decompressing the block. */
+ break;
+ default:
+ ut_error;
+ break;
+ }
+ ut_a(!b->oldest_modification);
+ ut_a(buf_page_hash_get(b->space, b->offset) == b);
+
+ n_lru++;
+ n_zip++;
+ }
+
+ /* Check dirty compressed-only blocks. */
+
+ mutex_enter(&flush_list_mutex);
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(flush_list, b)) {
+ ut_ad(b->in_flush_list);
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_a(b->oldest_modification);
+ n_lru++;
+ n_flush++;
+ n_zip++;
+ switch (buf_page_get_io_fix(b)) {
+ case BUF_IO_NONE:
+ case BUF_IO_READ:
+ break;
+
+ case BUF_IO_WRITE:
+ switch (buf_page_get_flush_type(b)) {
+ case BUF_FLUSH_LRU:
+ n_lru_flush++;
+ break;
+ case BUF_FLUSH_LIST:
+ n_list_flush++;
+ break;
+ case BUF_FLUSH_SINGLE_PAGE:
+ n_single_flush++;
+ break;
+ default:
+ ut_error;
+ }
+ break;
+ }
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ ut_a(buf_page_hash_get(b->space, b->offset) == b);
+ }
+ mutex_exit(&flush_list_mutex);
+
+ mutex_exit(&buf_pool_zip_mutex);
+
+ if (n_lru + n_free > buf_pool->curr_size + n_zip) {
+ fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
+ (ulong) n_lru, (ulong) n_free,
+ (ulong) buf_pool->curr_size, (ulong) n_zip);
+ ut_error;
+ }
+
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
+	/* Because of the latching order with block->mutex, we cannot acquire free_list_mutex before this point. */
+/*
+ if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
+ fprintf(stderr, "Free list len %lu, free blocks %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) n_free);
+ ut_error;
+ }
+*/
+	/* Because of the latching order with block->mutex, we cannot acquire flush_list_mutex before this point. */
+/*
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+
+ ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
+*/
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+
+ ut_a(buf_LRU_validate());
+ ut_a(buf_flush_validate());
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer buf_pool data structure. */
+UNIV_INTERN
+void
+buf_print(void)
+/*===========*/
+{
+ dulint* index_ids;
+ ulint* counts;
+ ulint size;
+ ulint i;
+ ulint j;
+ dulint id;
+ ulint n_found;
+ buf_chunk_t* chunk;
+ dict_index_t* index;
+
+ ut_ad(buf_pool);
+
+ size = buf_pool->curr_size;
+
+ index_ids = mem_alloc(sizeof(dulint) * size);
+ counts = mem_alloc(sizeof(ulint) * size);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ mutex_enter(&free_list_mutex);
+ mutex_enter(&flush_list_mutex);
+
+ fprintf(stderr,
+ "buf_pool size %lu\n"
+ "database pages %lu\n"
+ "free pages %lu\n"
+ "modified database pages %lu\n"
+ "n pending decompressions %lu\n"
+ "n pending reads %lu\n"
+ "n pending flush LRU %lu list %lu single page %lu\n"
+ "pages made young %lu, not young %lu\n"
+ "pages read %lu, created %lu, written %lu\n",
+ (ulong) size,
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+ (ulong) buf_pool->n_pend_unzip,
+ (ulong) buf_pool->n_pend_reads,
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
+ (ulong) buf_pool->stat.n_pages_made_young,
+ (ulong) buf_pool->stat.n_pages_not_made_young,
+ (ulong) buf_pool->stat.n_pages_read,
+ (ulong) buf_pool->stat.n_pages_created,
+ (ulong) buf_pool->stat.n_pages_written);
+
+ /* Count the number of blocks belonging to each index in the buffer */
+
+ n_found = 0;
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+ ulint n_blocks = chunk->size;
+
+ for (; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
+
+ id = btr_page_get_index_id(frame);
+
+ /* Look for the id in the index_ids array */
+ j = 0;
+
+ while (j < n_found) {
+
+ if (ut_dulint_cmp(index_ids[j],
+ id) == 0) {
+ counts[j]++;
+
+ break;
+ }
+ j++;
+ }
+
+ if (j == n_found) {
+ n_found++;
+ index_ids[j] = id;
+ counts[j] = 1;
+ }
+ }
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(&free_list_mutex);
+ mutex_exit(&flush_list_mutex);
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+
+ fprintf(stderr,
+ "Block count for index %lu in buffer is about %lu",
+ (ulong) ut_dulint_get_low(index_ids[i]),
+ (ulong) counts[i]);
+
+ if (index) {
+ putc(' ', stderr);
+ dict_index_name_print(stderr, NULL, index);
+ }
+
+ putc('\n', stderr);
+ }
+
+ mem_free(index_ids);
+ mem_free(counts);
+
+ ut_a(buf_validate());
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in the buffer pool.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void)
+/*==============================*/
+{
+ buf_chunk_t* chunk;
+ buf_page_t* b;
+ ulint i;
+ ulint fixed_pages_number = 0;
+
+ //buf_pool_mutex_enter();
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+ buf_block_t* block;
+ ulint j;
+
+ block = chunk->blocks;
+
+ for (j = chunk->size; j--; block++) {
+ if (buf_block_get_state(block)
+ != BUF_BLOCK_FILE_PAGE) {
+
+ continue;
+ }
+
+ mutex_enter(&block->mutex);
+
+ if (block->page.buf_fix_count != 0
+ || buf_page_get_io_fix(&block->page)
+ != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+
+ mutex_exit(&block->mutex);
+ }
+ }
+
+ mutex_enter(&buf_pool_zip_mutex);
+
+ /* Traverse the lists of clean and dirty compressed-only blocks. */
+
+ for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+ b = UT_LIST_GET_NEXT(zip_list, b)) {
+ ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+ ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
+
+ if (b->buf_fix_count != 0
+ || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+ }
+
+ mutex_enter(&flush_list_mutex);
+ for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+ b = UT_LIST_GET_NEXT(flush_list, b)) {
+ ut_ad(b->in_flush_list);
+
+ switch (buf_page_get_state(b)) {
+ case BUF_BLOCK_ZIP_DIRTY:
+ if (b->buf_fix_count != 0
+ || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+ fixed_pages_number++;
+ }
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ /* uncompressed page */
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+ }
+ mutex_exit(&flush_list_mutex);
+
+ mutex_exit(&buf_pool_zip_mutex);
+ //buf_pool_mutex_exit();
+
+ return(fixed_pages_number);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Returns the number of pending buf pool ios.
+@return number of pending I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_ios(void)
+/*=======================*/
+{
+ return(buf_pool->n_pend_reads
+ + buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->n_flush[BUF_FLUSH_LIST]
+ + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+}
+
+/*********************************************************************//**
+Returns the ratio, as a percentage, of modified pages in the buffer pool
+to all database pages in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void)
+/*============================*/
+{
+ ulint ratio;
+
+ //buf_pool_mutex_enter(); /* optimistic */
+
+ ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
+ / (1 + UT_LIST_GET_LEN(buf_pool->LRU)
+ + UT_LIST_GET_LEN(buf_pool->free));
+
+ /* 1 + is there to avoid division by zero */
+
+ //buf_pool_mutex_exit(); /* optimistic */
+
+ return(ratio);
+}
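+/* Worked example (illustrative, not part of the original source): with a
+hypothetical 1000 pages on the flush_list, 9000 pages on the LRU list and
+999 pages on the free list, the function returns
+
+	(100 * 1000) / (1 + 9000 + 999) = 100000 / 10000 = 10
+
+i.e. roughly 10 percent of the pool holds modified pages. */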
+
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+ FILE* file) /*!< in/out: buffer where to print */
+{
+ time_t current_time;
+ double time_elapsed;
+ ulint n_gets_diff;
+
+ ut_ad(buf_pool);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ mutex_enter(&free_list_mutex);
+ mutex_enter(&buf_pool_mutex);
+ mutex_enter(&flush_list_mutex);
+
+ fprintf(file,
+ "Buffer pool size %lu\n"
+ "Buffer pool size, bytes %lu\n"
+ "Free buffers %lu\n"
+ "Database pages %lu\n"
+ "Old database pages %lu\n"
+ "Modified db pages %lu\n"
+ "Pending reads %lu\n"
+ "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ (ulong) buf_pool->curr_size,
+ (ulong) buf_pool->curr_size * UNIV_PAGE_SIZE,
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) buf_pool->LRU_old_len,
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+ (ulong) buf_pool->n_pend_reads,
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->init_flush[BUF_FLUSH_LRU],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_LIST]
+ + buf_pool->init_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool->last_printout_time);
+
+ fprintf(file,
+ "Pages made young %lu, not young %lu\n"
+ "%.2f youngs/s, %.2f non-youngs/s\n"
+ "Pages read %lu, created %lu, written %lu\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+ (ulong) buf_pool->stat.n_pages_made_young,
+ (ulong) buf_pool->stat.n_pages_not_made_young,
+ (buf_pool->stat.n_pages_made_young
+ - buf_pool->old_stat.n_pages_made_young)
+ / time_elapsed,
+ (buf_pool->stat.n_pages_not_made_young
+ - buf_pool->old_stat.n_pages_not_made_young)
+ / time_elapsed,
+ (ulong) buf_pool->stat.n_pages_read,
+ (ulong) buf_pool->stat.n_pages_created,
+ (ulong) buf_pool->stat.n_pages_written,
+ (buf_pool->stat.n_pages_read
+ - buf_pool->old_stat.n_pages_read)
+ / time_elapsed,
+ (buf_pool->stat.n_pages_created
+ - buf_pool->old_stat.n_pages_created)
+ / time_elapsed,
+ (buf_pool->stat.n_pages_written
+ - buf_pool->old_stat.n_pages_written)
+ / time_elapsed);
+
+ n_gets_diff = buf_pool->stat.n_page_gets - buf_pool->old_stat.n_page_gets;
+
+ if (n_gets_diff) {
+ fprintf(file,
+ "Buffer pool hit rate %lu / 1000,"
+ " young-making rate %lu / 1000 not %lu / 1000\n",
+ (ulong)
+ (1000 - ((1000 * (buf_pool->stat.n_pages_read
+ - buf_pool->old_stat.n_pages_read))
+ / (buf_pool->stat.n_page_gets
+ - buf_pool->old_stat.n_page_gets))),
+ (ulong)
+ (1000 * (buf_pool->stat.n_pages_made_young
+ - buf_pool->old_stat.n_pages_made_young)
+ / n_gets_diff),
+ (ulong)
+ (1000 * (buf_pool->stat.n_pages_not_made_young
+ - buf_pool->old_stat.n_pages_not_made_young)
+ / n_gets_diff));
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
+ }
+
+ /* Statistics about read ahead algorithm */
+ fprintf(file, "Pages read ahead %.2f/s,"
+ " evicted without access %.2f/s\n",
+ (buf_pool->stat.n_ra_pages_read
+ - buf_pool->old_stat.n_ra_pages_read)
+ / time_elapsed,
+ (buf_pool->stat.n_ra_pages_evicted
+ - buf_pool->old_stat.n_ra_pages_evicted)
+ / time_elapsed);
+
+ /* Print some values to help us with visualizing what is
+ happening with LRU eviction. */
+ fprintf(file,
+ "LRU len: %lu, unzip_LRU len: %lu\n"
+ "I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
+ UT_LIST_GET_LEN(buf_pool->LRU),
+ UT_LIST_GET_LEN(buf_pool->unzip_LRU),
+ buf_LRU_stat_sum.io, buf_LRU_stat_cur.io,
+ buf_LRU_stat_sum.unzip, buf_LRU_stat_cur.unzip);
+
+ buf_refresh_io_stats();
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(&free_list_mutex);
+ mutex_exit(&buf_pool_mutex);
+ mutex_exit(&flush_list_mutex);
+}
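+/* Illustrative note (not part of the original source): the hit rate
+printed above is computed from per-interval deltas. For hypothetical
+deltas of 10000 page gets and 250 physical page reads since the last
+printout, the output would be
+
+	1000 - (1000 * 250) / 10000 = 975 / 1000
+
+i.e. about 97.5% of page requests were satisfied from the buffer pool. */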
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(void)
+/*======================*/
+{
+ buf_pool->last_printout_time = time(NULL);
+ buf_pool->old_stat = buf_pool->stat;
+}
+
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void)
+/*===============*/
+{
+ buf_chunk_t* chunk;
+ ulint i;
+
+ ut_ad(buf_pool);
+
+ //buf_pool_mutex_enter(); /* optimistic */
+
+ chunk = buf_pool->chunks;
+
+ for (i = buf_pool->n_chunks; i--; chunk++) {
+
+ const buf_block_t* block = buf_chunk_not_freed(chunk);
+
+ if (UNIV_LIKELY_NULL(block)) {
+ fprintf(stderr,
+ "Page %lu %lu still fixed or dirty\n",
+ (ulong) block->page.space,
+ (ulong) block->page.offset);
+ ut_error;
+ }
+ }
+
+ //buf_pool_mutex_exit(); /* optimistic */
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Checks that there currently are no pending i/o-operations for the buffer
+pool.
+@return TRUE if there is no pending i/o */
+UNIV_INTERN
+ibool
+buf_pool_check_no_pending_io(void)
+/*==============================*/
+{
+ ibool ret;
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
+ if (buf_pool->n_pend_reads + buf_pool->n_flush[BUF_FLUSH_LRU]
+ + buf_pool->n_flush[BUF_FLUSH_LIST]
+ + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]) {
+ ret = FALSE;
+ } else {
+ ret = TRUE;
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void)
+/*=======================*/
+{
+ ulint len;
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&free_list_mutex);
+
+ len = UT_LIST_GET_LEN(buf_pool->free);
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&free_list_mutex);
+
+ return(len);
+}
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Initializes a page in the buffer pool, for use in ibbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ buf_block_t* block) /*!< in: block to init */
+{
+ block->page.state = BUF_BLOCK_FILE_PAGE;
+ block->page.space = space;
+ block->page.offset = offset;
+
+ page_zip_des_init(&block->page.zip);
+
+ /* We assume that block->page.data has been allocated
+ with zip_size == UNIV_PAGE_SIZE. */
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+ ut_ad(ut_is_2pow(zip_size));
+ page_zip_set_size(&block->page.zip, zip_size);
+ if (zip_size) {
+ block->page.zip.data = block->frame + UNIV_PAGE_SIZE;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c
new file mode 100644
index 00000000000..0a03d583549
--- /dev/null
+++ b/storage/xtradb/buf/buf0flu.c
@@ -0,0 +1,1781 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.c
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+
+/**********************************************************************
+These statistics are generated for heuristics used in estimating the
+rate at which we should flush the dirty blocks to avoid bursty IO
+activity. Note that the rate of flushing depends not only on how many
+dirty pages we have in the buffer pool but is also a function of
+how much redo the workload is generating and at what rate. */
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Each interval is 1 second, defined by the rate at which
+srv_error_monitor_thread() calls buf_flush_stat_update(). */
+#define BUF_FLUSH_STAT_N_INTERVAL 20
+
+/** Sampled values buf_flush_stat_cur.
+Not protected by any mutex. Updated by buf_flush_stat_update(). */
+static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
+
+/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
+static ulint buf_flush_stat_arr_ind;
+
+/** Values at start of the current interval. Reset by
+buf_flush_stat_update(). */
+static buf_flush_stat_t buf_flush_stat_cur;
+
+/** Running sum of past values of buf_flush_stat_cur.
+Updated by buf_flush_stat_update(). Not protected by any mutex. */
+static buf_flush_stat_t buf_flush_stat_sum;
+
+/** Number of pages flushed through non flush_list flushes. */
+static ulint buf_lru_flush_page_count = 0;
+
+/* @} */
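+/* Illustrative note (not part of the original source): the variables above
+form a simple ring buffer of one-second samples. Assuming buf_flush_stat_t
+carries a redo counter field (an assumption based on the description
+above), an average rate over the whole window could be sketched as
+
+	avg_redo_per_sec = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL;
+
+once BUF_FLUSH_STAT_N_INTERVAL samples have been accumulated. */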
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(void);
+/*========================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/********************************************************************//**
+Inserts a block into the flush_rbt and returns a pointer to its
+predecessor, or NULL if there is no predecessor. The ordering is
+maintained on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor, or NULL if there is no predecessor */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+ buf_page_t* bpage) /*!< in: bpage to be inserted. */
+{
+ buf_page_t* prev = NULL;
+ const ib_rbt_node_t* c_node;
+ const ib_rbt_node_t* p_node;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&flush_list_mutex));
+
+ /* Insert this buffer into the rbt. */
+ c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+ ut_a(c_node != NULL);
+
+ /* Get the predecessor. */
+ p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+ if (p_node != NULL) {
+ prev = *rbt_value(buf_page_t*, p_node);
+ ut_a(prev != NULL);
+ }
+
+ return(prev);
+}
+
+/********************************************************************//**
+Delete a bpage from the flush_rbt. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+ buf_page_t* bpage) /*!< in: bpage to be removed. */
+{
+
+ ibool ret = FALSE;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&flush_list_mutex));
+ ret = rbt_delete(buf_pool->flush_rbt, &bpage);
+ ut_ad(ret);
+}
+
+/********************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain the ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+ const void* p1, /*!< in: block1 */
+ const void* p2) /*!< in: block2 */
+{
+ int ret;
+ const buf_page_t* b1;
+ const buf_page_t* b2;
+
+ ut_ad(p1 != NULL);
+ ut_ad(p2 != NULL);
+
+ b1 = *(const buf_page_t**) p1;
+ b2 = *(const buf_page_t**) p2;
+
+ ut_ad(b1 != NULL);
+ ut_ad(b2 != NULL);
+
+ ut_ad(b1->in_flush_list);
+ ut_ad(b2->in_flush_list);
+
+ if (b2->oldest_modification
+ > b1->oldest_modification) {
+ return(1);
+ }
+
+ if (b2->oldest_modification
+ < b1->oldest_modification) {
+ return(-1);
+ }
+
+ /* If oldest_modification is same then decide on the space. */
+ ret = (int)(b2->space - b1->space);
+
+ /* Or else decide ordering on the offset field. */
+ return(ret ? ret : (int)(b2->offset - b1->offset));
+}
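+/* Illustrative example (not part of the original source): for two
+hypothetical dirty pages
+
+	b1 = {oldest_modification = 100, space = 5, offset = 7}
+	b2 = {oldest_modification = 100, space = 5, offset = 3}
+
+the oldest_modification and space comparisons both yield 0, so the
+function falls through to (int) (b2->offset - b1->offset) = -4. */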
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during the recovery process. Should be called at the start of the
+recovery process, before any page has been read or written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+ //buf_pool_mutex_enter();
+ mutex_enter(&flush_list_mutex);
+
+ /* Create red black tree for speedy insertions in flush list. */
+ buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
+ buf_flush_block_cmp);
+ //buf_pool_mutex_exit();
+ mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+ //buf_pool_mutex_enter();
+ mutex_enter(&flush_list_mutex);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+ rbt_free(buf_pool->flush_rbt);
+ buf_pool->flush_rbt = NULL;
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_block_t* block) /*!< in/out: block which is modified */
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(mutex_own(&flush_list_mutex));
+ ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+ || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+ <= block->page.oldest_modification));
+
+ /* If we are in the recovery then we need to update the flush
+ red-black tree as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_insert_sorted_into_flush_list(block);
+ return;
+ }
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.in_LRU_list);
+ ut_ad(block->page.in_page_hash);
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(!block->page.in_flush_list);
+ ut_d(block->page.in_flush_list = TRUE);
+ UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_block_t* block) /*!< in/out: block which is modified */
+{
+ buf_page_t* prev_b;
+ buf_page_t* b;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+ ut_ad(mutex_own(&flush_list_mutex));
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ ut_ad(block->page.in_LRU_list);
+ ut_ad(block->page.in_page_hash);
+ ut_ad(!block->page.in_zip_hash);
+ ut_ad(!block->page.in_flush_list);
+ ut_d(block->page.in_flush_list = TRUE);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+ } else {
+ UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+ }
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ prev_b = NULL;
+
+ /* For the most part when this function is called the flush_rbt
+ should not be NULL. In a very rare boundary case it is possible
+ that the flush_rbt has already been freed by the recovery thread
+ before the last page was hooked up in the flush_list by the
+ io-handler thread. In that case we'll just do a simple
+ linear search in the else block. */
+ if (buf_pool->flush_rbt) {
+
+ prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+ } else {
+
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (b && b->oldest_modification
+ > block->page.oldest_modification) {
+ ut_ad(b->in_flush_list);
+ prev_b = b;
+ b = UT_LIST_GET_NEXT(flush_list, b);
+ }
+ }
+
+ if (prev_b == NULL) {
+ UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
+ } else {
+ UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
+ prev_b, &block->page);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage) /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ //ut_ad(bpage->in_LRU_list); /* optimistic use */
+
+ if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) {
+
+ return(bpage->oldest_modification == 0
+ && buf_page_get_io_fix(bpage) == BUF_IO_NONE
+ && bpage->buf_fix_count == 0);
+ }
+
+	/* It is permitted not to own the LRU list mutex here. */
+/*
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: buffer block state %lu"
+ " in the LRU list!\n",
+ (ulong) buf_page_get_state(bpage));
+ ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+ putc('\n', stderr);
+*/
+
+ return(FALSE);
+}
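+/* Illustrative note (not part of the original source): a page is
+immediately replaceable only when all three conditions above hold, i.e.
+
+	oldest_modification == 0 && io_fix == BUF_IO_NONE && buf_fix_count == 0
+
+so a clean but buffer-fixed page, for example, is not eligible until its
+fix count drops to zero. */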
+
+/********************************************************************//**
+Returns TRUE if the block is modified and ready for flushing.
+@return TRUE if can flush immediately */
+UNIV_INLINE
+ibool
+buf_flush_ready_for_flush(
+/*======================*/
+ buf_page_t* bpage, /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) */
+ enum buf_flush flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+{
+ //ut_a(buf_page_in_file(bpage));
+ //ut_ad(buf_pool_mutex_own()); /*optimistic...*/
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+ if (buf_page_in_file(bpage) && bpage->oldest_modification != 0
+ && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
+ ut_ad(bpage->in_flush_list);
+
+ if (flush_type != BUF_FLUSH_LRU) {
+
+ return(TRUE);
+
+ } else if (bpage->buf_fix_count == 0) {
+
+ /* If we are flushing the LRU list, to avoid deadlocks
+ we require the block not to be bufferfixed, and hence
+ not latched. */
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ mutex_enter(&flush_list_mutex);
+
+ ut_ad(bpage->in_flush_list);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ /* clean compressed pages should not be on the flush list */
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ mutex_exit(&flush_list_mutex);
+ ut_error;
+ return;
+ case BUF_BLOCK_ZIP_DIRTY:
+ buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
+ UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+ buf_LRU_insert_zip_clean(bpage);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+ break;
+ }
+
+ /* If the flush_rbt is active then delete from it as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in the comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ bpage->oldest_modification = 0;
+
+ ut_d(UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
+ ut_ad(ut_list_node_313->in_flush_list)));
+ mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage) /*!< in/out: destination block */
+{
+ buf_page_t* prev;
+ buf_page_t* prev_b = NULL;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&flush_list_mutex));
+
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ ut_ad(bpage->in_flush_list);
+ ut_ad(dpage->in_flush_list);
+
+ /* If recovery is active we must swap the control blocks in
+ the flush_rbt as well. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ buf_flush_delete_from_flush_rbt(bpage);
+ prev_b = buf_flush_insert_in_flush_rbt(dpage);
+ }
+
+ /* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in the comparison function. */
+ ut_d(bpage->in_flush_list = FALSE);
+
+ prev = UT_LIST_GET_PREV(flush_list, bpage);
+ UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+
+ if (prev) {
+ ut_ad(prev->in_flush_list);
+ UT_LIST_INSERT_AFTER(
+ flush_list,
+ buf_pool->flush_list,
+ prev, dpage);
+ } else {
+ UT_LIST_ADD_FIRST(
+ flush_list,
+ buf_pool->flush_list,
+ dpage);
+ }
+
+ /* Just an extra check. Previous in flush_list
+ should be the same control block as in flush_rbt. */
+ ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ enum buf_flush flush_type;
+
+ ut_ad(bpage);
+
+ buf_flush_remove(bpage);
+
+ flush_type = buf_page_get_flush_type(bpage);
+ buf_pool->n_flush[flush_type]--;
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ /* Put the block to the end of the LRU list to wait to be
+ moved to the free list */
+
+ buf_LRU_make_block_old(bpage);
+
+ buf_pool->LRU_flush_ended++;
+ }
+
+ /* fprintf(stderr, "n pending flush %lu\n",
+ buf_pool->n_flush[flush_type]); */
+
+ if ((buf_pool->n_flush[flush_type] == 0)
+ && (buf_pool->init_flush[flush_type] == FALSE)) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+}
+
+/********************************************************************//**
+Flush a batch of writes to the datafiles that have already been
+written by the OS. */
+static
+void
+buf_flush_sync_datafiles(void)
+/*==========================*/
+{
+ /* Wake possible simulated aio thread to actually post the
+ writes to the operating system */
+ os_aio_simulated_wake_handler_threads();
+
+	/* Wait until all async writes to tablespaces have been posted to
+ the OS */
+ os_aio_wait_until_no_pending_writes();
+
+ /* Now we flush the data to disk (for example, with fsync) */
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ return;
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+and also wakes up the aio thread if simulated aio is used. It is very
+important to call this function after a batch of writes has been posted,
+and also when we may have to wait for a page latch! Otherwise a deadlock
+of threads can occur. */
+static
+void
+buf_flush_buffered_writes(void)
+/*===========================*/
+{
+ byte* write_buf;
+ ulint len;
+ ulint len2;
+ ulint i;
+
+ if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
+ /* Sync the writes to the disk. */
+ buf_flush_sync_datafiles();
+ return;
+ }
+
+ mutex_enter(&(trx_doublewrite->mutex));
+
+ /* Write first to doublewrite buffer blocks. We use synchronous
+ aio and thus know that file write has been completed when the
+ control returns. */
+
+ if (trx_doublewrite->first_free == 0) {
+
+ mutex_exit(&(trx_doublewrite->mutex));
+
+ return;
+ }
+
+ for (i = 0; i < trx_doublewrite->first_free; i++) {
+
+ const buf_block_t* block;
+
+ block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
+
+ if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+ || block->page.zip.data) {
+ /* No simple validate for compressed pages exists. */
+ continue;
+ }
+
+ if (UNIV_UNLIKELY
+ (memcmp(block->frame + (FIL_PAGE_LSN + 4),
+ block->frame + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ 4))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be written"
+ " seems corrupt!\n"
+ "InnoDB: The lsn fields do not match!"
+ " Noticed in the buffer pool\n"
+ "InnoDB: before posting to the"
+ " doublewrite buffer.\n");
+ }
+
+ if (!block->check_index_page_at_flush) {
+ } else if (page_is_comp(block->frame)) {
+ if (UNIV_UNLIKELY
+ (!page_simple_validate_new(block->frame))) {
+corrupted_page:
+ buf_page_print(block->frame, 0);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Apparent corruption of an"
+ " index page n:o %lu in space %lu\n"
+ "InnoDB: to be written to data file."
+ " We intentionally crash server\n"
+ "InnoDB: to prevent corrupt data"
+ " from ending up in data\n"
+ "InnoDB: files.\n",
+ (ulong) buf_block_get_page_no(block),
+ (ulong) buf_block_get_space(block));
+
+ ut_error;
+ }
+ } else if (UNIV_UNLIKELY
+ (!page_simple_validate_old(block->frame))) {
+
+ goto corrupted_page;
+ }
+ }
+
+ /* increment the doublewrite flushed pages counter */
+ srv_dblwr_pages_written+= trx_doublewrite->first_free;
+ srv_dblwr_writes++;
+
+ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+ trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
+
+ write_buf = trx_doublewrite->write_buf;
+ i = 0;
+
+ fil_io(OS_FILE_WRITE, TRUE,
+ (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
+ trx_doublewrite->block1, 0, len,
+ (void*) write_buf, NULL);
+
+ for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
+ len2 += UNIV_PAGE_SIZE, i++) {
+ const buf_block_t* block = (buf_block_t*)
+ trx_doublewrite->buf_block_arr[i];
+
+ if (UNIV_LIKELY(!block->page.zip.data)
+ && UNIV_LIKELY(buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE)
+ && UNIV_UNLIKELY
+ (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
+ write_buf + len2
+ + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be written"
+ " seems corrupt!\n"
+ "InnoDB: The lsn fields do not match!"
+ " Noticed in the doublewrite block1.\n");
+ }
+ }
+
+ if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ goto flush;
+ }
+
+ len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ * UNIV_PAGE_SIZE;
+
+ write_buf = trx_doublewrite->write_buf
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+ ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
+
+ fil_io(OS_FILE_WRITE, TRUE,
+ (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
+ trx_doublewrite->block2, 0, len,
+ (void*) write_buf, NULL);
+
+ for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
+ len2 += UNIV_PAGE_SIZE, i++) {
+ const buf_block_t* block = (buf_block_t*)
+ trx_doublewrite->buf_block_arr[i];
+
+ if (UNIV_LIKELY(!block->page.zip.data)
+ && UNIV_LIKELY(buf_block_get_state(block)
+ == BUF_BLOCK_FILE_PAGE)
+ && UNIV_UNLIKELY
+ (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
+ write_buf + len2
+ + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be"
+ " written seems corrupt!\n"
+ "InnoDB: The lsn fields do not match!"
+ " Noticed in"
+ " the doublewrite block2.\n");
+ }
+ }
+
+flush:
+ /* Now flush the doublewrite buffer data to disk */
+
+ fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+ blocks. Next do the writes to the intended positions. */
+
+ for (i = 0; i < trx_doublewrite->first_free; i++) {
+ const buf_block_t* block = (buf_block_t*)
+ trx_doublewrite->buf_block_arr[i];
+
+ ut_a(buf_page_in_file(&block->page));
+ if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, buf_page_get_space(&block->page),
+ buf_page_get_zip_size(&block->page),
+ buf_page_get_page_no(&block->page), 0,
+ buf_page_get_zip_size(&block->page),
+ (void*)block->page.zip.data,
+ (void*)block);
+
+ /* Increment the counter of I/O operations used
+ for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+
+ continue;
+ }
+
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
+ block->frame
+ + (UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+ 4))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: The page to be written"
+ " seems corrupt!\n"
+ "InnoDB: The lsn fields do not match!"
+ " Noticed in the buffer pool\n"
+ "InnoDB: after posting and flushing"
+ " the doublewrite buffer.\n"
+ "InnoDB: Page buf fix count %lu,"
+ " io fix %lu, state %lu\n",
+ (ulong)block->page.buf_fix_count,
+ (ulong)buf_block_get_io_fix(block),
+ (ulong)buf_block_get_state(block));
+ }
+
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, buf_block_get_space(block), 0,
+ buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+ (void*)block->frame, (void*)block);
+
+ /* Increment the counter of I/O operations used
+ for selecting LRU policy. */
+ buf_LRU_stat_inc_io();
+ }
+
+ /* Sync the writes to the disk. */
+ buf_flush_sync_datafiles();
+
+ /* We can now reuse the doublewrite memory buffer: */
+ trx_doublewrite->first_free = 0;
+
+ mutex_exit(&(trx_doublewrite->mutex));
+}
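+/* Illustrative summary (not part of the original source) of the write
+ordering enforced by the function above when the doublewrite buffer is in
+use:
+
+	1. copy the buffered pages into the doublewrite area with
+	   synchronous fil_io() writes,
+	2. fil_flush() the doublewrite space,
+	3. post asynchronous writes to the pages' real locations,
+	4. buf_flush_sync_datafiles() to fsync the tablespaces.
+
+A write torn in step 3 can therefore be repaired from the doublewrite
+copy during crash recovery. */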
+
+/********************************************************************//**
+Posts a buffer page for writing. If the doublewrite memory buffer is
+full, calls buf_flush_buffered_writes and waits for free space to
+appear. */
+static
+void
+buf_flush_post_to_doublewrite_buf(
+/*==============================*/
+ buf_page_t* bpage) /*!< in: buffer block to write */
+{
+ ulint zip_size;
+try_again:
+ mutex_enter(&(trx_doublewrite->mutex));
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (trx_doublewrite->first_free
+ >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ mutex_exit(&(trx_doublewrite->mutex));
+
+ buf_flush_buffered_writes();
+
+ goto try_again;
+ }
+
+ zip_size = buf_page_get_zip_size(bpage);
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+ /* Copy the compressed page and clear the rest. */
+ memcpy(trx_doublewrite->write_buf
+ + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+ bpage->zip.data, zip_size);
+ memset(trx_doublewrite->write_buf
+ + UNIV_PAGE_SIZE * trx_doublewrite->first_free
+ + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+
+ memcpy(trx_doublewrite->write_buf
+ + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+ ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+ }
+
+ trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
+
+ trx_doublewrite->first_free++;
+
+ if (trx_doublewrite->first_free
+ >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ mutex_exit(&(trx_doublewrite->mutex));
+
+ buf_flush_buffered_writes();
+
+ return;
+ }
+
+ mutex_exit(&(trx_doublewrite->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ ib_uint64_t newest_lsn) /*!< in: newest modification lsn
+ to the page */
+{
+ ut_ad(page);
+
+ if (page_zip_) {
+ page_zip_des_t* page_zip = page_zip_;
+ ulint zip_size = page_zip_get_size(page_zip);
+ ut_ad(zip_size);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ memcpy(page_zip->data, page, zip_size);
+ /* fall through */
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ case FIL_PAGE_INDEX:
+ mach_write_ull(page_zip->data
+ + FIL_PAGE_LSN, newest_lsn);
+ memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ mach_write_to_4(page_zip->data
+ + FIL_PAGE_SPACE_OR_CHKSUM,
+ srv_use_checksums
+ ? page_zip_calc_checksum(
+ page_zip->data, zip_size)
+ : BUF_NO_CHECKSUM_MAGIC);
+ return;
+ }
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ERROR: The compressed page to be written"
+ " seems corrupt:", stderr);
+ ut_print_buf(stderr, page, zip_size);
+ fputs("\nInnoDB: Possibly older version of the page:", stderr);
+ ut_print_buf(stderr, page_zip->data, zip_size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ /* Write the newest modification lsn to the page header and trailer */
+ mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);
+
+ mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ newest_lsn);
+
+ /* Store the new formula checksum */
+
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ srv_use_checksums
+ ? (!srv_fast_checksum
+ ? buf_calc_page_new_checksum(page)
+ : buf_calc_page_new_checksum_32(page))
+ : BUF_NO_CHECKSUM_MAGIC);
+
+ /* We overwrite the first 4 bytes of the end lsn field to store
+ the old formula checksum. Since it depends also on the field
+ FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
+ new formula checksum. */
+
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ srv_use_checksums
+ ? buf_calc_page_old_checksum(page)
+ : BUF_NO_CHECKSUM_MAGIC);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Does an asynchronous write of a buffer page. NOTE: in simulated aio and
+also when the doublewrite buffer is used, we must call
+buf_flush_buffered_writes after we have posted a batch of writes! */
+static
+void
+buf_flush_write_block_low(
+/*======================*/
+ buf_page_t* bpage) /*!< in: buffer block to write */
+{
+ ulint zip_size = buf_page_get_zip_size(bpage);
+ page_t* frame = NULL;
+#ifdef UNIV_LOG_DEBUG
+ static ibool univ_log_debug_warned;
+#endif /* UNIV_LOG_DEBUG */
+
+ ut_ad(buf_page_in_file(bpage));
+
+ /* We are not holding buf_pool_mutex or block_mutex here.
+ Nevertheless, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it cannot be
+ relocated in the buffer pool or removed from flush_list or
+ LRU_list. */
+ //ut_ad(!buf_pool_mutex_own());
+ ut_ad(!mutex_own(&LRU_list_mutex));
+ ut_ad(!mutex_own(&flush_list_mutex));
+ ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
+ ut_ad(bpage->oldest_modification != 0);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif
+ ut_ad(bpage->newest_modification != 0);
+
+#ifdef UNIV_LOG_DEBUG
+ if (!univ_log_debug_warned) {
+ univ_log_debug_warned = TRUE;
+ fputs("Warning: cannot force log to disk if"
+ " UNIV_LOG_DEBUG is defined!\n"
+ "Crash recovery will not work!\n",
+ stderr);
+ }
+#else
+ /* Force the log to the disk before writing the modified block */
+ log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
+#endif
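+ /* Select the frame that will be written and stamp the newest
+ modification LSN on it; for file pages,
+ buf_flush_init_for_writing() also computes the checksums. */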
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ frame = bpage->zip.data;
+ if (UNIV_LIKELY(srv_use_checksums)) {
+ ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
+ == page_zip_calc_checksum(frame, zip_size));
+ }
+ mach_write_ull(frame + FIL_PAGE_LSN,
+ bpage->newest_modification);
+ memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = bpage->zip.data;
+ if (!frame) {
+ frame = ((buf_block_t*) bpage)->frame;
+ }
+
+ buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
+ bpage->zip.data
+ ? &bpage->zip : NULL,
+ bpage->newest_modification);
+ break;
+ }
+
+ if (!srv_use_doublewrite_buf || !trx_doublewrite) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, buf_page_get_space(bpage), zip_size,
+ buf_page_get_page_no(bpage), 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ frame, bpage);
+ } else {
+ buf_flush_post_to_doublewrite_buf(bpage);
+ }
+}
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
+held upon entering this function, and they will be released by this
+function. */
+static
+void
+buf_flush_page(
+/*===========*/
+ buf_page_t* bpage, /*!< in: buffer control block */
+ enum buf_flush flush_type) /*!< in: BUF_FLUSH_LRU
+ or BUF_FLUSH_LIST */
+{
+ mutex_t* block_mutex;
+ ibool is_uncompressed;
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+ //ut_ad(buf_pool_mutex_own());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)
+ || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED));
+#endif
+ ut_ad(buf_page_in_file(bpage));
+
+ block_mutex = buf_page_get_mutex(bpage);
+ ut_ad(mutex_own(block_mutex));
+
+ mutex_enter(&buf_pool_mutex);
+ rw_lock_s_unlock(&page_hash_latch);
+
+ ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
+
+ buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+
+ buf_page_set_flush_type(bpage, flush_type);
+
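+ /* If this is the first page of a new flush batch of this
+ type, reset the 'no flush in progress' event so that
+ buf_flush_wait_batch_end() will wait until the batch ends. */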
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ os_event_reset(buf_pool->no_flush[flush_type]);
+ }
+
+ buf_pool->n_flush[flush_type]++;
+
+ is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));
+
+ switch (flush_type) {
+ ibool is_s_latched;
+ case BUF_FLUSH_LIST:
+ /* If the simulated aio thread is not running, we must
+ not wait for any latch, as we may end up in a deadlock:
+ if buf_fix_count == 0, then we know we need not wait */
+
+ is_s_latched = (bpage->buf_fix_count == 0);
+ if (is_s_latched && is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ mutex_exit(block_mutex);
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ /* Even though bpage is not protected by any mutex at
+ this point, it is safe to access bpage, because it is
+ io_fixed and oldest_modification != 0. Thus, it
+ cannot be relocated in the buffer pool or removed from
+ flush_list or LRU_list. */
+
+ if (!is_s_latched) {
+ buf_flush_buffered_writes();
+
+ if (is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)
+ ->lock, BUF_IO_WRITE);
+ }
+ }
+
+ break;
+
+ case BUF_FLUSH_LRU:
+ /* VERY IMPORTANT:
+ Because any thread may call the LRU flush, even when owning
+ locks on pages, to avoid deadlocks, we must make sure that the
+ s-lock is acquired on the page without waiting: this is
+ accomplished because buf_flush_ready_for_flush() must hold,
+ and that requires the page not to be buffer-fixed. */
+
+ if (is_uncompressed) {
+ rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+ BUF_IO_WRITE);
+ }
+
+ /* Note that the s-latch is acquired before releasing the
+ buf_pool mutex: this ensures that the latch is acquired
+ immediately. */
+
+ mutex_exit(block_mutex);
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ /* Even though bpage is not protected by any mutex at this
+ point, it is safe to access bpage, because it is io_fixed and
+ oldest_modification != 0. Thus, it cannot be relocated in the
+ buffer pool or removed from flush_list or LRU_list. */
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Flushing %u space %u page %u\n",
+ flush_type, bpage->space, bpage->offset);
+ }
+#endif /* UNIV_DEBUG */
+ buf_flush_write_block_low(bpage);
+}
+
+/***********************************************************//**
+Flushes to disk all flushable pages within the flush area.
+@return number of pages flushed */
+static
+ulint
+buf_flush_try_neighbors(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset */
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST */
+ ulint flush_neighbors)
+{
+ buf_page_t* bpage;
+ ulint low, high;
+ ulint count = 0;
+ ulint i;
+
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !flush_neighbors) {
+ /* If there is little space, it is better not to flush any
+ block except from the end of the LRU list */
+
+ low = offset;
+ high = offset + 1;
+ } else {
+ /* When flushed, dirty blocks are searched in neighborhoods of
+ this size, and flushed along with the original page. */
+
+ ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA,
+ buf_pool->curr_size / 16);
+
+ low = (offset / buf_flush_area) * buf_flush_area;
+ high = (offset / buf_flush_area + 1) * buf_flush_area;
+ }
+
+ /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+
+ if (high > fil_space_get_size(space)) {
+ high = fil_space_get_size(space);
+ }
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+
+ for (i = low; i < high; i++) {
+
+ bpage = buf_page_hash_get(space, i);
+
+ if (!bpage) {
+
+ continue;
+ }
+
+ ut_a(buf_page_in_file(bpage));
+
+ /* We avoid flushing 'non-old' blocks in an LRU flush,
+ because the flushed blocks are soon freed */
+
+ if (flush_type != BUF_FLUSH_LRU
+ || i == offset
+ || buf_page_is_old(bpage)) {
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+
+ if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)
+ && (i == offset || !bpage->buf_fix_count)) {
+ /* We only try to flush those
+ neighbors != offset where the buf fix count is
+ zero, as we then know that we probably can
+ latch the page without a semaphore wait.
+ Semaphore waits are expensive because we must
+ flush the doublewrite buffer before we start
+ waiting. */
+
+ buf_flush_page(bpage, flush_type);
+ ut_ad(!mutex_own(block_mutex));
+ count++;
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+ } else if (block_mutex) {
+ mutex_exit(block_mutex);
+ }
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+ ulint min_n, /*!< in: desired minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+ ib_uint64_t lsn_limit) /*!< in: in the case of BUF_FLUSH_LIST, all
+ blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+{
+ buf_page_t* bpage;
+ buf_page_t* prev_bpage = NULL;
+ ulint page_count = 0;
+ ulint old_page_count;
+ ulint space;
+ ulint offset;
+ ulint remaining = 0;
+
+ ut_ad((flush_type == BUF_FLUSH_LRU)
+ || (flush_type == BUF_FLUSH_LIST));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((flush_type != BUF_FLUSH_LIST)
+ || sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
+ if ((buf_pool->n_flush[flush_type] > 0)
+ || (buf_pool->init_flush[flush_type] == TRUE)) {
+
+ /* There is already a flush batch of the same type running */
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ return(ULINT_UNDEFINED);
+ }
+
+ buf_pool->init_flush[flush_type] = TRUE;
+
+ mutex_exit(&buf_pool_mutex);
+
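+ /* An LRU flush traverses the LRU list, which is protected
+ here by its own mutex instead of the global buffer pool
+ mutex. */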
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_enter(&LRU_list_mutex);
+ }
+
+ for (;;) {
+flush_next:
+ /* If we have flushed enough, leave the loop */
+ if (page_count >= min_n) {
+
+ break;
+ }
+
+ /* Start from the end of the list looking for a suitable
+ block to be flushed. */
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ } else {
+ ut_ad(flush_type == BUF_FLUSH_LIST);
+
+ mutex_enter(&flush_list_mutex);
+ remaining = UT_LIST_GET_LEN(buf_pool->flush_list);
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+ if (bpage) {
+ prev_bpage = UT_LIST_GET_PREV(flush_list, bpage);
+ }
+ mutex_exit(&flush_list_mutex);
+ if (!bpage
+ || bpage->oldest_modification >= lsn_limit) {
+ /* We have flushed enough */
+
+ break;
+ }
+ ut_ad(bpage->in_flush_list);
+ }
+
+ /* Note that after finding a single flushable page, we try to
+ flush also all its neighbors, and after that start from the
+ END of the LRU list or flush list again: the list may change
+ during the flushing and we cannot safely preserve within this
+ function a pointer to a block in the list! */
+
+ do {
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+ ibool ready;
+
+ //ut_a(buf_page_in_file(bpage));
+
+ if (block_mutex) {
+ ready = buf_flush_ready_for_flush(bpage, flush_type);
+ mutex_exit(block_mutex);
+ } else {
+ ready = FALSE;
+ }
+
+ if (ready) {
+ space = buf_page_get_space(bpage);
+ offset = buf_page_get_page_no(bpage);
+
+ //buf_pool_mutex_exit();
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(&LRU_list_mutex);
+ }
+
+ old_page_count = page_count;
+
+ /* Try to flush also all the neighbors */
+ page_count += buf_flush_try_neighbors(
+ space, offset, flush_type, srv_flush_neighbor_pages);
+ /* fprintf(stderr,
+ "Flush type %lu, page no %lu, neighb %lu\n",
+ flush_type, offset,
+ page_count - old_page_count); */
+
+ //buf_pool_mutex_enter();
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_enter(&LRU_list_mutex);
+ }
+ goto flush_next;
+
+ } else if (flush_type == BUF_FLUSH_LRU) {
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ } else {
+ ut_ad(flush_type == BUF_FLUSH_LIST);
+
+ mutex_enter(&flush_list_mutex);
+ bpage = UT_LIST_GET_PREV(flush_list, bpage);
+ //ut_ad(!bpage || bpage->in_flush_list); /* optimistic */
+ if (bpage != prev_bpage) {
+ /* the list has changed under us; restart the search */
+ bpage = NULL;
+ }
+ if (bpage) {
+ prev_bpage = UT_LIST_GET_PREV(flush_list, bpage);
+ }
+ mutex_exit(&flush_list_mutex);
+ remaining--;
+ }
+ } while (bpage != NULL);
+
+ if (remaining)
+ goto flush_next;
+
+ /* If we could not find anything to flush, leave the loop */
+
+ break;
+ }
+
+ if (flush_type == BUF_FLUSH_LRU) {
+ mutex_exit(&LRU_list_mutex);
+ }
+
+ mutex_enter(&buf_pool_mutex);
+
+ buf_pool->init_flush[flush_type] = FALSE;
+
+ if (buf_pool->n_flush[flush_type] == 0) {
+
+ /* The running flush batch has ended */
+
+ os_event_set(buf_pool->no_flush[flush_type]);
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ buf_flush_buffered_writes();
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && page_count > 0) {
+ ut_a(flush_type == BUF_FLUSH_LRU
+ || flush_type == BUF_FLUSH_LIST);
+ fprintf(stderr, flush_type == BUF_FLUSH_LRU
+ ? "Flushed %lu pages in LRU flush\n"
+ : "Flushed %lu pages in flush list flush\n",
+ (ulong) page_count);
+ }
+#endif /* UNIV_DEBUG */
+
+ srv_buf_pool_flushed += page_count;
+
+ /* We keep track of all flushes happening as part of LRU
+ flush. When estimating the desired rate at which flush_list
+ should be flushed we factor in this value. */
+ if (flush_type == BUF_FLUSH_LRU) {
+ buf_lru_flush_page_count += page_count;
+ }
+
+ return(page_count);
+}
+
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ enum buf_flush type) /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+{
+ ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
+
+ os_event_wait(buf_pool->no_flush[type]);
+}
+
+/******************************************************************//**
+Gives a recommendation of how many blocks should be flushed to establish
+a big enough margin of replaceable blocks near the end of the LRU list
+and in the free list.
+@return number of blocks which should be flushed from the end of the
+LRU list */
+static
+ulint
+buf_flush_LRU_recommendation(void)
+/*==============================*/
+{
+ buf_page_t* bpage;
+ ulint n_replaceable;
+ ulint distance = 0;
+ ibool have_LRU_mutex = FALSE;
+
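+ /* When the unzip_LRU list is in use we scan under the LRU
+ list mutex from the start; otherwise the first pass below is
+ optimistic and is repeated under the mutex only if the
+ margin looks too small (see the retry below). */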
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU))
+ have_LRU_mutex = TRUE;
+retry:
+ //buf_pool_mutex_enter();
+ if (have_LRU_mutex)
+ mutex_enter(&LRU_list_mutex);
+
+ n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while ((bpage != NULL)
+ && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
+ + BUF_FLUSH_EXTRA_MARGIN)
+ && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
+
+ mutex_t* block_mutex;
+ if (!bpage->in_LRU_list) {
+ /* restart the scan; this check is optimistic */
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ continue;
+ }
+ block_mutex = buf_page_get_mutex_enter(bpage);
+
+ if (block_mutex && buf_flush_ready_for_replace(bpage)) {
+ n_replaceable++;
+ }
+
+ if (block_mutex) {
+ mutex_exit(block_mutex);
+ }
+
+ distance++;
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+
+ //buf_pool_mutex_exit();
+ if (have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+
+ if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
+
+ return(0);
+ } else if (!have_LRU_mutex) {
+ /* confirm it again with LRU_mutex for exactness */
+ have_LRU_mutex = TRUE;
+ distance = 0;
+ goto retry;
+ }
+
+ return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
+ - n_replaceable);
+}
+
+/*********************************************************************//**
+Flushes pages from the end of the LRU list if there is too small a margin
+of replaceable pages there or in the free list. VERY IMPORTANT: this function
+is called also by threads which have locks on pages. To avoid deadlocks, we
+flush only pages such that the s-lock required for flushing can be acquired
+immediately, without waiting. */
+UNIV_INTERN
+void
+buf_flush_free_margin(
+/*=======================*/
+ ibool wait)
+{
+ ulint n_to_flush;
+ ulint n_flushed;
+
+ n_to_flush = buf_flush_LRU_recommendation();
+
+ if (n_to_flush > 0) {
+ n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
+ if (wait && n_flushed == ULINT_UNDEFINED) {
+ /* There was an LRU type flush batch already running;
+ let us wait for it to end */
+
+ buf_flush_wait_batch_end(BUF_FLUSH_LRU);
+ }
+ }
+}
+
+/*********************************************************************
+Update the historical stats that we are collecting for flush rate
+heuristics at the end of each interval.
+Flush rate heuristic depends on (a) rate of redo log generation and
+(b) the rate at which LRU flush is happening. */
+UNIV_INTERN
+void
+buf_flush_stat_update(void)
+/*=======================*/
+{
+ buf_flush_stat_t* item;
+ ib_uint64_t lsn_diff;
+ ib_uint64_t lsn;
+ ulint n_flushed;
+
+ lsn = log_get_lsn();
+ if (buf_flush_stat_cur.redo == 0) {
+ /* First time around. Just update the current LSN
+ and return. */
+ buf_flush_stat_cur.redo = lsn;
+ return;
+ }
+
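+ /* buf_flush_stat_arr is a circular buffer of the last
+ BUF_FLUSH_STAT_N_INTERVAL intervals; 'item' is the oldest
+ slot, which is overwritten with this interval's values. */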
+ item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
+
+ /* values for this interval */
+ lsn_diff = lsn - buf_flush_stat_cur.redo;
+ n_flushed = buf_lru_flush_page_count
+ - buf_flush_stat_cur.n_flushed;
+
+ /* add the current value and subtract the obsolete entry. */
+ buf_flush_stat_sum.redo += lsn_diff - item->redo;
+ buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
+
+ /* put current entry in the array. */
+ item->redo = lsn_diff;
+ item->n_flushed = n_flushed;
+
+ /* update the index */
+ buf_flush_stat_arr_ind++;
+ buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
+
+ /* reset the current entry. */
+ buf_flush_stat_cur.redo = lsn;
+ buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
+}
+
+/*********************************************************************
+Determines the fraction of dirty pages that need to be flushed based
+on the speed at which we generate redo log. Note that if redo log
+is generated at a significant rate without corresponding increase
+in the number of dirty pages (for example, an in-memory workload)
+it can cause IO bursts of flushing. This function implements heuristics
+to avoid this burstiness.
+@return number of dirty pages to be flushed / second */
+UNIV_INTERN
+ulint
+buf_flush_get_desired_flush_rate(void)
+/*==================================*/
+{
+ ulint redo_avg;
+ ulint lru_flush_avg;
+ ulint n_dirty;
+ ulint n_flush_req;
+ lint rate;
+ ib_uint64_t lsn = log_get_lsn();
+ ulint log_capacity = log_get_capacity();
+
+ /* log_capacity should never be zero after the initialization
+ of log subsystem. */
+ ut_ad(log_capacity != 0);
+
+ /* Get total number of dirty pages. It is OK to access
+ flush_list without holding any mutex as we are using this
+ only for heuristics. */
+ n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+ /* An overflow can happen if we generate more than 2^32 bytes
+ of redo in this interval i.e.: 4G of redo in 1 second. We can
+ safely consider this as infinity because if we ever come close
+ to 4G we'll start a synchronous flush of dirty pages. */
+ /* redo_avg below is the average rate at which redo has been
+ generated over the past BUF_FLUSH_STAT_N_INTERVAL intervals,
+ plus the redo generated in the current interval. */
+ redo_avg = (ulint) (buf_flush_stat_sum.redo
+ / BUF_FLUSH_STAT_N_INTERVAL
+ + (lsn - buf_flush_stat_cur.redo));
+
+ /* An overflow can happen possibly if we flush more than 2^32
+ pages in BUF_FLUSH_STAT_N_INTERVAL. This is an extremely
+ unlikely scenario. Even when this happens it means that our
+ flush rate will be off the mark. It won't affect correctness
+ of any subsystem. */
+ /* lru_flush_avg below is the average rate at which pages have
+ been flushed by the LRU flush over the past
+ BUF_FLUSH_STAT_N_INTERVAL intervals, plus the number of pages
+ flushed in the current interval. */
+ lru_flush_avg = buf_flush_stat_sum.n_flushed
+ / BUF_FLUSH_STAT_N_INTERVAL
+ + (buf_lru_flush_page_count
+ - buf_flush_stat_cur.n_flushed);
+
+ n_flush_req = (n_dirty * redo_avg) / log_capacity;
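+ /* For example, with 10000 dirty pages, a redo_avg of 1 MB/s
+ and a log capacity of 100 MB, n_flush_req works out to
+ 100 pages per second. */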
+
+ /* The number of pages that we want to flush from the flush
+ list is the difference between the required rate and the
+ number of pages that we are historically flushing from the
+ LRU list */
+ rate = n_flush_req - lru_flush_avg;
+ return(rate > 0 ? (ulint) rate : 0);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+static
+ibool
+buf_flush_validate_low(void)
+/*========================*/
+{
+ buf_page_t* bpage;
+ const ib_rbt_node_t* rnode = NULL;
+
+ UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
+ ut_ad(ut_list_node_313->in_flush_list));
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ /* If we are in recovery mode i.e.: flush_rbt != NULL
+ then each block in the flush_list must also be present
+ in the flush_rbt. */
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ rnode = rbt_first(buf_pool->flush_rbt);
+ }
+
+ while (bpage != NULL) {
+ const ib_uint64_t om = bpage->oldest_modification;
+ ut_ad(bpage->in_flush_list);
+ //ut_a(buf_page_in_file(bpage)); /* optimistic */
+ ut_a(om > 0);
+
+ if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+ ut_a(rnode);
+ buf_page_t* rpage = *rbt_value(buf_page_t*,
+ rnode);
+ ut_a(rpage);
+ ut_a(rpage == bpage);
+ rnode = rbt_next(buf_pool->flush_rbt, rnode);
+ }
+
+ bpage = UT_LIST_GET_NEXT(flush_list, bpage);
+
+ ut_a(!bpage || om >= bpage->oldest_modification);
+ }
+
+ /* By this time we must have exhausted the traversal of
+ flush_rbt (if active) as well. */
+ ut_a(rnode == NULL);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(void)
+/*====================*/
+{
+ ibool ret;
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&flush_list_mutex);
+
+ ret = buf_flush_validate_low();
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&flush_list_mutex);
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c
new file mode 100644
index 00000000000..94828940fd4
--- /dev/null
+++ b/storage/xtradb/buf/buf0lru.c
@@ -0,0 +1,2580 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.c
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+#ifdef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "hash0hash.h"
+#include "os0sync.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+#define BUF_LRU_OLD_TOLERANCE 20
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN 5
+#if BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN
+# error "BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN"
+#endif
+
+/** When dropping the search hash index entries before deleting an ibd
+file, we build a local array of pages belonging to that tablespace
+in the buffer pool. Following is the size of that array. */
+#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static ibool buf_lru_switched_on_innodb_mon = FALSE;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well). From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Each interval is 1 second, defined by the rate at which
+srv_error_monitor_thread() calls buf_LRU_stat_update(). */
+#define BUF_LRU_STAT_N_INTERVAL 50
+
+/** Coefficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+#define BUF_LRU_IO_TO_UNZIP_FACTOR 50
+
+/** Sampled values of buf_LRU_stat_cur.
+Protected by buf_pool_mutex. Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint buf_LRU_stat_arr_ind;
+
+/** Current operation counters. Not protected by any mutex. Cleared
+by buf_LRU_stat_update(). */
+UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. */
+UNIV_INTERN buf_LRU_stat_t buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for
+"old" blocks. Protected by buf_pool_mutex. */
+UNIV_INTERN uint buf_LRU_old_ratio;
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+UNIV_INTERN uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/******************************************************************//**
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed and buf_pool_zip_mutex will be released.
+
+If a compressed page or a compressed-only block descriptor is freed,
+other compressed pages or compressed-only block descriptors may be
+relocated.
+@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
+was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
+static
+enum buf_page_state
+buf_LRU_block_remove_hashed_page(
+/*=============================*/
+ buf_page_t* bpage, /*!< in: block, must contain a file page and
+ be in a state where it can be freed; there
+ may or may not be a hash index to the page */
+ ibool zip); /*!< in: TRUE if should remove also the
+ compressed page of an uncompressed page */
+/******************************************************************//**
+Puts a file page that has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+ buf_block_t* block, /*!< in: block, must contain a file page and
+ be in a state where it can be freed */
+ ibool have_page_hash_mutex);
+
+/******************************************************************//**
+Determines if the unzip_LRU list should be used for evicting a victim
+instead of the general LRU list.
+@return TRUE if should use unzip_LRU */
+UNIV_INLINE
+ibool
+buf_LRU_evict_from_unzip_LRU(
+ ibool have_LRU_mutex)
+/*==============================*/
+{
+ ulint io_avg;
+ ulint unzip_avg;
+
+ //ut_ad(buf_pool_mutex_own());
+
+ if (!have_LRU_mutex)
+ mutex_enter(&LRU_list_mutex);
+ /* If the unzip_LRU list is empty, we can only use the LRU. */
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
+ if (!have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+ return(FALSE);
+ }
+
+ /* If unzip_LRU is at most 10% of the size of the LRU list,
+ then use the LRU. This slack allows us to keep hot
+ decompressed pages in the buffer pool. */
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
+ <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+ if (!have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+ return(FALSE);
+ }
+
+ /* If eviction hasn't started yet, we assume by default
+ that a workload is disk bound. */
+ if (buf_pool->freed_page_clock == 0) {
+ if (!have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+ return(TRUE);
+ }
+ if (!have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+
+ /* Calculate the average over past intervals, and add the values
+ of the current interval. */
+ io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.io;
+ unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
+ + buf_LRU_stat_cur.unzip;
+
+ /* Decide based on our formula. If the load is I/O bound
+ (unzip_avg is smaller than the weighted io_avg), evict an
+ uncompressed frame from unzip_LRU. Otherwise we assume that
+ the load is CPU bound and evict from the regular LRU. */
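+ /* For example, with io_avg == 10 we keep evicting from
+ unzip_LRU as long as unzip_avg is at most
+ 10 * BUF_LRU_IO_TO_UNZIP_FACTOR == 500. */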
+ return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
+
+/******************************************************************//**
+Attempts to drop page hash index on a batch of pages belonging to a
+particular space id. */
+static
+void
+buf_LRU_drop_page_hash_batch(
+/*=========================*/
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ const ulint* arr, /*!< in: array of page_no */
+ ulint count) /*!< in: number of entries in array */
+{
+ ulint i;
+
+ ut_ad(arr != NULL);
+ ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+
+ for (i = 0; i < count; ++i) {
+ btr_search_drop_page_hash_when_freed(space_id, zip_size,
+ arr[i]);
+ }
+}
+
+/******************************************************************//**
+When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
+hash index entries belonging to that table. This function tries to
+do that in batches. Note that this is a 'best effort' attempt and does
+not guarantee that ALL hash entries will be removed. */
+static
+void
+buf_LRU_drop_page_hash_for_tablespace(
+/*==================================*/
+ ulint id) /*!< in: space id */
+{
+ buf_page_t* bpage;
+ ulint* page_arr;
+ ulint num_entries;
+ ulint zip_size;
+
+ zip_size = fil_space_get_zip_size(id);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* Somehow, the tablespace does not exist. Nothing to drop. */
+ ut_ad(0);
+ return;
+ }
+
+ page_arr = ut_malloc(sizeof(ulint)
+ * BUF_LRU_DROP_SEARCH_HASH_SIZE);
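+ /* Collect up to BUF_LRU_DROP_SEARCH_HASH_SIZE page numbers
+ per pass and drop their hash index entries in one batch. */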
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+scan_again:
+ num_entries = 0;
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+ buf_page_t* prev_bpage;
+
+ prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ if (!block_mutex) {
+ goto next_page;
+ }
+
+ ut_a(buf_page_in_file(bpage));
+
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
+ || bpage->space != id
+ || bpage->buf_fix_count > 0
+ || bpage->io_fix != BUF_IO_NONE) {
+ /* We leave the fixed pages as they are in this scan;
+ they are dealt with later in the final scan. */
+ mutex_exit(block_mutex);
+ goto next_page;
+ }
+
+ if (((buf_block_t*) bpage)->is_hashed) {
+
+ /* Store the offset(i.e.: page_no) in the array
+ so that we can drop hash index in a batch
+ later. */
+ page_arr[num_entries] = bpage->offset;
+ mutex_exit(block_mutex);
+ ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ ++num_entries;
+
+ if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+ goto next_page;
+ }
+ /* Array full. We release the LRU list mutex to
+ obey the latching order. */
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+
+ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
+ num_entries);
+ num_entries = 0;
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ } else {
+ mutex_exit(block_mutex);
+ }
+
+next_page:
+ /* Note that we may have released the LRU list mutex
+ above after reading the prev_bpage during processing
+ of a page_hash_batch (i.e.: when the array was full).
+ This means that prev_bpage can change in LRU list.
+ This is OK because this function is a 'best effort'
+ to drop as many search hash entries as possible and
+ it does not guarantee that ALL such entries will be
+ dropped. */
+ bpage = prev_bpage;
+
+ /* If, however, bpage has been removed from LRU list
+ to the free list then we should restart the scan.
+ bpage->state is protected by buf_pool mutex. */
+ if (bpage && !buf_page_in_file(bpage)) {
+ ut_a(num_entries == 0);
+ goto scan_again;
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+
+ /* Drop any remaining batch of search hashed pages. */
+ buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries);
+ ut_free(page_arr);
+}
+
+/******************************************************************//**
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. */
+UNIV_INTERN
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+ ulint id) /*!< in: space id */
+{
+ buf_page_t* bpage;
+ ibool all_freed;
+
+ /* Before we attempt to drop pages one by one we first
+ attempt to drop page hash index entries in batches to make
+ it more efficient. The batching is a best-effort attempt
+ and does not guarantee that all page hash entries
+ will be dropped. We get rid of remaining page hash entries
+ one by one below. */
+ buf_LRU_drop_page_hash_for_tablespace(id);
+
+scan_again:
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+
+ all_freed = TRUE;
+
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+ buf_page_t* prev_bpage;
+ ibool prev_bpage_buf_fix = FALSE;
+
+ ut_a(buf_page_in_file(bpage));
+
+ prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ /* bpage->space and bpage->io_fix are protected by
+ buf_pool_mutex and block_mutex. It is safe to check
+ them while holding buf_pool_mutex only. */
+
+ if (buf_page_get_space(bpage) != id) {
+ /* Skip this block, as it does not belong to
+ the space that is being invalidated. */
+ } else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+ /* We cannot remove this page during this scan
+ yet; maybe the system is currently reading it
+ in, or flushing the modifications to the file */
+
+ all_freed = FALSE;
+ } else {
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+
+ if (!block_mutex) {
+ /* This case should not be possible; something is
+ wrong, so force another scan (scan_again). */
+
+ all_freed = FALSE;
+
+ goto next_page_no_mutex;
+ }
+
+ if (bpage->buf_fix_count > 0) {
+
+ /* We cannot remove this page during
+ this scan yet; maybe the system is
+ currently reading it in, or flushing
+ the modifications to the file */
+
+ all_freed = FALSE;
+
+ goto next_page;
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Dropping space %lu page %lu\n",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ /* This is a compressed-only block
+ descriptor. Ensure that prev_bpage
+ cannot be relocated when bpage is freed. */
+ if (UNIV_LIKELY(prev_bpage != NULL)) {
+ switch (buf_page_get_state(
+ prev_bpage)) {
+ case BUF_BLOCK_FILE_PAGE:
+ /* Descriptors of uncompressed
+ blocks will not be relocated,
+ because we are holding the
+ buf_pool_mutex. */
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ /* Descriptors of compressed-
+ only blocks can be relocated,
+ unless they are buffer-fixed.
+ Because both bpage and
+ prev_bpage are protected by
+ buf_pool_zip_mutex, it is
+ not necessary to acquire
+ further mutexes. */
+ ut_ad(&buf_pool_zip_mutex
+ == block_mutex);
+ ut_ad(mutex_own(block_mutex));
+ prev_bpage_buf_fix = TRUE;
+ prev_bpage->buf_fix_count++;
+ break;
+ default:
+ ut_error;
+ }
+ }
+ } else if (((buf_block_t*) bpage)->is_hashed) {
+ ulint page_no;
+ ulint zip_size;
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+
+ zip_size = buf_page_get_zip_size(bpage);
+ page_no = buf_page_get_page_no(bpage);
+
+ mutex_exit(block_mutex);
+
+ /* Note that the following call will acquire
+ an S-latch on the page */
+
+ btr_search_drop_page_hash_when_freed(
+ id, zip_size, page_no);
+ goto scan_again;
+ }
+
+ if (bpage->oldest_modification != 0) {
+
+ buf_flush_remove(bpage);
+ }
+
+ /* Remove from the LRU list. */
+
+ if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
+ != BUF_BLOCK_ZIP_FREE) {
+ buf_LRU_block_free_hashed_page((buf_block_t*)
+ bpage, TRUE);
+ } else {
+ /* The block_mutex should have been
+ released by buf_LRU_block_remove_hashed_page()
+ when it returns BUF_BLOCK_ZIP_FREE. */
+ ut_ad(block_mutex == &buf_pool_zip_mutex);
+ ut_ad(!mutex_own(block_mutex));
+
+ if (prev_bpage_buf_fix) {
+ /* We temporarily buffer-fixed
+ prev_bpage, so that
+ buf_buddy_free() could not
+ relocate it, in case it was a
+ compressed-only block
+ descriptor. */
+
+ mutex_enter(block_mutex);
+ ut_ad(prev_bpage->buf_fix_count > 0);
+ prev_bpage->buf_fix_count--;
+ mutex_exit(block_mutex);
+ }
+
+ goto next_page_no_mutex;
+ }
+next_page:
+ mutex_exit(block_mutex);
+ }
+
+next_page_no_mutex:
+ bpage = prev_bpage;
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+
+ if (!all_freed) {
+ os_thread_sleep(20000);
+
+ goto scan_again;
+ }
+}
+
+/********************************************************************//**
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: pointer to the block in question */
+{
+ buf_page_t* b;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+ ut_ad(mutex_own(&flush_list_mutex));
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+
+ /* Find the first successor of bpage in the LRU list
+ that is in the zip_clean list. */
+ b = bpage;
+ do {
+ b = UT_LIST_GET_NEXT(LRU, b);
+ } while (b && (buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE || !b->in_LRU_list));
+
+ /* Insert bpage before b, i.e., after the predecessor of b. */
+ if (b) {
+ b = UT_LIST_GET_PREV(zip_list, b);
+ }
+
+ if (b) {
+ UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean, b, bpage);
+ } else {
+ UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage);
+ }
+}
+
+/******************************************************************//**
+Try to free an uncompressed page of a compressed block from the unzip
+LRU list. The compressed page is preserved, and it need not be clean.
+@return TRUE if freed */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_unzip_LRU_list(
+/*=============================*/
+ ulint n_iterations, /*!< in: how many times this has been called
+ repeatedly without result: a high value means
+ that we should search farther; we will search
+ n_iterations / 5 of the unzip_LRU list,
+ or nothing if n_iterations >= 5 */
+ ibool have_LRU_mutex)
+{
+ buf_block_t* block;
+ ulint distance;
+
+ //ut_ad(buf_pool_mutex_own()); /* optimistic */
+
+ /* Theoretically it should be much easier to find a victim
+ from unzip_LRU, as we can choose even a dirty block (we will
+ be evicting only the uncompressed frame). In the unlikely
+ event that we are unable to find a victim from unzip_LRU,
+ we fall back to the regular LRU list. We do this once we
+ have already done five iterations. */
+
+ if (UNIV_UNLIKELY(n_iterations >= 5)
+ || !buf_LRU_evict_from_unzip_LRU(have_LRU_mutex)) {
+
+ return(FALSE);
+ }
+
+ distance = 100 + (n_iterations
+ * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5;
+
+restart:
+ for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+ UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
+ block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
+
+ enum buf_lru_free_block_status freed;
+
+ mutex_enter(&block->mutex);
+ if (!block->in_unzip_LRU_list || !block->page.in_LRU_list
+ || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+ mutex_exit(&block->mutex);
+ goto restart;
+ }
+
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+
+ freed = buf_LRU_free_block(&block->page, FALSE, NULL, have_LRU_mutex);
+ mutex_exit(&block->mutex);
+
+ switch (freed) {
+ case BUF_LRU_FREED:
+ return(TRUE);
+
+ case BUF_LRU_CANNOT_RELOCATE:
+ /* If we failed to relocate, try
+ regular LRU eviction. */
+ return(FALSE);
+
+ case BUF_LRU_NOT_FREED:
+ /* The block was buffer-fixed or I/O-fixed.
+ Keep looking. */
+ continue;
+ }
+
+ /* inappropriate return value from
+ buf_LRU_free_block() */
+ ut_error;
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************//**
+Try to free a clean page from the common LRU list.
+@return TRUE if freed */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_common_LRU_list(
+/*==============================*/
+ ulint n_iterations, /*!< in: how many times this has been called
+ repeatedly without result: a high value means
+ that we should search farther; if
+ n_iterations < 10, then we search
+ n_iterations / 10 * buf_pool->curr_size
+ pages from the end of the LRU list */
+ ibool have_LRU_mutex)
+{
+ buf_page_t* bpage;
+ ulint distance;
+
+ //ut_ad(buf_pool_mutex_own()); /* optimistic */
+
+ distance = 100 + (n_iterations * buf_pool->curr_size) / 10;
+
+restart:
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+ UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0);
+ bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) {
+
+ enum buf_lru_free_block_status freed;
+ unsigned accessed;
+ mutex_t* block_mutex
+ = buf_page_get_mutex_enter(bpage);
+
+ if (!block_mutex) {
+ goto restart;
+ }
+
+ if (!bpage->in_LRU_list
+ || !buf_page_in_file(bpage)) {
+ mutex_exit(block_mutex);
+ goto restart;
+ }
+
+ ut_ad(buf_page_in_file(bpage));
+ ut_ad(bpage->in_LRU_list);
+
+ accessed = buf_page_is_accessed(bpage);
+ freed = buf_LRU_free_block(bpage, TRUE, NULL, have_LRU_mutex);
+ mutex_exit(block_mutex);
+
+ switch (freed) {
+ case BUF_LRU_FREED:
+ /* Keep track of pages that are evicted without
+ ever being accessed. This gives us a measure of
+ the effectiveness of readahead */
+ if (!accessed) {
+ ++buf_pool->stat.n_ra_pages_evicted;
+ }
+ return(TRUE);
+
+ case BUF_LRU_NOT_FREED:
+ /* The block was dirty, buffer-fixed, or I/O-fixed.
+ Keep looking. */
+ continue;
+
+ case BUF_LRU_CANNOT_RELOCATE:
+ /* This should never occur, because we
+ want to discard the compressed page too. */
+ break;
+ }
+
+ /* inappropriate return value from
+ buf_LRU_free_block() */
+ ut_error;
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************//**
+Try to free a replaceable block.
+@return TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_search_and_free_block(
+/*==========================*/
+ ulint n_iterations) /*!< in: how many times this has been called
+ repeatedly without result: a high value means
+ that we should search farther; if
+ n_iterations < 10, then we search
+ n_iterations / 10 * buf_pool->curr_size
+ pages from the end of the LRU list; if
+ n_iterations < 5, then we will also search
+ n_iterations / 5 of the unzip_LRU list. */
+{
+ ibool freed = FALSE;
+ ibool have_LRU_mutex = FALSE;
+
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU))
+ have_LRU_mutex = TRUE;
+
+ /* optimistic search... */
+ //buf_pool_mutex_enter();
+ if (have_LRU_mutex)
+ mutex_enter(&LRU_list_mutex);
+
+ freed = buf_LRU_free_from_unzip_LRU_list(n_iterations, have_LRU_mutex);
+
+ if (!freed) {
+ freed = buf_LRU_free_from_common_LRU_list(n_iterations, have_LRU_mutex);
+ }
+
+ mutex_enter(&buf_pool_mutex);
+ if (!freed) {
+ buf_pool->LRU_flush_ended = 0;
+ } else if (buf_pool->LRU_flush_ended > 0) {
+ buf_pool->LRU_flush_ended--;
+ }
+ mutex_exit(&buf_pool_mutex);
+
+ //buf_pool_mutex_exit();
+ if (have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+
+ return(freed);
+}
+
+/******************************************************************//**
+Tries to remove LRU flushed blocks from the end of the LRU list and put them
+to the free list. This is beneficial for the efficiency of the insert buffer
+operation, as flushed pages from non-unique non-clustered indexes are here
+taken out of the buffer pool, and their inserts redirected to the insert
+buffer. Otherwise, the flushed blocks could get modified again before read
+operations need new buffer blocks, and the i/o work done in flushing would be
+wasted. */
+UNIV_INTERN
+void
+buf_LRU_try_free_flushed_blocks(void)
+/*=================================*/
+{
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
+ while (buf_pool->LRU_flush_ended > 0) {
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ buf_LRU_search_and_free_block(1);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+}
+
+/******************************************************************//**
+Returns TRUE if less than 25 % of the buffer pool is available. This can be
+used in heuristics to prevent huge transactions from eating up the whole buffer
+pool for their locks.
+@return TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void)
+/*==============================*/
+{
+ ibool ret = FALSE;
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+ mutex_enter(&free_list_mutex);
+
+ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 4) {
+
+ ret = TRUE;
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(&free_list_mutex);
+
+ return(ret);
+}
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, returns NULL.
+@return a free control block, or NULL if the buf_block->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(void)
+/*=======================*/
+{
+ buf_block_t* block;
+
+ //ut_ad(buf_pool_mutex_own());
+
+ mutex_enter(&free_list_mutex);
+ block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free);
+
+ if (block) {
+ ut_ad(block->page.in_free_list);
+ ut_d(block->page.in_free_list = FALSE);
+ ut_ad(!block->page.in_flush_list);
+ ut_ad(!block->page.in_LRU_list);
+ ut_a(!buf_page_in_file(&block->page));
+ UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
+
+ mutex_exit(&free_list_mutex);
+
+ mutex_enter(&block->mutex);
+
+ buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
+ UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
+
+ mutex_exit(&block->mutex);
+ } else {
+ mutex_exit(&free_list_mutex);
+ }
+
+ return(block);
+}
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, blocks are moved from the end of the
+LRU list to the free list.
+@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+ ulint zip_size) /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+{
+ buf_block_t* block = NULL;
+ ibool freed;
+ ulint n_iterations = 1;
+ ibool mon_value_was = FALSE;
+ ibool started_monitor = FALSE;
+loop:
+ //buf_pool_mutex_enter();
+
+ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: ERROR: over 95 percent of the buffer pool"
+ " is occupied by\n"
+ "InnoDB: lock heaps or the adaptive hash index!"
+ " Check that your\n"
+ "InnoDB: transactions do not set too many row locks.\n"
+ "InnoDB: Your buffer pool size is %lu MB."
+ " Maybe you should make\n"
+ "InnoDB: the buffer pool bigger?\n"
+ "InnoDB: We intentionally generate a seg fault"
+ " to print a stack trace\n"
+ "InnoDB: on Linux!\n",
+ (ulong) (buf_pool->curr_size
+ / (1024 * 1024 / UNIV_PAGE_SIZE)));
+
+ ut_error;
+
+ } else if (!recv_recovery_on
+ && (UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU))
+ < buf_pool->curr_size / 3) {
+
+ if (!buf_lru_switched_on_innodb_mon) {
+
+ /* Over 67 % of the buffer pool is occupied by lock
+ heaps or the adaptive hash index. This may be a memory
+ leak! */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: WARNING: over 67 percent of"
+ " the buffer pool is occupied by\n"
+ "InnoDB: lock heaps or the adaptive"
+ " hash index! Check that your\n"
+ "InnoDB: transactions do not set too many"
+ " row locks.\n"
+ "InnoDB: Your buffer pool size is %lu MB."
+ " Maybe you should make\n"
+ "InnoDB: the buffer pool bigger?\n"
+ "InnoDB: Starting the InnoDB Monitor to print"
+ " diagnostics, including\n"
+ "InnoDB: lock heap and hash index sizes.\n",
+ (ulong) (buf_pool->curr_size
+ / (1024 * 1024 / UNIV_PAGE_SIZE)));
+
+ buf_lru_switched_on_innodb_mon = TRUE;
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ }
+ } else if (buf_lru_switched_on_innodb_mon) {
+
+ /* Switch off the InnoDB Monitor; this is a simple way
+ to stop the monitor if the situation becomes less urgent,
+ but may also surprise the user if the monitor was switched
+ on manually as well! */
+
+ buf_lru_switched_on_innodb_mon = FALSE;
+ srv_print_innodb_monitor = FALSE;
+ }
+
+ /* If there is a block in the free list, take it */
+ block = buf_LRU_get_free_only();
+ if (block) {
+
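+ /* Reset the compressed-page bookkeeping of the block
+ before it is reused. */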
+#ifdef UNIV_DEBUG
+ block->page.zip.m_start =
+#endif /* UNIV_DEBUG */
+ block->page.zip.m_end =
+ block->page.zip.m_nonempty =
+ block->page.zip.n_blobs = 0;
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ ibool lru;
+ page_zip_set_size(&block->page.zip, zip_size);
+ mutex_enter(&LRU_list_mutex);
+ block->page.zip.data = buf_buddy_alloc(zip_size, &lru, FALSE);
+ mutex_exit(&LRU_list_mutex);
+ UNIV_MEM_DESC(block->page.zip.data, zip_size, block);
+ } else {
+ page_zip_set_size(&block->page.zip, 0);
+ block->page.zip.data = NULL;
+ }
+
+ //buf_pool_mutex_exit();
+
+ if (started_monitor) {
+ srv_print_innodb_monitor = mon_value_was;
+ }
+
+ return(block);
+ }
+
+ /* If no block was in the free list, search from the end of the LRU
+ list and try to free a block there */
+
+ //buf_pool_mutex_exit();
+
+ freed = buf_LRU_search_and_free_block(n_iterations);
+
+ if (freed > 0) {
+ goto loop;
+ }
+
+ if (n_iterations > 30) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: difficult to find free blocks in\n"
+ "InnoDB: the buffer pool (%lu search iterations)!"
+ " Consider\n"
+ "InnoDB: increasing the buffer pool size.\n"
+ "InnoDB: It is also possible that"
+ " in your Unix version\n"
+ "InnoDB: fsync is very slow, or"
+ " completely frozen inside\n"
+ "InnoDB: the OS kernel. Then upgrading to"
+ " a newer version\n"
+ "InnoDB: of your operating system may help."
+ " Look at the\n"
+ "InnoDB: number of fsyncs in diagnostic info below.\n"
+ "InnoDB: Pending flushes (fsync) log: %lu;"
+ " buffer pool: %lu\n"
+ "InnoDB: %lu OS file reads, %lu OS file writes,"
+ " %lu OS fsyncs\n"
+ "InnoDB: Starting InnoDB Monitor to print further\n"
+ "InnoDB: diagnostics to the standard output.\n",
+ (ulong) n_iterations,
+ (ulong) fil_n_pending_log_flushes,
+ (ulong) fil_n_pending_tablespace_flushes,
+ (ulong) os_n_file_reads, (ulong) os_n_file_writes,
+ (ulong) os_n_fsyncs);
+
+ mon_value_was = srv_print_innodb_monitor;
+ started_monitor = TRUE;
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ }
+
+ /* No free block was found: try to flush the LRU list */
+
+ buf_flush_free_margin(TRUE);
+ ++srv_buf_pool_wait_free;
+
+ os_aio_simulated_wake_handler_threads();
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
+ if (buf_pool->LRU_flush_ended > 0) {
+ /* We have written pages in an LRU flush. To make the insert
+ buffer more efficient, we try to move these pages to the free
+ list. */
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ buf_LRU_try_free_flushed_blocks();
+ } else {
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+ }
+
+ if (n_iterations > 10) {
+
+ os_thread_sleep(500000);
+ }
+
+ n_iterations++;
+
+ goto loop;
+}
+
+/*******************************************************************//**
+Moves the LRU_old pointer so that the length of the old blocks list
+is inside the allowed limits. */
+UNIV_INLINE
+void
+buf_LRU_old_adjust_len(void)
+/*========================*/
+{
+ ulint old_len;
+ ulint new_len;
+
+ ut_a(buf_pool->LRU_old);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+ ut_ad(buf_LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+ ut_ad(buf_LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+#if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)
+# error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)"
+#endif
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ old_len = buf_pool->LRU_old_len;
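+	/* Compute the target length of the old sublist: a fraction
+	buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV of the whole LRU list,
+	capped so that at least BUF_LRU_NON_OLD_MIN_LEN blocks plus the
+	tolerance remain in the new sublist. */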
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
+ * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool->LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+
+ for (;;) {
+ buf_page_t* LRU_old = buf_pool->LRU_old;
+
+ ut_a(LRU_old);
+ ut_ad(LRU_old->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+
+ /* Update the LRU_old pointer if necessary */
+
+ if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
+
+ buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV(
+ LRU, LRU_old);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+ old_len = ++buf_pool->LRU_old_len;
+ buf_page_set_old(LRU_old, TRUE);
+
+ } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
+
+ buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+ old_len = --buf_pool->LRU_old_len;
+ buf_page_set_old(LRU_old, FALSE);
+ } else {
+ return;
+ }
+ }
+}
+
+/*******************************************************************//**
+Initializes the old blocks pointer in the LRU list. This function should be
+called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
+static
+void
+buf_LRU_old_init(void)
+/*==================*/
+{
+ buf_page_t* bpage;
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
+
+ /* We first initialize all blocks in the LRU list as old and then use
+ the adjust function to move the LRU_old pointer to the right
+ position */
+
+ for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); bpage != NULL;
+ bpage = UT_LIST_GET_PREV(LRU, bpage)) {
+ ut_ad(bpage->in_LRU_list);
+ ut_ad(buf_page_in_file(bpage));
+ /* This loop temporarily violates the
+ assertions of buf_page_set_old(). */
+ bpage->old = TRUE;
+ }
+
+ buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU);
+ buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Remove a block from the unzip_LRU list if it belonged to the list. */
+static
+void
+buf_unzip_LRU_remove_block_if_needed(
+/*=================================*/
+ buf_page_t* bpage) /*!< in/out: control block */
+{
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ ut_ad(buf_page_in_file(bpage));
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+
+ if (buf_page_belongs_to_unzip_LRU(bpage)) {
+ buf_block_t* block = (buf_block_t*) bpage;
+
+ ut_ad(block->in_unzip_LRU_list);
+ block->in_unzip_LRU_list = FALSE;
+
+ UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Removes a block from the LRU list. */
+UNIV_INLINE
+void
+buf_LRU_remove_block(
+/*=================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+
+ ut_a(buf_page_in_file(bpage));
+
+ ut_ad(bpage->in_LRU_list);
+
+ /* If the LRU_old pointer is defined and points to just this block,
+ move it backward one step */
+
+ if (UNIV_UNLIKELY(bpage == buf_pool->LRU_old)) {
+
+ /* Below: the previous block is guaranteed to exist,
+ because the LRU_old pointer is only allowed to differ
+ by BUF_LRU_OLD_TOLERANCE from strict
+ buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+ list length. */
+ buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+ ut_a(prev_bpage);
+#ifdef UNIV_LRU_DEBUG
+ ut_a(!prev_bpage->old);
+#endif /* UNIV_LRU_DEBUG */
+ buf_pool->LRU_old = prev_bpage;
+ buf_page_set_old(prev_bpage, TRUE);
+
+ buf_pool->LRU_old_len++;
+ }
+
+ /* Remove the block from the LRU list */
+ UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+ bpage->in_LRU_list = FALSE;
+
+ buf_unzip_LRU_remove_block_if_needed(bpage);
+
+ /* If the LRU list is so short that LRU_old is not defined,
+ clear the "old" flags and return */
+ if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU); bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+ /* This loop temporarily violates the
+ assertions of buf_page_set_old(). */
+ bpage->old = FALSE;
+ }
+
+ buf_pool->LRU_old = NULL;
+ buf_pool->LRU_old_len = 0;
+
+ return;
+ }
+
+ ut_ad(buf_pool->LRU_old);
+
+ /* Update the LRU_old_len field if necessary */
+ if (buf_page_is_old(bpage)) {
+
+ buf_pool->LRU_old_len--;
+ }
+
+ /* Adjust the length of the old block list if necessary */
+ buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+{
+ ut_ad(buf_pool);
+ ut_ad(block);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+
+ ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
+
+ ut_ad(!block->in_unzip_LRU_list);
+ block->in_unzip_LRU_list = TRUE;
+
+ if (old) {
+ UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block);
+ } else {
+ UT_LIST_ADD_FIRST(unzip_LRU, buf_pool->unzip_LRU, block);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list end. */
+UNIV_INLINE
+void
+buf_LRU_add_block_to_end_low(
+/*=========================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+
+ ut_a(buf_page_in_file(bpage));
+
+ ut_ad(!bpage->in_LRU_list);
+ UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage);
+ bpage->in_LRU_list = TRUE;
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool->LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ buf_page_set_old(bpage, TRUE);
+ buf_pool->LRU_old_len++;
+ buf_LRU_old_adjust_len();
+
+ } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init();
+ } else {
+ buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (buf_page_belongs_to_unzip_LRU(bpage)) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, TRUE);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. */
+UNIV_INLINE
+void
+buf_LRU_add_block_low(
+/*==================*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the old blocks
+ in the LRU list, else put to the start; if the
+ LRU list is very short, the block is added to
+ the start, regardless of this parameter */
+{
+ ut_ad(buf_pool);
+ ut_ad(bpage);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(!bpage->in_LRU_list);
+
+ if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
+
+ UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage);
+
+ bpage->freed_page_clock = buf_pool->freed_page_clock;
+ } else {
+#ifdef UNIV_LRU_DEBUG
+ /* buf_pool->LRU_old must be the first item in the LRU list
+ whose "old" flag is set. */
+ ut_a(buf_pool->LRU_old->old);
+ ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+ || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+ ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+ || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old,
+ bpage);
+ buf_pool->LRU_old_len++;
+ }
+
+ bpage->in_LRU_list = TRUE;
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+ ut_ad(buf_pool->LRU_old);
+
+ /* Adjust the length of the old block list if necessary */
+
+ buf_page_set_old(bpage, old);
+ buf_LRU_old_adjust_len();
+
+ } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+ /* The LRU list is now long enough for LRU_old to become
+ defined: init it */
+
+ buf_LRU_old_init();
+ } else {
+ buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+ }
+
+ /* If this is a zipped block with decompressed frame as well
+ then put it on the unzip_LRU list */
+ if (buf_page_belongs_to_unzip_LRU(bpage)) {
+ buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
+ }
+}
+
+/******************************************************************//**
+Adds a block to the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old) /*!< in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the start;
+ if the LRU list is very short, the block is
+ added to the start, regardless of this
+ parameter */
+{
+ buf_LRU_add_block_low(bpage, old);
+}
+
+/******************************************************************//**
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+
+ if (bpage->old) {
+ buf_pool->stat.n_pages_made_young++;
+ }
+
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block_low(bpage, FALSE);
+}
+
+/******************************************************************//**
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_page_t* bpage) /*!< in: control block */
+{
+ buf_LRU_remove_block(bpage);
+ buf_LRU_add_block_to_end_low(bpage);
+}
+
+/******************************************************************//**
+Try to free a block. If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+release buf_pool_mutex. Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
+release these two mutexes after the call. No other
+buf_page_get_mutex() may be held when calling this function.
+@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
+BUF_LRU_NOT_FREED otherwise. */
+UNIV_INTERN
+enum buf_lru_free_block_status
+buf_LRU_free_block(
+/*===============*/
+ buf_page_t* bpage, /*!< in: block to be freed */
+ ibool zip, /*!< in: TRUE if should remove also the
+ compressed page of an uncompressed page */
+ ibool* buf_pool_mutex_released,
+ /*!< in: pointer to a variable that will
+ be assigned TRUE if buf_pool_mutex
+ was temporarily released, or NULL */
+ ibool have_LRU_mutex)
+{
+ buf_page_t* b = NULL;
+ mutex_t* block_mutex = buf_page_get_mutex(bpage);
+
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(block_mutex));
+ ut_ad(buf_page_in_file(bpage));
+ //ut_ad(bpage->in_LRU_list);
+ ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in buf_page_t. On
+ other systems, Valgrind could complain about uninitialized pad
+ bytes. */
+ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
+ if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {
+
+ /* Do not free buffer-fixed or I/O-fixed blocks. */
+ return(BUF_LRU_NOT_FREED);
+ }
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+ if (zip || !bpage->zip.data) {
+ /* This would completely free the block. */
+ /* Do not completely free dirty blocks. */
+
+ if (bpage->oldest_modification) {
+ return(BUF_LRU_NOT_FREED);
+ }
+ } else if (bpage->oldest_modification) {
+ /* Do not completely free dirty blocks. */
+
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ ut_ad(buf_page_get_state(bpage)
+ == BUF_BLOCK_ZIP_DIRTY);
+ return(BUF_LRU_NOT_FREED);
+ }
+
+ goto alloc;
+ } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+ /* Allocate the control block for the compressed page.
+ If it cannot be allocated (without freeing a block
+ from the LRU list), refuse to free bpage. */
+alloc:
+ //buf_pool_mutex_exit_forbid();
+ b = buf_buddy_alloc(sizeof *b, NULL, FALSE);
+ //buf_pool_mutex_exit_allow();
+
+ if (UNIV_UNLIKELY(!b)) {
+ return(BUF_LRU_CANNOT_RELOCATE);
+ }
+
+ //memcpy(b, bpage, sizeof *b);
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "Putting space %lu page %lu to free list\n",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+ }
+#endif /* UNIV_DEBUG */
+
+	/* To keep the latching order, block_mutex must be released before
+	acquiring the LRU list mutex and the page hash latch; it is
+	re-acquired below and the block state is then re-checked. */
+ mutex_exit(block_mutex);
+
+ if (!have_LRU_mutex)
+ mutex_enter(&LRU_list_mutex); /* optimistic */
+ rw_lock_x_lock(&page_hash_latch);
+ mutex_enter(block_mutex);
+
+ /* recheck states of block */
+ if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage)
+ || !buf_page_can_relocate(bpage)) {
+not_freed:
+ if (b) {
+ buf_buddy_free(b, sizeof *b, TRUE);
+ }
+ if (!have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ return(BUF_LRU_NOT_FREED);
+ } else if (zip || !bpage->zip.data) {
+ if (bpage->oldest_modification)
+ goto not_freed;
+ } else if (bpage->oldest_modification) {
+ if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+ ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
+ goto not_freed;
+ }
+ }
+
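+	/* The block can still be relocated: copy its control block fields
+	into the spare compressed-only descriptor, now that the state has
+	been re-validated under the LRU list mutex and page hash latch. */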
+ if (b) {
+ memcpy(b, bpage, sizeof *b);
+ }
+
+ if (buf_LRU_block_remove_hashed_page(bpage, zip)
+ != BUF_BLOCK_ZIP_FREE) {
+ ut_a(bpage->buf_fix_count == 0);
+
+ if (b) {
+ buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
+ const ulint fold = buf_page_address_fold(
+ bpage->space, bpage->offset);
+
+ ut_a(!buf_page_hash_get(bpage->space, bpage->offset));
+
+ while (prev_b && !prev_b->in_LRU_list) {
+ prev_b = UT_LIST_GET_PREV(LRU, prev_b);
+ }
+
+ b->state = b->oldest_modification
+ ? BUF_BLOCK_ZIP_DIRTY
+ : BUF_BLOCK_ZIP_PAGE;
+ UNIV_MEM_DESC(b->zip.data,
+ page_zip_get_size(&b->zip), b);
+
+ /* The fields in_page_hash and in_LRU_list of
+ the to-be-freed block descriptor should have
+ been cleared in
+ buf_LRU_block_remove_hashed_page(), which
+ invokes buf_LRU_remove_block(). */
+ ut_ad(!bpage->in_page_hash);
+ ut_ad(!bpage->in_LRU_list);
+ /* bpage->state was BUF_BLOCK_FILE_PAGE because
+ b != NULL. The type cast below is thus valid. */
+ ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+ /* The fields of bpage were copied to b before
+ buf_LRU_block_remove_hashed_page() was invoked. */
+ ut_ad(!b->in_zip_hash);
+ ut_ad(b->in_page_hash);
+ ut_ad(b->in_LRU_list);
+
+ HASH_INSERT(buf_page_t, hash,
+ buf_pool->page_hash, fold, b);
+
+ /* Insert b where bpage was in the LRU list. */
+ if (UNIV_LIKELY(prev_b != NULL)) {
+ ulint lru_len;
+
+ ut_ad(prev_b->in_LRU_list);
+ ut_ad(buf_page_in_file(prev_b));
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no
+ padding in buf_page_t. On other
+ systems, Valgrind could complain about
+ uninitialized pad bytes. */
+ UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
+#endif
+ UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+ prev_b, b);
+
+ if (buf_page_is_old(b)) {
+ buf_pool->LRU_old_len++;
+ if (UNIV_UNLIKELY
+ (buf_pool->LRU_old
+ == UT_LIST_GET_NEXT(LRU, b))) {
+
+ buf_pool->LRU_old = b;
+ }
+ }
+
+ lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+ ut_ad(buf_pool->LRU_old);
+ /* Adjust the length of the
+ old block list if necessary */
+ buf_LRU_old_adjust_len();
+ } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is now long
+ enough for LRU_old to become
+ defined: init it */
+ buf_LRU_old_init();
+ }
+#ifdef UNIV_LRU_DEBUG
+ /* Check that the "old" flag is consistent
+ in the block and its neighbours. */
+ buf_page_set_old(b, buf_page_is_old(b));
+#endif /* UNIV_LRU_DEBUG */
+ } else {
+ b->in_LRU_list = FALSE;
+ buf_LRU_add_block_low(b, buf_page_is_old(b));
+ }
+
+ mutex_enter(&flush_list_mutex);
+ if (b->state == BUF_BLOCK_ZIP_PAGE) {
+ buf_LRU_insert_zip_clean(b);
+ } else {
+ /* Relocate on buf_pool->flush_list. */
+ buf_flush_relocate_on_flush_list(bpage, b);
+ }
+ mutex_exit(&flush_list_mutex);
+
+ bpage->zip.data = NULL;
+ page_zip_set_size(&bpage->zip, 0);
+
+ /* Prevent buf_page_get_gen() from
+ decompressing the block while we release
+ buf_pool_mutex and block_mutex. */
+ b->buf_fix_count++;
+ b->io_fix = BUF_IO_READ;
+ }
+
+ if (buf_pool_mutex_released) {
+ *buf_pool_mutex_released = TRUE;
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ mutex_exit(block_mutex);
+
+ /* Remove possible adaptive hash index on the page.
+ The page was declared uninitialized by
+ buf_LRU_block_remove_hashed_page(). We need to flag
+ the contents of the page valid (which it still is) in
+ order to avoid bogus Valgrind warnings.*/
+
+ UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ btr_search_drop_page_hash_index((buf_block_t*) bpage);
+ UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+
+ if (b) {
+ /* Compute and stamp the compressed page
+ checksum while not holding any mutex. The
+ block is already half-freed
+ (BUF_BLOCK_REMOVE_HASH) and removed from
+ buf_pool->page_hash, thus inaccessible by any
+ other thread. */
+
+ mach_write_to_4(
+ b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
+ UNIV_LIKELY(srv_use_checksums)
+ ? page_zip_calc_checksum(
+ b->zip.data,
+ page_zip_get_size(&b->zip))
+ : BUF_NO_CHECKSUM_MAGIC);
+ }
+
+ //buf_pool_mutex_enter();
+ if (have_LRU_mutex)
+ mutex_enter(&LRU_list_mutex);
+ mutex_enter(block_mutex);
+
+ if (b) {
+ mutex_enter(&buf_pool_zip_mutex);
+ b->buf_fix_count--;
+ buf_page_set_io_fix(b, BUF_IO_NONE);
+ mutex_exit(&buf_pool_zip_mutex);
+ }
+
+ buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE);
+ } else {
+ /* The block_mutex should have been released by
+ buf_LRU_block_remove_hashed_page() when it returns
+ BUF_BLOCK_ZIP_FREE. */
+ ut_ad(block_mutex == &buf_pool_zip_mutex);
+ mutex_enter(block_mutex);
+
+ if (!have_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+
+ return(BUF_LRU_FREED);
+}
+
+/******************************************************************//**
+Puts a block back to the free list. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block, /*!< in: block, must not contain a file page */
+ ibool have_page_hash_mutex)
+{
+ void* data;
+
+ ut_ad(block);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_READY_FOR_USE:
+ break;
+ default:
+ ut_error;
+ }
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->n_pointers == 0);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ut_ad(!block->page.in_free_list);
+ ut_ad(!block->page.in_flush_list);
+ ut_ad(!block->page.in_LRU_list);
+
+ buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+
+ UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
+#ifdef UNIV_DEBUG
+ /* Wipe contents of page to reveal possible stale pointers to it */
+ memset(block->frame, '\0', UNIV_PAGE_SIZE);
+#else
+ /* Wipe page_no and space_id */
+ memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+ memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4);
+#endif
+ data = block->page.zip.data;
+
+ if (data) {
+ block->page.zip.data = NULL;
+ mutex_exit(&block->mutex);
+ //buf_pool_mutex_exit_forbid();
+ buf_buddy_free(data, page_zip_get_size(&block->page.zip), have_page_hash_mutex);
+ //buf_pool_mutex_exit_allow();
+ mutex_enter(&block->mutex);
+ page_zip_set_size(&block->page.zip, 0);
+ }
+
+ mutex_enter(&free_list_mutex);
+ UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page));
+ ut_d(block->page.in_free_list = TRUE);
+ mutex_exit(&free_list_mutex);
+
+ UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE);
+}
+
+/******************************************************************//**
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed and buf_pool_zip_mutex will be released.
+
+If a compressed page or a compressed-only block descriptor is freed,
+other compressed pages or compressed-only block descriptors may be
+relocated.
+@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
+was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
+static
+enum buf_page_state
+buf_LRU_block_remove_hashed_page(
+/*=============================*/
+ buf_page_t* bpage, /*!< in: block, must contain a file page and
+ be in a state where it can be freed; there
+ may or may not be a hash index to the page */
+ ibool zip) /*!< in: TRUE if should remove also the
+ compressed page of an uncompressed page */
+{
+ const buf_page_t* hashed_bpage;
+ ut_ad(bpage);
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+ ut_a(bpage->buf_fix_count == 0);
+
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in
+ buf_page_t. On other systems, Valgrind could complain
+ about uninitialized pad bytes. */
+ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
+ buf_LRU_remove_block(bpage);
+
+ buf_pool->freed_page_clock += 1;
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_FILE_PAGE:
+ UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t));
+ UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ buf_block_modify_clock_inc((buf_block_t*) bpage);
+ if (bpage->zip.data) {
+ const page_t* page = ((buf_block_t*) bpage)->frame;
+ const ulint zip_size
+ = page_zip_get_size(&bpage->zip);
+
+ ut_a(!zip || bpage->oldest_modification == 0);
+
+ switch (UNIV_EXPECT(fil_page_get_type(page),
+ FIL_PAGE_INDEX)) {
+ case FIL_PAGE_TYPE_ALLOCATED:
+ case FIL_PAGE_INODE:
+ case FIL_PAGE_IBUF_BITMAP:
+ case FIL_PAGE_TYPE_FSP_HDR:
+ case FIL_PAGE_TYPE_XDES:
+ /* These are essentially uncompressed pages. */
+ if (!zip) {
+ /* InnoDB writes the data to the
+ uncompressed page frame. Copy it
+ to the compressed page, which will
+ be preserved. */
+ memcpy(bpage->zip.data, page,
+ zip_size);
+ }
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ case FIL_PAGE_TYPE_ZBLOB2:
+ break;
+ case FIL_PAGE_INDEX:
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(&bpage->zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ break;
+ default:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ERROR: The compressed page"
+ " to be evicted seems corrupt:", stderr);
+ ut_print_buf(stderr, page, zip_size);
+ fputs("\nInnoDB: Possibly older version"
+ " of the page:", stderr);
+ ut_print_buf(stderr, bpage->zip.data,
+ zip_size);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ break;
+ }
+ /* fall through */
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_a(bpage->oldest_modification == 0);
+ UNIV_MEM_ASSERT_W(bpage->zip.data,
+ page_zip_get_size(&bpage->zip));
+ break;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ }
+
+ hashed_bpage = buf_page_hash_get(bpage->space, bpage->offset);
+
+ if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
+ fprintf(stderr,
+ "InnoDB: Error: page %lu %lu not found"
+ " in the hash table\n",
+ (ulong) bpage->space,
+ (ulong) bpage->offset);
+ if (hashed_bpage) {
+ fprintf(stderr,
+ "InnoDB: In hash table we find block"
+ " %p of %lu %lu which is not %p\n",
+ (const void*) hashed_bpage,
+ (ulong) hashed_bpage->space,
+ (ulong) hashed_bpage->offset,
+ (const void*) bpage);
+ }
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+ mutex_exit(buf_page_get_mutex(bpage));
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ buf_print();
+ buf_LRU_print();
+ buf_validate();
+ buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+ ut_error;
+ }
+
+ ut_ad(!bpage->in_zip_hash);
+ ut_ad(bpage->in_page_hash);
+ ut_d(bpage->in_page_hash = FALSE);
+ HASH_DELETE(buf_page_t, hash, buf_pool->page_hash,
+ buf_page_address_fold(bpage->space, bpage->offset),
+ bpage);
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->in_flush_list);
+ ut_ad(!bpage->in_LRU_list);
+ ut_a(bpage->zip.data);
+ ut_a(buf_page_get_zip_size(bpage));
+
+ UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage);
+
+ mutex_exit(&buf_pool_zip_mutex);
+ //buf_pool_mutex_exit_forbid();
+ buf_buddy_free(bpage->zip.data,
+ page_zip_get_size(&bpage->zip), TRUE);
+ buf_buddy_free(bpage, sizeof(*bpage), TRUE);
+ //buf_pool_mutex_exit_allow();
+ UNIV_MEM_UNDESC(bpage);
+ return(BUF_BLOCK_ZIP_FREE);
+
+ case BUF_BLOCK_FILE_PAGE:
+ memset(((buf_block_t*) bpage)->frame
+ + FIL_PAGE_OFFSET, 0xff, 4);
+ memset(((buf_block_t*) bpage)->frame
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+ UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+ UNIV_PAGE_SIZE);
+ buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
+
+ if (zip && bpage->zip.data) {
+ /* Free the compressed page. */
+ void* data = bpage->zip.data;
+ bpage->zip.data = NULL;
+
+ ut_ad(!bpage->in_free_list);
+ ut_ad(!bpage->in_flush_list);
+ ut_ad(!bpage->in_LRU_list);
+ mutex_exit(&((buf_block_t*) bpage)->mutex);
+ //buf_pool_mutex_exit_forbid();
+ buf_buddy_free(data, page_zip_get_size(&bpage->zip), TRUE);
+ //buf_pool_mutex_exit_allow();
+ mutex_enter(&((buf_block_t*) bpage)->mutex);
+ page_zip_set_size(&bpage->zip, 0);
+ }
+
+ return(BUF_BLOCK_REMOVE_HASH);
+
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+ return(BUF_BLOCK_ZIP_FREE);
+}
+
+/******************************************************************//**
+Puts a file page that has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+ buf_block_t* block, /*!< in: block, must contain a file page and
+ be in a state where it can be freed */
+ ibool have_page_hash_mutex)
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&block->mutex));
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
+}
+
+/**********************************************************************//**
+Updates buf_LRU_old_ratio.
+@return updated old_pct */
+UNIV_INTERN
+uint
+buf_LRU_old_ratio_update(
+/*=====================*/
+ uint old_pct,/*!< in: Reserve this percentage of
+ the buffer pool for "old" blocks. */
+ ibool adjust) /*!< in: TRUE=adjust the LRU list;
+ FALSE=just assign buf_LRU_old_ratio
+ during the initialization of InnoDB */
+{
+ uint ratio;
+
+ ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+ if (ratio < BUF_LRU_OLD_RATIO_MIN) {
+ ratio = BUF_LRU_OLD_RATIO_MIN;
+ } else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
+ ratio = BUF_LRU_OLD_RATIO_MAX;
+ }
+
+ if (adjust) {
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ if (ratio != buf_LRU_old_ratio) {
+ buf_LRU_old_ratio = ratio;
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU)
+ >= BUF_LRU_OLD_MIN_LEN) {
+ buf_LRU_old_adjust_len();
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ } else {
+ buf_LRU_old_ratio = ratio;
+ }
+
+ /* the reverse of
+ ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
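+	/* For example, assuming BUF_LRU_OLD_RATIO_DIV is 1024: old_pct == 37
+	gives ratio == 378 above, and this expression maps 378 back to 37. */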
+ return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
+}
+
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void)
+/*=====================*/
+{
+ buf_LRU_stat_t* item;
+
+ /* If we haven't started eviction yet then don't update stats. */
+ if (buf_pool->freed_page_clock == 0) {
+ goto func_exit;
+ }
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
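+	/* buf_LRU_stat_arr[] is used as a circular buffer of per-interval
+	statistics; buf_LRU_stat_sum holds a running sum over the last
+	BUF_LRU_STAT_N_INTERVAL intervals and is updated incrementally by
+	adding the current interval and subtracting the one overwritten. */
+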
+ /* Update the index. */
+ item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+ buf_LRU_stat_arr_ind++;
+ buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+
+ /* Add the current value and subtract the obsolete entry. */
+ buf_LRU_stat_sum.io += buf_LRU_stat_cur.io - item->io;
+ buf_LRU_stat_sum.unzip += buf_LRU_stat_cur.unzip - item->unzip;
+
+ /* Put current entry in the array. */
+ memcpy(item, &buf_LRU_stat_cur, sizeof *item);
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+func_exit:
+ /* Clear the current entry. */
+ memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
+
+/********************************************************************//**
+Dumps the LRU page list to the LRU dump file defined below. */
+#define LRU_DUMP_FILE "ib_lru_dump"
+
+UNIV_INTERN
+ibool
+buf_LRU_file_dump(void)
+/*===================*/
+{
+ os_file_t dump_file = (os_file_t) -1;
+ ibool success;
+ byte* buffer_base = NULL;
+ byte* buffer = NULL;
+ buf_page_t* bpage;
+ ulint buffers;
+ ulint offset;
+ ibool ret = FALSE;
+ ulint i;
+
+ for (i = 0; i < srv_n_data_files; i++) {
+ if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) {
+ fprintf(stderr,
+ " InnoDB: The name '%s' seems to be used for"
+ " innodb_data_file_path. Dumping LRU list is not"
+ " done for safeness.\n", LRU_DUMP_FILE);
+ goto end;
+ }
+ }
+
+ buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
+ buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
+ if (!buffer) {
+ fprintf(stderr,
+ " InnoDB: cannot allocate buffer.\n");
+ goto end;
+ }
+
+ dump_file = os_file_create(LRU_DUMP_FILE, OS_FILE_OVERWRITE,
+ OS_FILE_NORMAL, OS_DATA_FILE, &success);
+ if (!success) {
+ os_file_get_last_error(TRUE);
+ fprintf(stderr,
+ " InnoDB: cannot open %s\n", LRU_DUMP_FILE);
+ goto end;
+ }
+
+ mutex_enter(&LRU_list_mutex);
+ bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ buffers = offset = 0;
+ while (bpage != NULL) {
+ if (offset == 0) {
+ memset(buffer, 0, UNIV_PAGE_SIZE);
+ }
+
+ mach_write_to_4(buffer + offset * 4, bpage->space);
+ offset++;
+ mach_write_to_4(buffer + offset * 4, bpage->offset);
+ offset++;
+
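+		/* Each LRU entry is stored as two 4-byte big-endian values:
+		(space id, page number). Once a full page worth of entries
+		has accumulated, write it out; the 64-bit file offset is
+		passed to os_file_write() as separate low and high 32-bit
+		words. */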
+ if (offset == UNIV_PAGE_SIZE/4) {
+ success = os_file_write(LRU_DUMP_FILE, dump_file, buffer,
+ (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
+ (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
+ UNIV_PAGE_SIZE);
+ if (!success) {
+ mutex_exit(&LRU_list_mutex);
+ fprintf(stderr,
+ " InnoDB: cannot write page %lu of %s\n",
+ buffers, LRU_DUMP_FILE);
+ goto end;
+ }
+ buffers++;
+ offset = 0;
+ }
+
+ bpage = UT_LIST_GET_PREV(LRU, bpage);
+ }
+ mutex_exit(&LRU_list_mutex);
+
+ if (offset == 0) {
+ memset(buffer, 0, UNIV_PAGE_SIZE);
+ }
+
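+	/* Append a (0xFFFFFFFF, 0xFFFFFFFF) sentinel so that
+	buf_LRU_file_restore() can detect the end of the dumped list. */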
+ mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
+ offset++;
+ mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);
+ offset++;
+
+ success = os_file_write(LRU_DUMP_FILE, dump_file, buffer,
+ (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
+ (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
+ UNIV_PAGE_SIZE);
+ if (!success) {
+ goto end;
+ }
+
+ ret = TRUE;
+end:
+ if (dump_file != (os_file_t) -1)
+ os_file_close(dump_file);
+ if (buffer_base)
+ ut_free(buffer_base);
+
+ return(ret);
+}
+
+typedef struct {
+ ib_uint32_t space_id;
+ ib_uint32_t page_no;
+} dump_record_t;
+
+static int dump_record_cmp(const void *a, const void *b)
+{
+ const dump_record_t *rec1 = (dump_record_t *) a;
+ const dump_record_t *rec2 = (dump_record_t *) b;
+
+ if (rec1->space_id < rec2->space_id)
+ return -1;
+ if (rec1->space_id > rec2->space_id)
+ return 1;
+ if (rec1->page_no < rec2->page_no)
+ return -1;
+ return rec1->page_no > rec2->page_no;
+}
+
+/********************************************************************//**
+Reads pages into the buffer pool based on the LRU dump file. */
+UNIV_INTERN
+ibool
+buf_LRU_file_restore(void)
+/*======================*/
+{
+ os_file_t dump_file = (os_file_t) -1;
+ ibool success;
+ byte* buffer_base = NULL;
+ byte* buffer = NULL;
+ ulint buffers;
+ ulint offset;
+ ulint reads = 0;
+ ulint req = 0;
+ ibool terminated = FALSE;
+ ibool ret = FALSE;
+ dump_record_t* records= 0;
+ ulint size;
+ ulint size_high;
+ ulint length;
+
+ dump_file = os_file_create_simple_no_error_handling(
+ LRU_DUMP_FILE, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ if (!success || !os_file_get_size(dump_file, &size, &size_high)) {
+ os_file_get_last_error(TRUE);
+ fprintf(stderr,
+ " InnoDB: cannot open %s\n", LRU_DUMP_FILE);
+ goto end;
+ }
+ if (size == 0 || size_high > 0 || size % 8) {
+ fprintf(stderr, " InnoDB: broken LRU dump file\n");
+ goto end;
+ }
+ buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
+ buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
+ records = ut_malloc(size);
+ if (!buffer || !records) {
+ fprintf(stderr,
+ " InnoDB: cannot allocate buffer.\n");
+ goto end;
+ }
+
+ buffers = 0;
+ length = 0;
+ while (!terminated) {
+ success = os_file_read(dump_file, buffer,
+ (buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
+ (buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
+ UNIV_PAGE_SIZE);
+ if (!success) {
+ fprintf(stderr,
+ " InnoDB: cannot read page %lu of %s,"
+ " or meet unexpected terminal.\n",
+ buffers, LRU_DUMP_FILE);
+ goto end;
+ }
+
+ for (offset = 0; offset < UNIV_PAGE_SIZE/4; offset += 2) {
+ ulint space_id;
+ ulint page_no;
+
+ space_id = mach_read_from_4(buffer + offset * 4);
+ page_no = mach_read_from_4(buffer + (offset + 1) * 4);
+ if (space_id == 0xFFFFFFFFUL
+ || page_no == 0xFFFFFFFFUL) {
+ terminated = TRUE;
+ break;
+ }
+
+ records[length].space_id = space_id;
+ records[length].page_no = page_no;
+ length++;
+ if (length * 8 >= size) {
+ fprintf(stderr,
+ " InnoDB: could not find the "
+ "end-of-file marker after reading "
+ "the expected %lu bytes from the "
+ "LRU dump file.\n"
+ " InnoDB: this could be caused by a "
+ "broken or incomplete file.\n"
+ " InnoDB: trying to process what has "
+ "been read so far.\n",
+ size);
+ terminated= TRUE;
+ break;
+ }
+ }
+ buffers++;
+ }
+
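+	/* Sort the records by (space_id, page_no) so that the read requests
+	below are issued in ascending page order within each tablespace,
+	keeping the restore I/O mostly sequential. */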
+ qsort(records, length, sizeof(dump_record_t), dump_record_cmp);
+
+ for (offset = 0; offset < length; offset++) {
+ ulint space_id;
+ ulint page_no;
+ ulint zip_size;
+ ulint err;
+ ib_int64_t tablespace_version;
+
+ space_id = records[offset].space_id;
+ page_no = records[offset].page_no;
+
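+		/* Every 16 records, wake the aio handler threads and flush
+		the free margin so that queued read requests get serviced and
+		the buffer pool does not run out of free blocks during the
+		restore. */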
+ if (offset % 16 == 15) {
+ os_aio_simulated_wake_handler_threads();
+ buf_flush_free_margin(FALSE);
+ }
+
+ zip_size = fil_space_get_zip_size(space_id);
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ continue;
+ }
+
+ if (fil_area_is_exist(space_id, zip_size, page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE)) {
+
+ tablespace_version = fil_space_get_version(space_id);
+
+ req++;
+ reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+ | OS_AIO_SIMULATED_WAKE_LATER,
+ space_id, zip_size, TRUE,
+ tablespace_version, page_no, NULL);
+ buf_LRU_stat_inc_io();
+ }
+ }
+
+ os_aio_simulated_wake_handler_threads();
+ buf_flush_free_margin(FALSE);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: reading pages based on the dumped LRU list was done."
+ " (requested: %lu, read: %lu)\n", req, reads);
+ ret = TRUE;
+end:
+ if (dump_file != (os_file_t) -1)
+ os_file_close(dump_file);
+ if (buffer_base)
+ ut_free(buffer_base);
+ if (records)
+ ut_free(records);
+
+ return(ret);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU list.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void)
+/*==================*/
+{
+ buf_page_t* bpage;
+ buf_block_t* block;
+ ulint old_len;
+ ulint new_len;
+
+ ut_ad(buf_pool);
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
+
+ ut_a(buf_pool->LRU_old);
+ old_len = buf_pool->LRU_old_len;
+ new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
+ * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+ UT_LIST_GET_LEN(buf_pool->LRU)
+ - (BUF_LRU_OLD_TOLERANCE
+ + BUF_LRU_NON_OLD_MIN_LEN));
+ ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
+ ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
+ }
+
+ UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
+ ut_ad(ut_list_node_313->in_LRU_list));
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+
+ old_len = 0;
+
+ while (bpage != NULL) {
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list
+ == buf_page_belongs_to_unzip_LRU(bpage));
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ break;
+ }
+
+ if (buf_page_is_old(bpage)) {
+ const buf_page_t* prev
+ = UT_LIST_GET_PREV(LRU, bpage);
+ const buf_page_t* next
+ = UT_LIST_GET_NEXT(LRU, bpage);
+
+ if (!old_len++) {
+ ut_a(buf_pool->LRU_old == bpage);
+ } else {
+ ut_a(!prev || buf_page_is_old(prev));
+ }
+
+ ut_a(!next || buf_page_is_old(next));
+ }
+
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ ut_a(buf_pool->LRU_old_len == old_len);
+
+ mutex_exit(&LRU_list_mutex);
+ mutex_enter(&free_list_mutex);
+
+ UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free,
+ ut_ad(ut_list_node_313->in_free_list));
+
+ for (bpage = UT_LIST_GET_FIRST(buf_pool->free);
+ bpage != NULL;
+ bpage = UT_LIST_GET_NEXT(free, bpage)) {
+
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
+ }
+
+ mutex_exit(&free_list_mutex);
+ mutex_enter(&LRU_list_mutex);
+
+ UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
+ ut_ad(ut_list_node_313->in_unzip_LRU_list
+ && ut_list_node_313->page.in_LRU_list));
+
+ for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
+ block;
+ block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
+
+ ut_ad(block->in_unzip_LRU_list);
+ ut_ad(block->page.in_LRU_list);
+ ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_print(void)
+/*===============*/
+{
+ const buf_page_t* bpage;
+
+ ut_ad(buf_pool);
+ //buf_pool_mutex_enter();
+ mutex_enter(&LRU_list_mutex);
+
+ bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+
+ while (bpage != NULL) {
+
+ fprintf(stderr, "BLOCK space %lu page %lu ",
+ (ulong) buf_page_get_space(bpage),
+ (ulong) buf_page_get_page_no(bpage));
+
+ if (buf_page_is_old(bpage)) {
+ fputs("old ", stderr);
+ }
+
+ if (bpage->buf_fix_count) {
+ fprintf(stderr, "buffix count %lu ",
+ (ulong) bpage->buf_fix_count);
+ }
+
+ if (buf_page_get_io_fix(bpage)) {
+ fprintf(stderr, "io_fix %lu ",
+ (ulong) buf_page_get_io_fix(bpage));
+ }
+
+ if (bpage->oldest_modification) {
+ fputs("modif. ", stderr);
+ }
+
+ switch (buf_page_get_state(bpage)) {
+ const byte* frame;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = buf_block_get_frame((buf_block_t*) bpage);
+ fprintf(stderr, "\ntype %lu"
+ " index id %lu\n",
+ (ulong) fil_page_get_type(frame),
+ (ulong) ut_dulint_get_low(
+ btr_page_get_index_id(frame)));
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ frame = bpage->zip.data;
+ fprintf(stderr, "\ntype %lu size %lu"
+ " index id %lu\n",
+ (ulong) fil_page_get_type(frame),
+ (ulong) buf_page_get_zip_size(bpage),
+ (ulong) ut_dulint_get_low(
+ btr_page_get_index_id(frame)));
+ break;
+
+ default:
+ fprintf(stderr, "\n!state %lu!\n",
+ (ulong) buf_page_get_state(bpage));
+ break;
+ }
+
+ bpage = UT_LIST_GET_NEXT(LRU, bpage);
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&LRU_list_mutex);
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c
new file mode 100644
index 00000000000..59de70d9a8a
--- /dev/null
+++ b/storage/xtradb/buf/buf0rea.c
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.c
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0rea.h"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+
+/** The linear read-ahead area size */
+#define BUF_READ_AHEAD_LINEAR_AREA BUF_READ_AHEAD_AREA
+
+/** If there are buf_pool->curr_size per the number below pending reads, then
+read-ahead is not done: this is to prevent flooding the buffer pool with
+i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT 2
+
+/********************************************************************//**
+Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+@return 1 if a read request was queued, 0 if the page already resided
+in buf_pool, or if the page is in the doublewrite buffer blocks in
+which case it is never read into the pool, or if the tablespace does
+not exist or is being dropped */
+UNIV_INTERN
+ulint
+buf_read_page_low(
+/*==============*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+ trying to read from a non-existent tablespace, or a
+ tablespace which is just now being dropped */
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
+ ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
+ at read-ahead functions) */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version, /*!< in: if the space memory object has
+ this timestamp different from what we are giving here,
+ treat the tablespace as dropped; this is a timestamp we
+ use to stop dangling page reads from a tablespace
+ which we have DISCARDed + IMPORTed back */
+ ulint offset, /*!< in: page number */
+ trx_t* trx)
+{
+ buf_page_t* bpage;
+ ulint wake_later;
+
+ *err = DB_SUCCESS;
+
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+ mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
+
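+	/* Pages that lie inside the doublewrite buffer blocks are never
+	read into the buffer pool: refuse such a read request. */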
+ if (trx_doublewrite
+ && (space == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
+ && ( (offset >= trx_doublewrite->block1
+ && offset < trx_doublewrite->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (offset >= trx_doublewrite->block2
+ && offset < trx_doublewrite->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: trying to read"
+ " doublewrite buffer page %lu\n",
+ (ulong) offset);
+
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* Trx sys header is so low in the latching order that we play
+ safe and do not leave the i/o-completion to an asynchronous
+ i/o-thread. Ibuf bitmap pages must always be read with
+		synchronous i/o, to make sure they do not get involved in
+ thread deadlocks. */
+
+ sync = TRUE;
+ }
+
+ /* The following call will also check if the tablespace does not exist
+ or is being dropped; if we succeed in initing the page in the buffer
+ pool for read, then DISCARD cannot proceed until the read has
+ completed */
+ bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
+ tablespace_version, offset);
+ if (bpage == NULL) {
+ /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
+ if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
+ /* hashed log recs must be treated here */
+ recv_addr_t* recv_addr;
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_log_recs == FALSE) {
+ mutex_exit(&(recv_sys->mutex));
+ goto not_to_recover;
+ }
+
+ /* recv_get_fil_addr_struct() */
+ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+ hash_calc_hash(ut_fold_ulint_pair(space, offset),
+ recv_sys->addr_hash));
+ while (recv_addr) {
+ if ((recv_addr->space == space)
+ && (recv_addr->page_no == offset)) {
+ break;
+ }
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ if ((recv_addr == NULL)
+ || (recv_addr->state == RECV_BEING_PROCESSED)
+ || (recv_addr->state == RECV_PROCESSED)) {
+ mutex_exit(&(recv_sys->mutex));
+ goto not_to_recover;
+ }
+
+ fprintf(stderr, " (cannot find space: %lu)", space);
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+
+ mutex_exit(&(recv_sys->mutex));
+ }
+not_to_recover:
+
+ return(0);
+ }
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Posting read request for page %lu, sync %lu\n",
+ (ulong) offset,
+ (ulong) sync);
+ }
+#endif
+
+ ut_ad(buf_page_in_file(bpage));
+
+ if (zip_size) {
+ *err = _fil_io(OS_FILE_READ | wake_later,
+ sync, space, zip_size, offset, 0, zip_size,
+ bpage->zip.data, bpage, trx);
+ } else {
+ ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+
+ *err = _fil_io(OS_FILE_READ | wake_later,
+ sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
+ ((buf_block_t*) bpage)->frame, bpage, trx);
+ }
+
+ if (srv_pass_corrupt_table) {
+ if (*err != DB_SUCCESS) {
+ bpage->is_corrupt = TRUE;
+ }
+ } else {
+ ut_a(*err == DB_SUCCESS);
+ }
+
+ if (sync) {
+ /* The i/o is already completed when we arrive from
+ fil_read */
+ buf_page_io_complete(bpage, trx);
+ }
+
+ return(1);
+}
+
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number */
+ trx_t* trx)
+{
+ ib_int64_t tablespace_version;
+ ulint count;
+ ulint err;
+
+ tablespace_version = fil_space_get_version(space);
+
+ /* We do the i/o in the synchronous aio mode to save thread
+ switches: hence TRUE */
+
+ count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ zip_size, FALSE,
+ tablespace_version, offset, trx);
+ srv_buf_pool_reads += count;
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: trying to access"
+ " tablespace %lu page no. %lu,\n"
+ "InnoDB: but the tablespace does not exist"
+ " or is just being dropped.\n",
+ (ulong) space, (ulong) offset);
+ }
+
+ /* Flush pages from the end of the LRU list if necessary */
+ buf_flush_free_margin(FALSE);
+
+ /* Increment number of I/O operations used for LRU policy. */
+ buf_LRU_stat_inc_io();
+
+ return(count > 0);
+}
+
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? This is not a problem: before applying read-ahead we
+check that the area to read is within the span of the space; if it is
+not, read-ahead is not applied. An uninitialized value may result in a
+useless read operation, but only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number of a page; NOTE: the current thread
+ must want access to this page (see NOTE 3 above) */
+ trx_t* trx)
+{
+ ib_int64_t tablespace_version;
+ buf_page_t* bpage;
+ buf_frame_t* frame;
+ buf_page_t* pred_bpage = NULL;
+ ulint pred_offset;
+ ulint succ_offset;
+ ulint count;
+ int asc_or_desc;
+ ulint new_offset;
+ ulint fail_count;
+ ulint ibuf_mode;
+ ulint low, high;
+ ulint err;
+ ulint i;
+ const ulint buf_read_ahead_linear_area
+ = BUF_READ_AHEAD_LINEAR_AREA;
+ ulint threshold;
+
+ if (!(srv_read_ahead & 2)) {
+ return(0);
+ }
+
+ if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+ }
+
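+	/* Compute the boundaries of the linear read-ahead area containing
+	the given page. For example, if the area size is 64 pages and
+	offset == 127, then low == 64 and high == 128, and offset is a
+	border page (the last page of its area). */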
+ low = (offset / buf_read_ahead_linear_area)
+ * buf_read_ahead_linear_area;
+ high = (offset / buf_read_ahead_linear_area + 1)
+ * buf_read_ahead_linear_area;
+
+ if ((offset != low) && (offset != high - 1)) {
+ /* This is not a border page of the area: return */
+
+ return(0);
+ }
+
+ if (ibuf_bitmap_page(zip_size, offset)
+ || trx_sys_hdr_page(space, offset)) {
+
+ /* If it is an ibuf bitmap page or trx sys hdr, we do
+ no read-ahead, as that could break the ibuf page access
+ order */
+
+ return(0);
+ }
+
+	/* Remember the tablespace version before we ask for the tablespace size
+ below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+ do not try to read outside the bounds of the tablespace! */
+
+ tablespace_version = fil_space_get_version(space);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&buf_pool_mutex);
+
+ if (high > fil_space_get_size(space)) {
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+ /* The area is not whole, return */
+
+ return(0);
+ }
+
+ if (buf_pool->n_pend_reads
+ > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+ //buf_pool_mutex_exit();
+ mutex_exit(&buf_pool_mutex);
+
+ return(0);
+ }
+ mutex_exit(&buf_pool_mutex);
+
+ /* Check that almost all pages in the area have been accessed; if
+ offset == low, the accesses must be in a descending order, otherwise,
+ in an ascending order. */
+
+ asc_or_desc = 1;
+
+ if (offset == low) {
+ asc_or_desc = -1;
+ }
+
+ /* How many out of order accessed pages can we ignore
+ when working out the access pattern for linear readahead */
+ threshold = ut_min((64 - srv_read_ahead_threshold),
+ BUF_READ_AHEAD_AREA);
+
+ fail_count = 0;
+
+ rw_lock_s_lock(&page_hash_latch);
+ for (i = low; i < high; i++) {
+ bpage = buf_page_hash_get(space, i);
+
+ if ((bpage == NULL) || !buf_page_is_accessed(bpage)) {
+ /* Not accessed */
+ fail_count++;
+
+ } else if (pred_bpage) {
+ /* Note that buf_page_is_accessed() returns
+ the time of the first access. If some blocks
+ of the extent existed in the buffer pool at
+ the time of a linear access pattern, the first
+ access times may be nonmonotonic, even though
+ the latest access times were linear. The
+			threshold (srv_read_ahead_threshold) should help
+ a little against this. */
+ int res = ut_ulint_cmp(
+ buf_page_is_accessed(bpage),
+ buf_page_is_accessed(pred_bpage));
+ /* Accesses not in the right order */
+ if (res != 0 && res != asc_or_desc) {
+ fail_count++;
+ }
+ }
+
+ if (fail_count > threshold) {
+ /* Too many failures: return */
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+ return(0);
+ }
+
+ if (bpage && buf_page_is_accessed(bpage)) {
+ pred_bpage = bpage;
+ }
+ }
+
+ /* If we got this far, we know that enough pages in the area have
+ been accessed in the right order: linear read-ahead can be sensible */
+
+ bpage = buf_page_hash_get(space, offset);
+
+ if (bpage == NULL) {
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ return(0);
+ }
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ frame = bpage->zip.data;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ frame = ((buf_block_t*) bpage)->frame;
+ break;
+ default:
+ ut_error;
+ break;
+ }
+
+ /* Read the natural predecessor and successor page addresses from
+ the page; NOTE that because the calling thread may have an x-latch
+ on the page, we do not acquire an s-latch on the page, this is to
+ prevent deadlocks. Even if we read values which are nonsense, the
+ algorithm will work. */
+
+ pred_offset = fil_page_get_prev(frame);
+ succ_offset = fil_page_get_next(frame);
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ if ((offset == low) && (succ_offset == offset + 1)) {
+
+ /* This is ok, we can continue */
+ new_offset = pred_offset;
+
+ } else if ((offset == high - 1) && (pred_offset == offset - 1)) {
+
+ /* This is ok, we can continue */
+ new_offset = succ_offset;
+ } else {
+ /* Successor or predecessor not in the right order */
+
+ return(0);
+ }
+
+ low = (new_offset / buf_read_ahead_linear_area)
+ * buf_read_ahead_linear_area;
+ high = (new_offset / buf_read_ahead_linear_area + 1)
+ * buf_read_ahead_linear_area;
+
+ if ((new_offset != low) && (new_offset != high - 1)) {
+ /* This is not a border page of the area: return */
+
+ return(0);
+ }
+
+ if (high > fil_space_get_size(space)) {
+ /* The area is not whole, return */
+
+ return(0);
+ }
+
+ /* If we got this far, read-ahead can be sensible: do it */
+
+ if (ibuf_inside()) {
+ ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
+ } else {
+ ibuf_mode = BUF_READ_ANY_PAGE;
+ }
+
+ count = 0;
+
+ /* Since Windows XP seems to schedule the i/o handler thread
+ very eagerly, and consequently it does not wait for the
+ full read batch to be posted, we use special heuristics here */
+
+ os_aio_simulated_put_read_threads_to_sleep();
+
+ for (i = low; i < high; i++) {
+ /* It is only sensible to do read-ahead in the non-sync
+ aio mode: hence FALSE as the first parameter */
+
+ if (!ibuf_bitmap_page(zip_size, i)) {
+ count += buf_read_page_low(
+ &err, FALSE,
+ ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
+ space, zip_size, FALSE, tablespace_version, i, trx);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: in"
+ " linear readahead trying to access\n"
+ "InnoDB: tablespace %lu page %lu,\n"
+ "InnoDB: but the tablespace does not"
+ " exist or is just being dropped.\n",
+ (ulong) space, (ulong) i);
+ }
+ }
+ }
+
+ /* In simulated aio we wake the aio handler threads only after
+ queuing all aio requests, in native aio the following call does
+ nothing: */
+
+ os_aio_simulated_wake_handler_threads();
+
+ /* Flush pages from the end of the LRU list if necessary */
+ buf_flush_free_margin(FALSE);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints && (count > 0)) {
+ fprintf(stderr,
+ "LINEAR read-ahead space %lu offset %lu pages %lu\n",
+ (ulong) space, (ulong) offset, (ulong) count);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Read ahead is considered one I/O operation for the purpose of
+ LRU policy decision. */
+ buf_LRU_stat_inc_io();
+
+ buf_pool->stat.n_ra_pages_read += count;
+ return(count);
+}
+
+/********************************************************************//**
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract the insert buffer tree. Technically, this function is like
+a read-ahead function. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ const ulint* space_ids, /*!< in: array of space ids */
+ const ib_int64_t* space_versions,/*!< in: the spaces must have
+ this version number
+ (timestamp), otherwise we
+ discard the read; we use this
+ to cancel reads if DISCARD +
+ IMPORT may have changed the
+ tablespace size */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored) /*!< in: number of elements
+ in the arrays */
+{
+ ulint i;
+
+ ut_ad(!ibuf_inside());
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(n_stored < UNIV_PAGE_SIZE);
+#endif
+ while (buf_pool->n_pend_reads
+ > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+ os_thread_sleep(500000);
+ }
+
+ for (i = 0; i < n_stored; i++) {
+ ulint zip_size = fil_space_get_zip_size(space_ids[i]);
+ ulint err;
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+
+ goto tablespace_deleted;
+ }
+
+ buf_read_page_low(&err, sync && (i + 1 == n_stored),
+ BUF_READ_ANY_PAGE, space_ids[i],
+ zip_size, TRUE, space_versions[i],
+ page_nos[i], NULL);
+
+ if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
+tablespace_deleted:
+ /* We have deleted or are deleting the single-table
+ tablespace: remove the entries for that page */
+
+ ibuf_merge_or_delete_for_page(NULL, space_ids[i],
+ page_nos[i],
+ zip_size, FALSE);
+ }
+ }
+
+ os_aio_simulated_wake_handler_threads();
+
+ /* Flush pages from the end of the LRU list if necessary */
+ buf_flush_free_margin(FALSE);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Ibuf merge read-ahead space %lu pages %lu\n",
+ (ulong) space_ids[0], (ulong) n_stored);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in
+ bytes, or 0 */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored) /*!< in: number of page numbers
+ in the array */
+{
+ ib_int64_t tablespace_version;
+ ulint count;
+ ulint err;
+ ulint i;
+
+ zip_size = fil_space_get_zip_size(space);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* It is a single table tablespace and the .ibd file is
+ missing: do nothing */
+
+		/* The log records for the missing tablespace should still be
+		dealt with here, for the same reason as in
+		http://bugs.mysql.com/bug.php?id=43948 */
+
+ if (recv_recovery_is_on()) {
+ recv_addr_t* recv_addr;
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_log_recs == FALSE) {
+ mutex_exit(&(recv_sys->mutex));
+ goto not_to_recover;
+ }
+
+ for (i = 0; i < n_stored; i++) {
+ /* recv_get_fil_addr_struct() */
+ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+ hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
+ recv_sys->addr_hash));
+ while (recv_addr) {
+ if ((recv_addr->space == space)
+ && (recv_addr->page_no == page_nos[i])) {
+ break;
+ }
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ if ((recv_addr == NULL)
+ || (recv_addr->state == RECV_BEING_PROCESSED)
+ || (recv_addr->state == RECV_PROCESSED)) {
+ continue;
+ }
+
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+
+ fprintf(stderr, " (cannot find space: %lu)", space);
+ }
+not_to_recover:
+
+ return;
+ }
+
+ tablespace_version = fil_space_get_version(space);
+
+ for (i = 0; i < n_stored; i++) {
+
+ count = 0;
+
+ os_aio_print_debug = FALSE;
+
+ while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
+
+ os_aio_simulated_wake_handler_threads();
+ os_thread_sleep(10000);
+
+ count++;
+
+ if (count > 1000) {
+ fprintf(stderr,
+ "InnoDB: Error: InnoDB has waited for"
+ " 10 seconds for pending\n"
+ "InnoDB: reads to the buffer pool to"
+ " be finished.\n"
+ "InnoDB: Number of pending reads %lu,"
+ " pending pread calls %lu\n",
+ (ulong) buf_pool->n_pend_reads,
+ (ulong)os_file_n_pending_preads);
+
+ os_aio_print_debug = TRUE;
+ }
+ }
+
+ os_aio_print_debug = FALSE;
+
+ if ((i + 1 == n_stored) && sync) {
+ buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+ zip_size, TRUE, tablespace_version,
+ page_nos[i], NULL);
+ } else {
+ buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+ | OS_AIO_SIMULATED_WAKE_LATER,
+ space, zip_size, TRUE,
+ tablespace_version, page_nos[i], NULL);
+ }
+ }
+
+ os_aio_simulated_wake_handler_threads();
+
+ /* Flush pages from the end of the LRU list if necessary */
+ buf_flush_free_margin(FALSE);
+
+#ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr,
+ "Recovery applies read-ahead pages %lu\n",
+ (ulong) n_stored);
+ }
+#endif /* UNIV_DEBUG */
+}
diff --git a/storage/xtradb/build/debian/README.Maintainer b/storage/xtradb/build/debian/README.Maintainer
new file mode 100644
index 00000000000..9554bdd7c62
--- /dev/null
+++ b/storage/xtradb/build/debian/README.Maintainer
@@ -0,0 +1,116 @@
+
+###########################
+## FIXME for 5.1 ##
+###########################
+
+* put this trigger-recreation thing into the init scripts -- what?!
+* Let debian-l10n-english review all template changes before the translators start.
+* Mark debconf translations as obsolete with debconf-updatepo.
+
+###########################################################################
+# Here is some information that is only of interest to the current and    #
+# following Debian maintainers of MySQL. #
+###########################################################################
+
+The debian/ directory is under SVN control; see debian/control for the URL.
+
+#
+# Preparing a new version
+#
+The new orig.tar.gz (without non-free documentation) is created in /tmp/ when
+running this command:
+
+debian/rules get-orig-source
+
+#
+# mysqlreport
+#
+The author's e-mail address is <public@codenode.com>.
+
+#
+# Remarks to dependencies
+#
+libwrap0-dev (>= 7.6-8.3)
+  According to bug report 114582 there were build problems on
+ IA-64/sid with at least two prior versions.
+psmisc
+ /usr/bin/killall in the initscript
+
+zlib1g in libmysqlclient-dev:
+  "mysql_config --libs" adds "-lz"
+
+Build-Dep:
+
+debhelper (>=4.1.16):
+ See po-debconf(7).
+
+autoconf (>= 2.13-20), automake1.7
+ Try to get rid of them.
+
+doxygen, tetex-bin, tetex-extra, gs
+ for ndb/docs/*tex
+
+#
+# Remarks to the start scripts
+#
+
+## initscripts rely on mysqladmin from a different package
+We have the problem that "/etc/init.d/mysql stop" relies on mysqladmin, which
+is in another package (mysql-client), and on passwordless access that may only
+be available if the user has configured /root/.my.cnf. Can this be a problem?
+* normal mode: no, because the user is required to have it. Else:
+* purge/remove: no, same as normal mode
+* upgrade: no, same as normal mode
+* first install: no, it depends on mysql-client which at least is unpacked
+                 so mysqladmin is there (to ping). It is not yet configured
+                 passwordless, but if there's a server running then there's a
+                 /root/.my.cnf. Anyway, we simply kill anything that's mysqld.
+
+## Passwordless access for the maintainer scripts
+Another issue is that the scripts need passwordless access. To ensure this,
+a debian-sys-maint user is configured which has process and shutdown privs.
+The file with the randomly (that's important!) generated password must be
+present as long as the databases remain installed, because otherwise a new
+install would have no access. This file should be used like:
+  mysqladmin --defaults-file=/etc/mysql/debian.cnf restart
+to avoid providing the password in plaintext on a command line where it would
+be visible to any user via the "ps" command.
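+
+For orientation, such a defaults file might look roughly like this (an
+illustrative sketch only; the real file is generated by the maintainer
+scripts with a randomly chosen password):
+  [client]
+  host     = localhost
+  user     = debian-sys-maint
+  password = <randomly generated>
+  socket   = /var/run/mysqld/mysqld.sock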
+
+## When to start the daemon?
+We aim to give the admin full control over when MySQL is running.
+Issues to be faced here:
+OLD:
+ 1. Debconf asks whether MySQL should be started on boot so update-rc.d is
+ only run if the answer has been yes. The admin is likely to forget
+ this decision but update-rc.d checks for an existing line in
+ /etc/runlevel.conf and leaves it intact.
+ 2. On initial install, if the answer is yes, the daemon has to be started.
+  3. On upgrades it should only be started if it was already running; everything
+     else is confusing. Especially relying on a debconf decision made months ago
+     is considered suboptimal. See bug #274264
+ Implementation so far:
+ prerm (called on upgrade before stopping the server):
+ check for a running server and set flag if necessary
+ preinst (called on initial install and before unpacking when upgrading):
+ check for the debconf variable and set flag if necessary
+ postinst (called on initial install and after each upgrade after unpacking):
+ call update-rc.d if debconf says yes
+      call invoke-rc.d if the flag has been set
+ Problems remaining:
+ dpkg-reconfigure and setting mysql start on boot to yes did not start mysql
+ (ok "start on boot" literally does not mean "start now" so that might have been ok)
+NEW:
+  1. --- no debconf anymore for the sake of simplicity. We have runlevel.conf;
+     the admin should use it.
+ 2. On initial install the server is started.
+ 3. On upgrades the server is started exactly if it was running before so the
+     runlevel configuration is irrelevant. It will be preserved by means of
+ update-rc.d's builtin check.
+ Implementation:
+ prerm (called on upgrade before stopping the server):
+ check for a running server and set flag if necessary
+ preinst (called on initial install and before unpacking when upgrading):
+     check for $1 being (initial) "install" and set flag
+ postinst (called on initial install and after each upgrade after unpacking):
+ call update-rc.d
+     call invoke-rc.d if the flag has been set
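+
+A minimal sketch of that flag mechanism (illustrative only, not the actual
+maintainer scripts; the flag file name here is made up):
+  # prerm / preinst: remember that the server has to be started again
+  touch /var/lib/mysql/.start_after_upgrade
+  # postinst: always register the init script, start only if flagged
+  update-rc.d mysql defaults >/dev/null
+  if [ -f /var/lib/mysql/.start_after_upgrade ]; then
+    rm /var/lib/mysql/.start_after_upgrade
+    invoke-rc.d mysql start
+  fi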
diff --git a/storage/xtradb/build/debian/additions/Docs__Images__Makefile.in b/storage/xtradb/build/debian/additions/Docs__Images__Makefile.in
new file mode 100644
index 00000000000..f7316d4e345
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/Docs__Images__Makefile.in
@@ -0,0 +1,6 @@
+all:
+
+distclean:
+ -rm -f Makefile
+
+.PHONY: all distclean clean install check
diff --git a/storage/xtradb/build/debian/additions/Docs__Makefile.in b/storage/xtradb/build/debian/additions/Docs__Makefile.in
new file mode 100644
index 00000000000..f7316d4e345
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/Docs__Makefile.in
@@ -0,0 +1,6 @@
+all:
+
+distclean:
+ -rm -f Makefile
+
+.PHONY: all distclean clean install check
diff --git a/storage/xtradb/build/debian/additions/debian-start b/storage/xtradb/build/debian/additions/debian-start
new file mode 100644
index 00000000000..10628019e40
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/debian-start
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# This script is executed by "/etc/init.d/mysql" on every (re)start.
+#
+# Changes to this file will be preserved when updating the Debian package.
+#
+
+source /usr/share/mysql/debian-start.inc.sh
+
+MYSQL="/usr/bin/mysql --defaults-file=/etc/mysql/debian.cnf"
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+MYUPGRADE="/usr/bin/mysql_upgrade --defaults-extra-file=/etc/mysql/debian.cnf"
+MYCHECK="/usr/bin/mysqlcheck --defaults-file=/etc/mysql/debian.cnf"
+MYCHECK_SUBJECT="WARNING: mysqlcheck has found corrupt tables"
+MYCHECK_PARAMS="--all-databases --fast --silent"
+MYCHECK_RCPT="root"
+
+# The following commands should be run when the server is up, but in the
+# background where they do not block the server start, and in one shell
+# instance so that they run sequentially. They are supposed not to echo
+# anything to stdout. If you want to disable the check for crashed tables,
+# comment out "check_for_crashed_tables".
+# (There may be no output to stdout inside the background process!)
+echo "Checking for corrupt, not cleanly closed and upgrade needing tables."
+(
+ upgrade_system_tables_if_necessary;
+ check_root_accounts;
+ check_for_crashed_tables;
+) >&2 &
+
+exit 0
diff --git a/storage/xtradb/build/debian/additions/debian-start.inc.sh b/storage/xtradb/build/debian/additions/debian-start.inc.sh
new file mode 100644
index 00000000000..736cb3448eb
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/debian-start.inc.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# This file is included by /etc/mysql/debian-start
+#
+
+## Check all unclosed tables.
+# - Requires the server to be up.
+# - Is supposed to run silently in background.
+function check_for_crashed_tables() {
+ set -e
+ set -u
+
+ # But do it in the background to not stall the boot process.
+ logger -p daemon.info -i -t$0 "Triggering myisam-recover for all MyISAM tables"
+
+ # Checking for $? is unreliable so the size of the output is checked.
+ # Some table handlers like HEAP do not support CHECK TABLE.
+ tempfile=`tempfile`
+ # We have to use xargs in this case, because a for loop barfs on the
+ # spaces in the thing to be looped over.
+ LC_ALL=C $MYSQL --skip-column-names --batch -e '
+ select concat("select count(*) into @discard from `",
+ TABLE_SCHEMA, "`.`", TABLE_NAME, "`")
+ from information_schema.TABLES where ENGINE="MyISAM"' | \
+ xargs -i $MYSQL --skip-column-names --silent --batch \
+ --force -e "{}" >$tempfile
+ if [ -s $tempfile ]; then
+ (
+ /bin/echo -e "\n" \
+ "Improperly closed tables are also reported if clients are accessing\n" \
+ "the tables *now*. A list of current connections is below.\n";
+ $MYADMIN processlist status
+ ) >> $tempfile
+ # Check for presence as a dependency on mailx would require an MTA.
+ if [ -x /usr/bin/mailx ]; then
+ mailx -e -s"$MYCHECK_SUBJECT" $MYCHECK_RCPT < $tempfile
+ fi
+ (echo "$MYCHECK_SUBJECT"; cat $tempfile) | logger -p daemon.warn -i -t$0
+ fi
+ rm $tempfile
+}
+
+## Check for tables needing an upgrade.
+# - Requires the server to be up.
+# - Is supposed to run silently in background.
+function upgrade_system_tables_if_necessary() {
+ set -e
+ set -u
+
+ logger -p daemon.info -i -t$0 "Upgrading MySQL tables if necessary."
+
+ # Filter all "duplicate column", "duplicate key" and "unknown column"
+ # errors as the script is designed to be idempotent.
+ LC_ALL=C $MYUPGRADE \
+ 2>&1 \
+ | egrep -v '^(1|@had|ERROR (1054|1060|1061))' \
+ | logger -p daemon.warn -i -t$0
+}
+
+## Check for the presence of root accounts both with and without a password.
+# This might have been caused by a bug related to mysql_install_db (#418672).
+function check_root_accounts() {
+ set -e
+ set -u
+
+ logger -p daemon.info -i -t$0 "Checking for insecure root accounts."
+
+ ret=$( echo "SELECT count(*) FROM mysql.user WHERE user='root' and password='';" | $MYSQL --skip-column-names )
+ if [ "$ret" -ne "0" ]; then
+ logger -p daemon.warn -i -t$0 "WARNING: mysql.user contains $ret root accounts without password!"
+ fi
+}
diff --git a/storage/xtradb/build/debian/additions/echo_stderr b/storage/xtradb/build/debian/additions/echo_stderr
new file mode 100644
index 00000000000..67b3ed7cfb3
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/echo_stderr
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "$*" 1>&2
diff --git a/storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm b/storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm
new file mode 100644
index 00000000000..3aaa7acd5b8
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm
@@ -0,0 +1,1089 @@
+use strict;
+use warnings FATAL => 'all';
+
+package InnoDBParser;
+
+# This program is copyright (c) 2006 Baron Schwartz, baron at xaprb dot com.
+# Feedback and improvements are gratefully received.
+#
+# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
+# systems, you can issue `man perlgpl' or `man perlartistic' to read these
+
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+
+our $VERSION = '1.6.0';
+
+use Data::Dumper;
+$Data::Dumper::Sortkeys = 1;
+use English qw(-no_match_vars);
+use List::Util qw(max);
+
+# Some common patterns
+my $d = qr/(\d+)/; # Digit
+my $f = qr/(\d+\.\d+)/; # Float
+my $t = qr/(\d+ \d+)/; # Transaction ID
+my $i = qr/((?:\d{1,3}\.){3}\d+)/; # IP address
+my $n = qr/([^`\s]+)/; # MySQL object name
+my $w = qr/(\w+)/; # Words
+my $fl = qr/([\w\.\/]+) line $d/; # Filename and line number
+my $h = qr/((?:0x)?[0-9a-f]*)/; # Hex
+my $s = qr/(\d{6} .\d:\d\d:\d\d)/; # InnoDB timestamp
+
+# If you update this variable, also update the SYNOPSIS in the pod.
+my %innodb_section_headers = (
+ "TRANSACTIONS" => "tx",
+ "BUFFER POOL AND MEMORY" => "bp",
+ "SEMAPHORES" => "sm",
+ "LOG" => "lg",
+ "ROW OPERATIONS" => "ro",
+ "INSERT BUFFER AND ADAPTIVE HASH INDEX" => "ib",
+ "FILE I/O" => "io",
+ "LATEST DETECTED DEADLOCK" => "dl",
+ "LATEST FOREIGN KEY ERROR" => "fk",
+);
+
+my %parser_for = (
+ tx => \&parse_tx_section,
+ bp => \&parse_bp_section,
+ sm => \&parse_sm_section,
+ lg => \&parse_lg_section,
+ ro => \&parse_ro_section,
+ ib => \&parse_ib_section,
+ io => \&parse_io_section,
+ dl => \&parse_dl_section,
+ fk => \&parse_fk_section,
+);
+
+my %fk_parser_for = (
+ Transaction => \&parse_fk_transaction_error,
+ Error => \&parse_fk_bad_constraint_error,
+ Cannot => \&parse_fk_cant_drop_parent_error,
+);
+
+# A thread's proc_info can be at least 98 different things I've found in the
+# source. Fortunately, most of them begin with a gerunded verb. These are
+# the ones that don't.
+my %is_proc_info = (
+ 'After create' => 1,
+ 'Execution of init_command' => 1,
+ 'FULLTEXT initialization' => 1,
+ 'Reopen tables' => 1,
+ 'Repair done' => 1,
+ 'Repair with keycache' => 1,
+ 'System lock' => 1,
+ 'Table lock' => 1,
+ 'Thread initialized' => 1,
+ 'User lock' => 1,
+ 'copy to tmp table' => 1,
+ 'discard_or_import_tablespace' => 1,
+ 'end' => 1,
+ 'got handler lock' => 1,
+ 'got old table' => 1,
+ 'init' => 1,
+ 'key cache' => 1,
+ 'locks' => 1,
+ 'malloc' => 1,
+ 'query end' => 1,
+ 'rename result table' => 1,
+ 'rename' => 1,
+ 'setup' => 1,
+ 'statistics' => 1,
+ 'status' => 1,
+ 'table cache' => 1,
+ 'update' => 1,
+);
+
+sub new {
+ bless {}, shift;
+}
+
+# Parse the status and return it.
+# See srv_printf_innodb_monitor in innobase/srv/srv0srv.c
+# Pass in the text to parse, whether to be in debugging mode, which sections
+# to parse (hashref; if empty, parse all), and whether to parse full info from
+# locks and such (probably shouldn't unless you need to).
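+#
+# A minimal usage sketch (see also the SYNOPSIS in the pod at the end of this
+# file); the variable names here are only illustrative:
+#   my $parser = InnoDBParser->new;
+#   my $status = $parser->parse_status_text( $text, 0, { tx => 1 }, 0 );
+#   my @txns   = @{ $status->{'sections'}->{'tx'}->{'transactions'} };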
+sub parse_status_text {
+ my ( $self, $fulltext, $debug, $sections, $full ) = @_;
+
+ die "I can't parse undef" unless defined $fulltext;
+ $fulltext =~ s/[\r\n]+/\n/g;
+
+ $sections ||= {};
+ die '$sections must be a hashref' unless ref($sections) eq 'HASH';
+
+ my %innodb_data = (
+ got_all => 0, # Whether I was able to get the whole thing
+ ts => '', # Timestamp the server put on it
+ last_secs => 0, # Num seconds the averages are over
+ sections => {}, # Parsed values from each section
+ );
+
+ if ( $debug ) {
+ $innodb_data{'fulltext'} = $fulltext;
+ }
+
+ # Get the most basic info about the status: beginning and end, and whether
+ # I got the whole thing (if there has been a big deadlock and there are
+ # too many locks to print, the output might be truncated)
+ my ( $time_text ) = $fulltext =~ m/^$s INNODB MONITOR OUTPUT$/m;
+ $innodb_data{'ts'} = [ parse_innodb_timestamp( $time_text ) ];
+ $innodb_data{'timestring'} = ts_to_string($innodb_data{'ts'});
+ ( $innodb_data{'last_secs'} ) = $fulltext
+ =~ m/Per second averages calculated from the last $d seconds/;
+
+ ( my $got_all ) = $fulltext =~ m/END OF INNODB MONITOR OUTPUT/;
+ $innodb_data{'got_all'} = $got_all || 0;
+
+ # Split it into sections. Each section begins with
+ # -----
+ # LABEL
+ # -----
+ my %innodb_sections;
+ my @matches = $fulltext
+ =~ m#\n(---+)\n([A-Z /]+)\n\1\n(.*?)(?=\n(---+)\n[A-Z /]+\n\4\n|$)#gs;
+ while ( my ( $start, $name, $text, $end ) = splice(@matches, 0, 4) ) {
+ $innodb_sections{$name} = [ $text, $end ? 1 : 0 ];
+ }
+ # The Row Operations section is a special case, because instead of ending
+ # with the beginning of another section, it ends with the end of the file.
+ # So this section is complete if the entire file is complete.
+ $innodb_sections{'ROW OPERATIONS'}->[1] ||= $innodb_data{'got_all'};
+
+ # Just for sanity's sake, make sure I understand what to do with each
+ # section
+ eval {
+ foreach my $section ( keys %innodb_sections ) {
+ my $header = $innodb_section_headers{$section};
+ die "Unknown section $section in $fulltext\n"
+ unless $header;
+ $innodb_data{'sections'}->{ $header }
+ ->{'fulltext'} = $innodb_sections{$section}->[0];
+ $innodb_data{'sections'}->{ $header }
+ ->{'complete'} = $innodb_sections{$section}->[1];
+ }
+ };
+ if ( $EVAL_ERROR ) {
+ _debug( $debug, $EVAL_ERROR);
+ }
+
+ # ################################################################
+ # Parse the detailed data out of the sections.
+ # ################################################################
+ eval {
+ foreach my $section ( keys %parser_for ) {
+ if ( defined $innodb_data{'sections'}->{$section}
+ && (!%$sections || (defined($sections->{$section} && $sections->{$section})) )) {
+ $parser_for{$section}->(
+ $innodb_data{'sections'}->{$section},
+ $innodb_data{'sections'}->{$section}->{'complete'},
+ $debug,
+ $full )
+ or delete $innodb_data{'sections'}->{$section};
+ }
+ else {
+ delete $innodb_data{'sections'}->{$section};
+ }
+ }
+ };
+ if ( $EVAL_ERROR ) {
+ _debug( $debug, $EVAL_ERROR);
+ }
+
+ return \%innodb_data;
+}
+
+# Parses the status text and returns it flattened out as a single hash.
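+# For example (illustrative), the transaction section's history_list_len value
+# ends up under the flat key 'IB_tx_history_list_len', and the buffer pool hit
+# rate under 'IB_bp_buf_pool_hit_rate'.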
+sub get_status_hash {
+ my ( $self, $fulltext, $debug, $sections, $full ) = @_;
+
+ # Parse the status text...
+ my $innodb_status
+ = $self->parse_status_text($fulltext, $debug, $sections, $full );
+
+ # Flatten the hierarchical structure into a single list by grabbing desired
+ # sections from it.
+ return
+ (map { 'IB_' . $_ => $innodb_status->{$_} } qw(timestring last_secs got_all)),
+ (map { 'IB_bp_' . $_ => $innodb_status->{'sections'}->{'bp'}->{$_} }
+ qw( writes_pending buf_pool_hit_rate total_mem_alloc buf_pool_reads
+ awe_mem_alloc pages_modified writes_pending_lru page_creates_sec
+ reads_pending pages_total buf_pool_hits writes_pending_single_page
+ page_writes_sec pages_read pages_written page_reads_sec
+ writes_pending_flush_list buf_pool_size add_pool_alloc
+ dict_mem_alloc pages_created buf_free complete )),
+ (map { 'IB_tx_' . $_ => $innodb_status->{'sections'}->{'tx'}->{$_} }
+ qw( num_lock_structs history_list_len purge_done_for transactions
+ purge_undo_for is_truncated trx_id_counter complete )),
+ (map { 'IB_ib_' . $_ => $innodb_status->{'sections'}->{'ib'}->{$_} }
+ qw( hash_table_size hash_searches_s non_hash_searches_s
+ bufs_in_node_heap used_cells size free_list_len seg_size inserts
+ merged_recs merges complete )),
+ (map { 'IB_lg_' . $_ => $innodb_status->{'sections'}->{'lg'}->{$_} }
+ qw( log_ios_done pending_chkp_writes last_chkp log_ios_s
+ log_flushed_to log_seq_no pending_log_writes complete )),
+ (map { 'IB_sm_' . $_ => $innodb_status->{'sections'}->{'sm'}->{$_} }
+ qw( wait_array_size rw_shared_spins rw_excl_os_waits mutex_os_waits
+ mutex_spin_rounds mutex_spin_waits rw_excl_spins rw_shared_os_waits
+ waits signal_count reservation_count complete )),
+ (map { 'IB_ro_' . $_ => $innodb_status->{'sections'}->{'ro'}->{$_} }
+ qw( queries_in_queue n_reserved_extents main_thread_state
+ main_thread_proc_no main_thread_id read_sec del_sec upd_sec ins_sec
+ read_views_open num_rows_upd num_rows_ins num_rows_read
+ queries_inside num_rows_del complete )),
+ (map { 'IB_fk_' . $_ => $innodb_status->{'sections'}->{'fk'}->{$_} }
+ qw( trigger parent_table child_index parent_index attempted_op
+ child_db timestring fk_name records col_name reason txn parent_db
+ type child_table parent_col complete )),
+ (map { 'IB_io_' . $_ => $innodb_status->{'sections'}->{'io'}->{$_} }
+ qw( pending_buffer_pool_flushes pending_pwrites pending_preads
+ pending_normal_aio_reads fsyncs_s os_file_writes pending_sync_ios
+ reads_s flush_type avg_bytes_s pending_ibuf_aio_reads writes_s
+ threads os_file_reads pending_aio_writes pending_log_ios os_fsyncs
+ pending_log_flushes complete )),
+ (map { 'IB_dl_' . $_ => $innodb_status->{'sections'}->{'dl'}->{$_} }
+ qw( timestring rolled_back txns complete ));
+
+}
+
+sub ts_to_string {
+ my $parts = shift;
+ return sprintf('%02d-%02d-%02d %02d:%02d:%02d', @$parts);
+}
+
+sub parse_innodb_timestamp {
+ my $text = shift;
+ my ( $y, $m, $d, $h, $i, $s )
+ = $text =~ m/^(\d\d)(\d\d)(\d\d) +(\d+):(\d+):(\d+)$/;
+ die("Can't get timestamp from $text\n") unless $y;
+ $y += 2000;
+ return ( $y, $m, $d, $h, $i, $s );
+}
+
+sub parse_fk_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ my $fulltext = $section->{'fulltext'};
+
+ return 0 unless $fulltext;
+
+ my ( $ts, $type ) = $fulltext =~ m/^$s\s+(\w+)/m;
+ $section->{'ts'} = [ parse_innodb_timestamp( $ts ) ];
+ $section->{'timestring'} = ts_to_string($section->{'ts'});
+ $section->{'type'} = $type;
+
+ # Decide which type of FK error happened, and dispatch to the right parser.
+ if ( $type && $fk_parser_for{$type} ) {
+ $fk_parser_for{$type}->( $section, $complete, $debug, $fulltext, $full );
+ }
+
+ delete $section->{'fulltext'} unless $debug;
+
+ return 1;
+}
+
+sub parse_fk_cant_drop_parent_error {
+ my ( $section, $complete, $debug, $fulltext, $full ) = @_;
+
+ # Parse the parent/child table info out
+ @{$section}{ qw(attempted_op parent_db parent_table) } = $fulltext
+ =~ m{Cannot $w table `(.*)/(.*)`}m;
+ @{$section}{ qw(child_db child_table) } = $fulltext
+ =~ m{because it is referenced by `(.*)/(.*)`}m;
+
+ ( $section->{'reason'} ) = $fulltext =~ m/(Cannot .*)/s;
+ $section->{'reason'} =~ s/\n(?:InnoDB: )?/ /gm
+ if $section->{'reason'};
+
+ # Certain data may not be present. Make them '' if not present.
+ map { $section->{$_} ||= "" }
+ qw(child_index fk_name col_name parent_col);
+}
+
+# See dict/dict0dict.c, function dict_foreign_error_report
+# I don't care much about these. There are lots of different messages, and
+# they come from someone trying to create a foreign key, or similar
+# statements. They aren't indicative of some transaction trying to insert,
+# delete or update data. Sometimes it is possible to parse out a lot of
+# information about the tables and indexes involved, but often the message
+# contains the DDL string the user entered, which is way too much for this
+# module to try to handle.
+sub parse_fk_bad_constraint_error {
+ my ( $section, $complete, $debug, $fulltext, $full ) = @_;
+
+ # Parse the parent/child table and index info out
+ @{$section}{ qw(child_db child_table) } = $fulltext
+ =~ m{Error in foreign key constraint of table (.*)/(.*):$}m;
+ $section->{'attempted_op'} = 'DDL';
+
+ # FK name, parent info... if possible.
+ @{$section}{ qw(fk_name col_name parent_db parent_table parent_col) }
+ = $fulltext
+ =~ m/CONSTRAINT `?$n`? FOREIGN KEY \(`?$n`?\) REFERENCES (?:`?$n`?\.)?`?$n`? \(`?$n`?\)/;
+
+ if ( !defined($section->{'fk_name'}) ) {
+ # Try to parse SQL a user might have typed in a CREATE statement or such
+ @{$section}{ qw(col_name parent_db parent_table parent_col) }
+ = $fulltext
+ =~ m/FOREIGN\s+KEY\s*\(`?$n`?\)\s+REFERENCES\s+(?:`?$n`?\.)?`?$n`?\s*\(`?$n`?\)/i;
+ }
+ $section->{'parent_db'} ||= $section->{'child_db'};
+
+ # Name of the child index (index in the same table where the FK is, see
+ # definition of dict_foreign_struct in include/dict0mem.h, where it is
+ # called foreign_index, as opposed to referenced_index which is in the
+ # parent table. This may not be possible to find.
+ @{$section}{ qw(child_index) } = $fulltext
+ =~ m/^The index in the foreign key in table is $n$/m;
+
+ @{$section}{ qw(reason) } = $fulltext =~ m/:\s*([^:]+)(?= Constraint:|$)/ms;
+ $section->{'reason'} =~ s/\s+/ /g
+ if $section->{'reason'};
+
+ # Certain data may not be present. Make them '' if not present.
+ map { $section->{$_} ||= "" }
+ qw(child_index fk_name col_name parent_table parent_col);
+}
+
+# see source file row/row0ins.c
+sub parse_fk_transaction_error {
+ my ( $section, $complete, $debug, $fulltext, $full ) = @_;
+
+ # Parse the txn info out
+ my ( $txn ) = $fulltext
+ =~ m/Transaction:\n(TRANSACTION.*)\nForeign key constraint fails/s;
+ if ( $txn ) {
+ $section->{'txn'} = parse_tx_text( $txn, $complete, $debug, $full );
+ }
+
+ # Parse the parent/child table and index info out. There are two types: an
+ # update or a delete of a parent record leaves a child orphaned
+ # (row_ins_foreign_report_err), and an insert or update of a child record has
+ # no matching parent record (row_ins_foreign_report_add_err).
+
+ @{$section}{ qw(reason child_db child_table) }
+ = $fulltext =~ m{^(Foreign key constraint fails for table `(.*)/(.*)`:)$}m;
+
+ @{$section}{ qw(fk_name col_name parent_db parent_table parent_col) }
+ = $fulltext
+ =~ m/CONSTRAINT `$n` FOREIGN KEY \(`$n`\) REFERENCES (?:`$n`\.)?`$n` \(`$n`\)/;
+ $section->{'parent_db'} ||= $section->{'child_db'};
+
+ # Special case, which I don't know how to trigger, but see
+ # innobase/row/row0ins.c row_ins_check_foreign_constraint
+ if ( $fulltext =~ m/ibd file does not currently exist!/ ) {
+ my ( $attempted_op, $index, $records )
+			= $fulltext =~ m/^Trying to (add to index) `$n` tuple:(?:\n(.*))?/sm;
+ $section->{'child_index'} = $index;
+ $section->{'attempted_op'} = $attempted_op || '';
+ if ( $records && $full ) {
+ ( $section->{'records'} )
+ = parse_innodb_record_dump( $records, $complete, $debug );
+ }
+ @{$section}{qw(parent_db parent_table)}
+			= $fulltext =~ m/^But the parent table `$n`\.`$n`$/m;
+ }
+ else {
+ my ( $attempted_op, $which, $index )
+ = $fulltext =~ m/^Trying to ([\w ]*) in (child|parent) table, in index `$n` tuple:$/m;
+ if ( $which ) {
+ $section->{$which . '_index'} = $index;
+ $section->{'attempted_op'} = $attempted_op || '';
+
+ # Parse out the related records in the other table.
+ my ( $search_index, $records );
+ if ( $which eq 'child' ) {
+ ( $search_index, $records ) = $fulltext
+ =~ m/^But in parent table [^,]*, in index `$n`,\nthe closest match we can find is record:\n(.*)/ms;
+ $section->{'parent_index'} = $search_index;
+ }
+ else {
+ ( $search_index, $records ) = $fulltext
+ =~ m/^But in child table [^,]*, in index `$n`, (?:the record is not available|there is a record:\n(.*))?/ms;
+ $section->{'child_index'} = $search_index;
+ }
+ if ( $records && $full ) {
+ $section->{'records'}
+ = parse_innodb_record_dump( $records, $complete, $debug );
+ }
+ else {
+ $section->{'records'} = '';
+ }
+ }
+ }
+
+ # Parse out the tuple trying to be updated, deleted or inserted.
+ my ( $trigger ) = $fulltext =~ m/^(DATA TUPLE: \d+ fields;\n.*)$/m;
+ if ( $trigger ) {
+ $section->{'trigger'} = parse_innodb_record_dump( $trigger, $complete, $debug );
+ }
+
+ # Certain data may not be present. Make them '' if not present.
+ map { $section->{$_} ||= "" }
+ qw(child_index fk_name col_name parent_table parent_col);
+}
+
+# There are new-style and old-style record formats. See rem/rem0rec.c
+# TODO: write some tests for this
+sub parse_innodb_record_dump {
+ my ( $dump, $complete, $debug ) = @_;
+ return undef unless $dump;
+
+ my $result = {};
+
+ if ( $dump =~ m/PHYSICAL RECORD/ ) {
+ my $style = $dump =~ m/compact format/ ? 'new' : 'old';
+ $result->{'style'} = $style;
+
+ # This is a new-style record.
+ if ( $style eq 'new' ) {
+ @{$result}{qw( heap_no type num_fields info_bits )}
+ = $dump
+ =~ m/^(?:Record lock, heap no $d )?([A-Z ]+): n_fields $d; compact format; info bits $d$/m;
+ }
+
+ # OK, it's old-style. Unfortunately there are variations here too.
+ elsif ( $dump =~ m/-byte offs / ) {
+ # Older-old style.
+ @{$result}{qw( heap_no type num_fields byte_offset info_bits )}
+ = $dump
+ =~ m/^(?:Record lock, heap no $d )?([A-Z ]+): n_fields $d; $d-byte offs [A-Z]+; info bits $d$/m;
+ if ( $dump !~ m/-byte offs TRUE/ ) {
+ $result->{'byte_offset'} = 0;
+ }
+ }
+ else {
+ # Newer-old style.
+ @{$result}{qw( heap_no type num_fields byte_offset info_bits )}
+ = $dump
+ =~ m/^(?:Record lock, heap no $d )?([A-Z ]+): n_fields $d; $d-byte offsets; info bits $d$/m;
+ }
+
+ }
+ else {
+ $result->{'style'} = 'tuple';
+ @{$result}{qw( type num_fields )}
+ = $dump =~ m/^(DATA TUPLE): $d fields;$/m;
+ }
+
+ # Fill in default values for things that couldn't be parsed.
+ map { $result->{$_} ||= 0 }
+ qw(heap_no num_fields byte_offset info_bits);
+ map { $result->{$_} ||= '' }
+ qw(style type );
+
+ my @fields = $dump =~ m/ (\d+:.*?;?);(?=$| \d+:)/gm;
+ $result->{'fields'} = [ map { parse_field($_, $complete, $debug ) } @fields ];
+
+ return $result;
+}
+
+# New/old-style applies here. See rem/rem0rec.c
+# $text should not include the leading space or the second trailing semicolon.
+sub parse_field {
+ my ( $text, $complete, $debug ) = @_;
+
+ # Sample fields:
+ # '4: SQL NULL, size 4 '
+ # '1: len 6; hex 000000005601; asc V ;'
+ # '6: SQL NULL'
+ # '5: len 30; hex 687474703a2f2f7777772e737765657477617465722e636f6d2f73746f72; asc http://www.sweetwater.com/stor;...(truncated)'
+ my ( $id, $nullsize, $len, $hex, $asc, $truncated );
+ ( $id, $nullsize ) = $text =~ m/^$d: SQL NULL, size $d $/;
+ if ( !defined($id) ) {
+ ( $id ) = $text =~ m/^$d: SQL NULL$/;
+ }
+ if ( !defined($id) ) {
+ ( $id, $len, $hex, $asc, $truncated )
+ = $text =~ m/^$d: len $d; hex $h; asc (.*);(\.\.\.\(truncated\))?$/;
+ }
+
+ die "Could not parse this field: '$text'" unless defined $id;
+ return {
+ id => $id,
+ len => defined($len) ? $len : defined($nullsize) ? $nullsize : 0,
+ 'hex' => defined($hex) ? $hex : '',
+ asc => defined($asc) ? $asc : '',
+ trunc => $truncated ? 1 : 0,
+ };
+
+}
+
+sub parse_dl_section {
+ my ( $dl, $complete, $debug, $full ) = @_;
+ return unless $dl;
+ my $fulltext = $dl->{'fulltext'};
+ return 0 unless $fulltext;
+
+ my ( $ts ) = $fulltext =~ m/^$s$/m;
+ return 0 unless $ts;
+
+ $dl->{'ts'} = [ parse_innodb_timestamp( $ts ) ];
+ $dl->{'timestring'} = ts_to_string($dl->{'ts'});
+ $dl->{'txns'} = {};
+
+ my @sections
+ = $fulltext
+ =~ m{
+ ^\*{3}\s([^\n]*) # *** (1) WAITING FOR THIS...
+ (.*?) # Followed by anything, non-greedy
+ (?=(?:^\*{3})|\z) # Followed by another three stars or EOF
+ }gmsx;
+
+
+ # Loop through each section. There are no assumptions about how many
+ # there are, who holds and wants what locks, and who gets rolled back.
+ while ( my ($header, $body) = splice(@sections, 0, 2) ) {
+ my ( $txn_id, $what ) = $header =~ m/^\($d\) (.*):$/;
+ next unless $txn_id;
+ $dl->{'txns'}->{$txn_id} ||= {};
+ my $txn = $dl->{'txns'}->{$txn_id};
+
+ if ( $what eq 'TRANSACTION' ) {
+ $txn->{'tx'} = parse_tx_text( $body, $complete, $debug, $full );
+ }
+ else {
+ push @{$txn->{'locks'}}, parse_innodb_record_locks( $body, $complete, $debug, $full );
+ }
+ }
+
+ @{ $dl }{ qw(rolled_back) }
+ = $fulltext =~ m/^\*\*\* WE ROLL BACK TRANSACTION \($d\)$/m;
+
+ # Make sure certain values aren't undef
+ map { $dl->{$_} ||= '' } qw(rolled_back);
+
+ delete $dl->{'fulltext'} unless $debug;
+ return 1;
+}
+
+sub parse_innodb_record_locks {
+ my ( $text, $complete, $debug, $full ) = @_;
+ my @result;
+
+ foreach my $lock ( $text =~ m/(^(?:RECORD|TABLE) LOCKS?.*$)/gm ) {
+ my $hash = {};
+ @{$hash}{ qw(lock_type space_id page_no n_bits index db table txn_id lock_mode) }
+ = $lock
+ =~ m{^(RECORD|TABLE) LOCKS? (?:space id $d page no $d n bits $d index `?$n`? of )?table `$n(?:/|`\.`)$n` trx id $t lock.mode (\S+)}m;
+ ( $hash->{'special'} )
+ = $lock =~ m/^(?:RECORD|TABLE) .*? locks (rec but not gap|gap before rec)/m;
+ $hash->{'insert_intention'}
+ = $lock =~ m/^(?:RECORD|TABLE) .*? insert intention/m ? 1 : 0;
+ $hash->{'waiting'}
+ = $lock =~ m/^(?:RECORD|TABLE) .*? waiting/m ? 1 : 0;
+
+ # Some things may not be in the text, so make sure they are not
+ # undef.
+ map { $hash->{$_} ||= 0 } qw(n_bits page_no space_id);
+ map { $hash->{$_} ||= "" } qw(index special);
+ push @result, $hash;
+ }
+
+ return @result;
+}
+
+sub parse_tx_text {
+ my ( $txn, $complete, $debug, $full ) = @_;
+
+ my ( $txn_id, $txn_status, $active_secs, $proc_no, $os_thread_id )
+ = $txn
+ =~ m/^(?:---)?TRANSACTION $t, (\D*?)(?: $d sec)?, (?:process no $d, )?OS thread id $d/m;
+ my ( $thread_status, $thread_decl_inside )
+ = $txn
+ =~ m/OS thread id \d+(?: ([^,]+?))?(?:, thread declared inside InnoDB $d)?$/m;
+
+ # Parsing the line that begins 'MySQL thread id' is complicated. The only
+ # thing always in the line is the thread and query id. See function
+ # innobase_mysql_print_thd in InnoDB source file sql/ha_innodb.cc.
+ my ( $thread_line ) = $txn =~ m/^(MySQL thread id .*)$/m;
+ my ( $mysql_thread_id, $query_id, $hostname, $ip, $user, $query_status );
+
+ if ( $thread_line ) {
+ # These parts can always be gotten.
+ ( $mysql_thread_id, $query_id ) = $thread_line =~ m/^MySQL thread id $d, query id $d/m;
+
+ # If it's a master/slave thread, "Has (read|sent) all" may be the thread's
+ # proc_info. In these cases, there won't be any host/ip/user info
+ ( $query_status ) = $thread_line =~ m/(Has (?:read|sent) all .*$)/m;
+ if ( defined($query_status) ) {
+ $user = 'system user';
+ }
+
+ # It may be the case that the query id is the last thing in the line.
+ elsif ( $thread_line =~ m/query id \d+ / ) {
+ # The IP address is the only non-word thing left, so it's the most
+ # useful marker for where I have to start guessing.
+ ( $hostname, $ip ) = $thread_line =~ m/query id \d+(?: ([A-Za-z]\S+))? $i/m;
+ if ( defined $ip ) {
+ ( $user, $query_status ) = $thread_line =~ m/$ip $w(?: (.*))?$/;
+ }
+ else { # OK, there wasn't an IP address.
+ # There might not be ANYTHING except the query status.
+ ( $query_status ) = $thread_line =~ m/query id \d+ (.*)$/;
+ if ( $query_status !~ m/^\w+ing/ && !exists($is_proc_info{$query_status}) ) {
+ # The remaining tokens are, in order: hostname, user, query_status.
+ # It's basically impossible to know which is which.
+ ( $hostname, $user, $query_status ) = $thread_line
+ =~ m/query id \d+(?: ([A-Za-z]\S+))?(?: $w(?: (.*))?)?$/m;
+ }
+ else {
+ $user = 'system user';
+ }
+ }
+ }
+ }
+
+ my ( $lock_wait_status, $lock_structs, $heap_size, $row_locks, $undo_log_entries )
+ = $txn
+ =~ m/^(?:(\D*) )?$d lock struct\(s\), heap size $d(?:, $d row lock\(s\))?(?:, undo log entries $d)?$/m;
+ my ( $lock_wait_time )
+ = $txn
+ =~ m/^------- TRX HAS BEEN WAITING $d SEC/m;
+
+ my $locks;
+ # If the transaction has locks, grab the locks.
+ if ( $txn =~ m/^TABLE LOCK|RECORD LOCKS/ ) {
+ $locks = [parse_innodb_record_locks($txn, $complete, $debug, $full)];
+ }
+
+ my ( $tables_in_use, $tables_locked )
+ = $txn
+ =~ m/^mysql tables in use $d, locked $d$/m;
+ my ( $txn_doesnt_see_ge, $txn_sees_lt )
+ = $txn
+ =~ m/^Trx read view will not see trx with id >= $t, sees < $t$/m;
+ my $has_read_view = defined($txn_doesnt_see_ge);
+ # Only a certain number of bytes of the query text are included here, at least
+ # under some circumstances. Some versions include 300, some 600.
+ my ( $query_text )
+ = $txn
+ =~ m{
+ ^MySQL\sthread\sid\s[^\n]+\n # This comes before the query text
+ (.*?) # The query text
+ (?= # Followed by any of...
+ ^Trx\sread\sview
+ |^-------\sTRX\sHAS\sBEEN\sWAITING
+ |^TABLE\sLOCK
+ |^RECORD\sLOCKS\sspace\sid
+ |^(?:---)?TRANSACTION
+ |^\*\*\*\s\(\d\)
+ |\Z
+ )
+ }xms;
+ if ( $query_text ) {
+ $query_text =~ s/\s+$//;
+ }
+ else {
+ $query_text = '';
+ }
+
+ my %stuff = (
+ active_secs => $active_secs,
+ has_read_view => $has_read_view,
+ heap_size => $heap_size,
+ hostname => $hostname,
+ ip => $ip,
+ lock_structs => $lock_structs,
+ lock_wait_status => $lock_wait_status,
+ lock_wait_time => $lock_wait_time,
+ mysql_thread_id => $mysql_thread_id,
+ os_thread_id => $os_thread_id,
+ proc_no => $proc_no,
+ query_id => $query_id,
+ query_status => $query_status,
+ query_text => $query_text,
+ row_locks => $row_locks,
+ tables_in_use => $tables_in_use,
+ tables_locked => $tables_locked,
+ thread_decl_inside => $thread_decl_inside,
+ thread_status => $thread_status,
+ txn_doesnt_see_ge => $txn_doesnt_see_ge,
+ txn_id => $txn_id,
+ txn_sees_lt => $txn_sees_lt,
+ txn_status => $txn_status,
+ undo_log_entries => $undo_log_entries,
+ user => $user,
+ );
+ $stuff{'fulltext'} = $txn if $debug;
+ $stuff{'locks'} = $locks if $locks;
+
+ # Some things may not be in the txn text, so make sure they are not
+ # undef.
+ map { $stuff{$_} ||= 0 } qw(active_secs heap_size lock_structs
+ tables_in_use undo_log_entries tables_locked has_read_view
+ thread_decl_inside lock_wait_time proc_no row_locks);
+ map { $stuff{$_} ||= "" } qw(thread_status txn_doesnt_see_ge
+ txn_sees_lt query_status ip query_text lock_wait_status user);
+ $stuff{'hostname'} ||= $stuff{'ip'};
+
+ return \%stuff;
+}
+
+sub parse_tx_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return unless $section && $section->{'fulltext'};
+ my $fulltext = $section->{'fulltext'};
+ $section->{'transactions'} = [];
+
+ # Handle the individual transactions
+ my @transactions = $fulltext =~ m/(---TRANSACTION \d.*?)(?=\n---TRANSACTION|$)/gs;
+ foreach my $txn ( @transactions ) {
+ my $stuff = parse_tx_text( $txn, $complete, $debug, $full );
+ delete $stuff->{'fulltext'} unless $debug;
+ push @{$section->{'transactions'}}, $stuff;
+ }
+
+ # Handle the general info
+ @{$section}{ 'trx_id_counter' }
+ = $fulltext =~ m/^Trx id counter $t$/m;
+ @{$section}{ 'purge_done_for', 'purge_undo_for' }
+ = $fulltext =~ m/^Purge done for trx's n:o < $t undo n:o < $t$/m;
+ @{$section}{ 'history_list_len' } # This isn't present in some 4.x versions
+ = $fulltext =~ m/^History list length $d$/m;
+ @{$section}{ 'num_lock_structs' }
+ = $fulltext =~ m/^Total number of lock structs in row lock hash table $d$/m;
+ @{$section}{ 'is_truncated' }
+ = $fulltext =~ m/^\.\.\. truncated\.\.\.$/m ? 1 : 0;
+
+ # Fill in things that might not be present
+ foreach ( qw(history_list_len) ) {
+ $section->{$_} ||= 0;
+ }
+
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+# I've read the source for this section.
+sub parse_ro_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return unless $section && $section->{'fulltext'};
+ my $fulltext = $section->{'fulltext'};
+
+ # Grab the info
+ @{$section}{ 'queries_inside', 'queries_in_queue' }
+ = $fulltext =~ m/^$d queries inside InnoDB, $d queries in queue$/m;
+ ( $section->{ 'read_views_open' } )
+ = $fulltext =~ m/^$d read views open inside InnoDB$/m;
+ ( $section->{ 'n_reserved_extents' } )
+ = $fulltext =~ m/^$d tablespace extents now reserved for B-tree/m;
+ @{$section}{ 'main_thread_proc_no', 'main_thread_id', 'main_thread_state' }
+ = $fulltext =~ m/^Main thread (?:process no. $d, )?id $d, state: (.*)$/m;
+ @{$section}{ 'num_rows_ins', 'num_rows_upd', 'num_rows_del', 'num_rows_read' }
+ = $fulltext =~ m/^Number of rows inserted $d, updated $d, deleted $d, read $d$/m;
+ @{$section}{ 'ins_sec', 'upd_sec', 'del_sec', 'read_sec' }
+ = $fulltext =~ m#^$f inserts/s, $f updates/s, $f deletes/s, $f reads/s$#m;
+ $section->{'main_thread_proc_no'} ||= 0;
+
+ map { $section->{$_} ||= 0 } qw(read_views_open n_reserved_extents);
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+sub parse_lg_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return unless $section;
+ my $fulltext = $section->{'fulltext'};
+
+ # Grab the info
+ ( $section->{ 'log_seq_no' } )
+ = $fulltext =~ m/Log sequence number \s*(\d.*)$/m;
+ ( $section->{ 'log_flushed_to' } )
+ = $fulltext =~ m/Log flushed up to \s*(\d.*)$/m;
+ ( $section->{ 'last_chkp' } )
+ = $fulltext =~ m/Last checkpoint at \s*(\d.*)$/m;
+ @{$section}{ 'pending_log_writes', 'pending_chkp_writes' }
+ = $fulltext =~ m/$d pending log writes, $d pending chkp writes/;
+ @{$section}{ 'log_ios_done', 'log_ios_s' }
+ = $fulltext =~ m#$d log i/o's done, $f log i/o's/second#;
+
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+sub parse_ib_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return unless $section && $section->{'fulltext'};
+ my $fulltext = $section->{'fulltext'};
+
+ # Some servers will output ibuf information for tablespace 0, as though there
+ # might be many tablespaces with insert buffers. (In practice I believe
+ # the source code shows there will only ever be one). I have to parse both
+ # cases here, but I assume there will only be one.
+ @{$section}{ 'size', 'free_list_len', 'seg_size' }
+ = $fulltext =~ m/^Ibuf(?: for space 0)?: size $d, free list len $d, seg size $d,$/m;
+ @{$section}{ 'inserts', 'merged_recs', 'merges' }
+ = $fulltext =~ m/^$d inserts, $d merged recs, $d merges$/m;
+
+ @{$section}{ 'hash_table_size', 'used_cells', 'bufs_in_node_heap' }
+ = $fulltext =~ m/^Hash table size $d, used cells $d, node heap has $d buffer\(s\)$/m;
+ @{$section}{ 'hash_searches_s', 'non_hash_searches_s' }
+ = $fulltext =~ m{^$f hash searches/s, $f non-hash searches/s$}m;
+
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+sub parse_wait_array {
+ my ( $text, $complete, $debug, $full ) = @_;
+ my %result;
+
+ @result{ qw(thread waited_at_filename waited_at_line waited_secs) }
+ = $text =~ m/^--Thread $d has waited at $fl for $f seconds/m;
+
+	# Depending on whether it's a SYNC_MUTEX, RW_LOCK_EX, or RW_LOCK_SHARED,
+	# there will be different text output
+ if ( $text =~ m/^Mutex at/m ) {
+ $result{'request_type'} = 'M';
+ @result{ qw( lock_mem_addr lock_cfile_name lock_cline lock_var) }
+ = $text =~ m/^Mutex at $h created file $fl, lock var $d$/m;
+ @result{ qw( waiters_flag )}
+ = $text =~ m/^waiters flag $d$/m;
+ }
+ else {
+ @result{ qw( request_type lock_mem_addr lock_cfile_name lock_cline) }
+ = $text =~ m/^(.)-lock on RW-latch at $h created in file $fl$/m;
+ @result{ qw( writer_thread writer_lock_mode ) }
+ = $text =~ m/^a writer \(thread id $d\) has reserved it in mode (.*)$/m;
+ @result{ qw( num_readers waiters_flag )}
+ = $text =~ m/^number of readers $d, waiters flag $d$/m;
+ @result{ qw(last_s_file_name last_s_line ) }
+ = $text =~ m/Last time read locked in file $fl$/m;
+ @result{ qw(last_x_file_name last_x_line ) }
+ = $text =~ m/Last time write locked in file $fl$/m;
+ }
+
+ $result{'cell_waiting'} = $text =~ m/^wait has ended$/m ? 0 : 1;
+ $result{'cell_event_set'} = $text =~ m/^wait is ending$/m ? 1 : 0;
+
+ # Because there are two code paths, some things won't get set.
+ map { $result{$_} ||= '' }
+ qw(last_s_file_name last_x_file_name writer_lock_mode);
+ map { $result{$_} ||= 0 }
+ qw(num_readers lock_var last_s_line last_x_line writer_thread);
+
+ return \%result;
+}
+
+sub parse_sm_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return 0 unless $section && $section->{'fulltext'};
+ my $fulltext = $section->{'fulltext'};
+
+ # Grab the info
+ @{$section}{ 'reservation_count', 'signal_count' }
+ = $fulltext =~ m/^OS WAIT ARRAY INFO: reservation count $d, signal count $d$/m;
+ @{$section}{ 'mutex_spin_waits', 'mutex_spin_rounds', 'mutex_os_waits' }
+ = $fulltext =~ m/^Mutex spin waits $d, rounds $d, OS waits $d$/m;
+ @{$section}{ 'rw_shared_spins', 'rw_shared_os_waits', 'rw_excl_spins', 'rw_excl_os_waits' }
+ = $fulltext =~ m/^RW-shared spins $d, OS waits $d; RW-excl spins $d, OS waits $d$/m;
+
+ # Look for info on waits.
+ my @waits = $fulltext =~ m/^(--Thread.*?)^(?=Mutex spin|--Thread)/gms;
+ $section->{'waits'} = [ map { parse_wait_array($_, $complete, $debug) } @waits ];
+ $section->{'wait_array_size'} = scalar(@waits);
+
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+# I've read the source for this section.
+sub parse_bp_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return unless $section && $section->{'fulltext'};
+ my $fulltext = $section->{'fulltext'};
+
+ # Grab the info
+ @{$section}{ 'total_mem_alloc', 'add_pool_alloc' }
+ = $fulltext =~ m/^Total memory allocated $d; in additional pool allocated $d$/m;
+ @{$section}{'dict_mem_alloc'} = $fulltext =~ m/Dictionary memory allocated $d/;
+ @{$section}{'awe_mem_alloc'} = $fulltext =~ m/$d MB of AWE memory/;
+ @{$section}{'buf_pool_size'} = $fulltext =~ m/^Buffer pool size\s*$d$/m;
+ @{$section}{'buf_free'} = $fulltext =~ m/^Free buffers\s*$d$/m;
+ @{$section}{'pages_total'} = $fulltext =~ m/^Database pages\s*$d$/m;
+ @{$section}{'pages_modified'} = $fulltext =~ m/^Modified db pages\s*$d$/m;
+ @{$section}{'pages_read', 'pages_created', 'pages_written'}
+ = $fulltext =~ m/^Pages read $d, created $d, written $d$/m;
+ @{$section}{'page_reads_sec', 'page_creates_sec', 'page_writes_sec'}
+ = $fulltext =~ m{^$f reads/s, $f creates/s, $f writes/s$}m;
+ @{$section}{'buf_pool_hits', 'buf_pool_reads'}
+ = $fulltext =~ m{Buffer pool hit rate $d / $d$}m;
+ if ($fulltext =~ m/^No buffer pool page gets since the last printout$/m) {
+ @{$section}{'buf_pool_hits', 'buf_pool_reads'} = (0, 0);
+ @{$section}{'buf_pool_hit_rate'} = '--';
+ }
+ else {
+ @{$section}{'buf_pool_hit_rate'}
+ = $fulltext =~ m{Buffer pool hit rate (\d+ / \d+)$}m;
+ }
+ @{$section}{'reads_pending'} = $fulltext =~ m/^Pending reads $d/m;
+ @{$section}{'writes_pending_lru', 'writes_pending_flush_list', 'writes_pending_single_page' }
+ = $fulltext =~ m/^Pending writes: LRU $d, flush list $d, single page $d$/m;
+
+ map { $section->{$_} ||= 0 }
+ qw(writes_pending_lru writes_pending_flush_list writes_pending_single_page
+ awe_mem_alloc dict_mem_alloc);
+ @{$section}{'writes_pending'} = List::Util::sum(
+ @{$section}{ qw(writes_pending_lru writes_pending_flush_list writes_pending_single_page) });
+
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+# I've read the source for this.
+sub parse_io_section {
+ my ( $section, $complete, $debug, $full ) = @_;
+ return unless $section && $section->{'fulltext'};
+ my $fulltext = $section->{'fulltext'};
+ $section->{'threads'} = {};
+
+ # Grab the I/O thread info
+ my @threads = $fulltext =~ m<^(I/O thread \d+ .*)$>gm;
+ foreach my $thread (@threads) {
+ my ( $tid, $state, $purpose, $event_set )
+ = $thread =~ m{I/O thread $d state: (.+?) \((.*)\)(?: ev set)?$}m;
+ if ( defined $tid ) {
+ $section->{'threads'}->{$tid} = {
+ thread => $tid,
+ state => $state,
+ purpose => $purpose,
+ event_set => $event_set ? 1 : 0,
+ };
+ }
+ }
+
+ # Grab the reads/writes/flushes info
+ @{$section}{ 'pending_normal_aio_reads', 'pending_aio_writes' }
+ = $fulltext =~ m/^Pending normal aio reads: $d, aio writes: $d,$/m;
+ @{$section}{ 'pending_ibuf_aio_reads', 'pending_log_ios', 'pending_sync_ios' }
+ = $fulltext =~ m{^ ibuf aio reads: $d, log i/o's: $d, sync i/o's: $d$}m;
+ @{$section}{ 'flush_type', 'pending_log_flushes', 'pending_buffer_pool_flushes' }
+ = $fulltext =~ m/^Pending flushes \($w\) log: $d; buffer pool: $d$/m;
+ @{$section}{ 'os_file_reads', 'os_file_writes', 'os_fsyncs' }
+ = $fulltext =~ m/^$d OS file reads, $d OS file writes, $d OS fsyncs$/m;
+ @{$section}{ 'reads_s', 'avg_bytes_s', 'writes_s', 'fsyncs_s' }
+ = $fulltext =~ m{^$f reads/s, $d avg bytes/read, $f writes/s, $f fsyncs/s$}m;
+ @{$section}{ 'pending_preads', 'pending_pwrites' }
+ = $fulltext =~ m/$d pending preads, $d pending pwrites$/m;
+ @{$section}{ 'pending_preads', 'pending_pwrites' } = (0, 0)
+ unless defined($section->{'pending_preads'});
+
+ delete $section->{'fulltext'} unless $debug;
+ return 1;
+}
+
+sub _debug {
+ my ( $debug, $msg ) = @_;
+ if ( $debug ) {
+ die $msg;
+ }
+ else {
+ warn $msg;
+ }
+ return 1;
+}
+
+1;
+
+# end_of_package
+# ############################################################################
+# Perldoc section. I put this last as per the Dog book.
+# ############################################################################
+=pod
+
+=head1 NAME
+
+InnoDBParser - Parse InnoDB monitor text.
+
+=head1 DESCRIPTION
+
+InnoDBParser tries to parse the output of the InnoDB monitor. One way to get
+this output is to connect to a MySQL server and issue the command SHOW ENGINE
+INNODB STATUS (omit 'ENGINE' on earlier versions of MySQL). The goal is to
+turn text into data that something else (e.g. innotop) can use.
+
+The output comes from all over, but the place to start in the source is
+innobase/srv/srv0srv.c.
+
+=head1 SYNOPSIS
+
+ use InnoDBParser;
+ use DBI;
+
+ # Get the status text.
+ my $dbh = DBI->connect(
+     "DBI:mysql:test;host=localhost",
+ 'user',
+ 'password'
+ );
+ my $query = 'SHOW /*!5 ENGINE */ INNODB STATUS';
+ my $text = $dbh->selectcol_arrayref($query)->[0];
+
+ # 1 or 0
+ my $debug = 1;
+
+ # Choose sections of the monitor text you want. Possible values:
+ # TRANSACTIONS => tx
+ # BUFFER POOL AND MEMORY => bp
+ # SEMAPHORES => sm
+ # LOG => lg
+ # ROW OPERATIONS => ro
+ # INSERT BUFFER AND ADAPTIVE HASH INDEX => ib
+ # FILE I/O => io
+ # LATEST DETECTED DEADLOCK => dl
+ # LATEST FOREIGN KEY ERROR => fk
+
+ my $required_sections = {
+ tx => 1,
+ };
+
+ # Parse the status text.
+ my $parser = InnoDBParser->new;
+ $innodb_status = $parser->parse_status_text(
+ $text,
+ $debug,
+ # Omit the following parameter to get all sections.
+ $required_sections,
+ );
+
+=head1 COPYRIGHT, LICENSE AND WARRANTY
+
+This package is copyright (c) 2006 Baron Schwartz, baron at xaprb dot com.
+Feedback and improvements are gratefully received.
+
+THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+=head1 AUTHOR
+
+Baron Schwartz, baron at xaprb dot com.
+
+=head1 BUGS
+
+None known, but I bet there are some. The InnoDB monitor text wasn't really
+designed to be parsable.
+
+=head1 SEE ALSO
+
+innotop - a program that can format the parsed status information for humans
+to read and enjoy.
+
+=cut
diff --git a/storage/xtradb/build/debian/additions/innotop/changelog.innotop b/storage/xtradb/build/debian/additions/innotop/changelog.innotop
new file mode 100644
index 00000000000..baff706e235
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/changelog.innotop
@@ -0,0 +1,318 @@
+Changelog for innotop and InnoDBParser:
+
+2007-11-09: version 1.6.0
+
+ * S mode crashed on non-numeric values.
+ * New user-defined columns crashed upon restart.
+ * Added --color option to control terminal coloring.
+
+2007-09-18: version 1.5.2
+
+ * Added the ability to monitor InnoDB status from a file.
+ * Changed W mode to L mode; it monitors all locks, not just lock waits.
+
+2007-09-16: version 1.5.1
+
+ * Added C (Command Summary) mode.
+ * Fixed a bug in the 'avg' aggregate function.
+
+2007-09-10: version 1.5.0
+
+ Changes:
+ * Added plugin functionality.
+ * Added group-by functionality.
+ * Moved the configuration file to a directory.
+ * Enhanced filtering and sorting on pivoted tables.
+ * Many small bug fixes.
+
+2007-07-16: version 1.4.3
+
+ Changes:
+ * Added standard --version command-line option
+ * Changed colors to cyan instead of blue; more visible on dark terminals.
+ * Added information to the filter-choosing dialog.
+ * Added column auto-completion when entering a filter expression.
+ * Changed Term::ReadKey from optional to mandatory.
+ * Clarified username in password prompting.
+ * Ten thousand words of documentation!
+
+ Bugs fixed:
+ * innotop crashed in W mode when InnoDB status data was truncated.
+ * innotop didn't display errors in tables if debug was enabled.
+ * The colored() subroutine wasn't being created in non-interactive mode.
+ * Don't prompt to save password except the first time.
+
+2007-05-03: version 1.4.2
+
+ This version contains all changes to the trunk until revision 239; some
+ changes in revisions 240:250 are included.
+
+ MAJOR CHANGES:
+
+ * Quick-filters to easily filter any column in any display
+ * Compatibility with MySQL 3.23 through 6.0
+ * Improved error handling when a server is down, permissions denied, etc
+ * Use additional SHOW INNODB STATUS information in 5.1.x
+ * Make all modes use tables consistently, so they can all be edited,
+ filtered, colored and sorted consistently
+ * Combine V, G and S modes into S mode, with v, g, and s hot-keys
+ * Let DBD driver read MySQL option files; permit connections without
+ user/pass/etc
+ * Compile SQL-like expressions into Perl subroutines; eliminate need to
+ know Perl
+ * Do not save all config data to config file, only save user's customizations
+ * Rewritten and improved command-line option handling
+ * Added --count, --delay, and other command-line options to support
+ run-and-exit operation
+ * Improve built-in variable sets
+ * Improve help screen with three-part balanced-column layout
+ * Simplify table-editor and improve hotkey support
+ * Require Perl to have high-resolution time support (Time::HiRes)
+ * Help the user choose a query to analyze or kill
+ * Enable EXPLAIN, show-full-query in T mode just like Q mode
+ * Let data-extraction access current, previous and incremental data sets
+ all at once
+
+ MINOR CHANGES:
+
+ * Column stabilizing for Q mode
+ * New color rules for T, Q, W modes
+ * Apply slave I/O filter to Q mode
+ * Improve detection of server version and other meta-data
+ * Make connection timeout a config variable
+ * Improve cross-version-compatible SQL syntax
+ * Get some information from the DBD driver instead of asking MySQL for it
+ * Improved error messages
+ * Improve server group creation/editing
+ * Improve connection/thread killing
+ * Fix broken key bindings and restore previously mapped hot-keys for
+ choosing columns
+ * Some documentation updates (but not nearly enough)
+ * Allow the user to specify graphing char in S mode (formerly G mode)
+ * Allow easy switching between variable sets in S mode
+ * Bind 'n' key globally to choose the 'next' server connection
+ * Bind '%' key globally to filter displayed tables
+ * Allow aligning columns on the decimal place for easy readability
+ * Add hide_hdr config variable to hide column headers in tables
+ * Add a feature to smartly run PURGE MASTER LOGS in Replication mode
+ * Enable debug mode as a globally configurable variable
+ * Improve error messages when an expression or filter doesn't compile or has
+ a run-time error; die on error when debug is enabled
+ * Allow user-configurable delays after executing SQL (to let the server
+ settle down before taking another measurement)
+ * Add an expression to show how long until a transaction is finished
+ * Add skip_innodb as a global config variable
+ * Add '%' after percentages to help disambiguate (user-configurable)
+ * Add column to M mode to help see how fast slave is catching up to master
+
+ BUG FIXES:
+
+ * T and W modes had wrong value for wait_status column
+ * Error tracking on connections didn't reset when the connection recovered
+ * wait_timeout on connections couldn't be set before MySQL 4.0.3
+ * There was a crash on 3.23 when wiping deadlocks
+ * Lettercase changes in some result sets (SHOW MASTER/SLAVE STATUS) between
+ MySQL versions crashed innotop
+ * Inactive connections crashed innotop upon access to DBD driver
+ * set_precision did not respect user defaults for number of digits
+ * --inc command-line option could not be negated
+ * InnoDB status parsing was not always parsing all needed information
+ * S mode (formerly G mode) could crash trying to divide non-numeric data
+ * M table didn't show Slave_open_temp_tables variable; incorrect lettercase
+ * DBD drivers with broken AutoCommit would crash innotop
+ * Some key bindings had incorrect labels
+ * Some config-file loading routines could load data for things that didn't
+ exist
+ * Headers printed too often in S mode
+ * High-resolution time was not used even when the user had it
+ * Non-interactive mode printed blank lines sometimes
+ * Q-mode header and statusbar showed different QPS numbers
+ * Formulas for key-cache and query-cache hit ratios were wrong
+ * Mac OS "Darwin" machines were mis-identified as Microsoft Windows
+ * Some multiplications crashed when given undefined input
+ * The commify transformation did not check its input and could crash
+ * Specifying an invalid mode on the command line or config file could crash
+ innotop
+
+2007-03-29: version 1.4.1
+
+ * More tweaks to display of connection errors.
+ * Fixed a problem with skip-innodb in MySQL 5.1.
+ * Fix a bug with dead connections in single-connection mode.
+ * Fix a regex to allow parsing more data from truncated deadlocks.
+ * Don't load active cxns from the config file if the cxn isn't defined.
+
+2007-03-03: version 1.4.0
+
+ * Further tweak error handling and display of connection errors
+ * More centralization of querying
+ * Fix forking so it doesn't kill all database connections
+ * Allow user to run innotop without permissions for GLOBAL variables and status
+
+2007-02-11: version 1.3.6
+
+ * Handle some connection failures so innotop doesn't crash because of one server.
+ * Enable incremental display in more modes.
+ * Tweaks to colorizing, color editor, and default color rules.
+ * Tweaks to default sorting rules.
+ * Use prepared statements for efficiency.
+ * Bug fixes and code cleanups.
+ * Data storage is keyed on clock ticks now.
+
+2007-02-03: version 1.3.5
+
+ * Bug fixes.
+ * More tools for editing configuration from within innotop.
+ * Filters and transformations are constrained to valid values.
+ * Support for colorizing rows.
+ * Sorting by multiple columns.
+ * Compress headers when display is very wide.
+ * Stabilize and limit column widths.
+ * Check config file formats when upgrading so upgrades go smoothly.
+ * Make D mode handle many connections at once.
+ * Extract simple expressions from data sets in column src property.
+ This makes innotop more awk-ish.
+
+2007-01-16: version 1.3
+
+ * Readline support.
+ * Can be used unattended, or in a pipe-and-filter mode
+ where it outputs tab-separated data to standard output.
+ * You can specify a config file on the command line.
+ Config files can be marked read-only.
+ * Monitor multiple servers simultaneously.
+ * Server groups to help manage many servers conveniently.
+ * Monitor master/slave status, and control slaves.
+ * Columns can have user-defined expressions as their data sources.
+ * Better configuration tools.
+ * InnoDB status information is merged into SHOW VARIABLES and
+ SHOW STATUS information, so you can access it all together.
+ * High-precision time support in more places.
+ * Lots of tweaks to make things display more readably and compactly.
+ * Column transformations and filters.
+
+2007-01-16: version 1.0.1
+ * NOTE: innotop is now hosted at Sourceforge, in Subversion not CVS.
+ The new project homepage is http://sourceforge.net/projects/innotop/
+ * Tweak default T/Q mode sort columns to match what people expect.
+ * Fix broken InnoDBParser.pm documentation (and hence man page).
+
+2007-01-06: version 1.0
+ * NOTE: innotop is now hosted at Sourceforge, in Subversion not CVS.
+ The new project homepage is http://sourceforge.net/projects/innotop/
+ * Prevent control characters from freaking terminal out.
+ * Set timeout to keep busy servers from closing connection.
+ * There is only one InnoDB insert buffer.
+ * Make licenses clear and consistent.
+
+2006-11-14: innotop 0.1.160, InnoDBParser version 1.69
+ * Support for ANSI color on Microsoft Windows (more readable, compact
+ display; thanks Gisbert W. Selke).
+ * Better handling of $ENV{HOME} on Windows.
+ * Added a LICENSE file to the package as per Gentoo bug:
+ http://bugs.gentoo.org/show_bug.cgi?id=147600
+
+2006-11-11: innotop 0.1.157, InnoDBParser version 1.69
+ * Add Microsoft Windows support.
+
+2006-10-19: innotop 0.1.154, InnoDBParser version 1.69
+ * Add O (Open Tables) mode
+ * Add some more checks to handle incomplete InnoDB status information
+
+2006-09-30: innotop 0.1.152, InnoDBParser version 1.69
+ * Figured out what was wrong with package $VERSION variable: it wasn't
+ after the package declaration!
+
+2006-09-28: innotop 0.1.152, InnoDBParser version 1.67
+ * Make more efforts towards crash-resistance and tolerance of completely
+ messed-up inputs. If innotop itself is broken, it is now much harder to
+ tell, because it just keeps on running without complaining.
+ * Fix a small bug parsing out some information and displaying it.
+
+2006-09-05: innotop 0.1.149, InnoDBParser version 1.64
+ * Try to find and eliminate any parsing code that assumes pattern matches
+ will succeed.
+
+2006-09-05: innotop 0.1.149, InnoDBParser version 1.62
+ * Make innotop crash-resistant, so I can declare it STABLE finally.
+ * Instead of using SQL conditional comments, detect MySQL version.
+
+2006-08-22: innotop 0.1.147, InnoDBParser version 1.60
+ * Fix some innotop bugs with undefined values, bad formatting etc.
+
+2006-08-19: innotop 0.1.146, InnoDBParser version 1.60
+ * Make innotop handle some unexpected NULL values in Q mode.
+ * Add OS wait information to W mode, so it is now "everything that waits."
+ * Center section captions better.
+ * Make R mode more readable and compact.
+ * Make InnoDBParser parse lock waits even when they've been waiting 0 secs.
+
+2006-08-12: innotop 0.1.139, InnoDBParser version 1.59
+ * Add more documentation
+ * Tweak V mode to show more info in less space.
+ * Fix a bug in G mode.
+
+2006-08-10: innotop 0.1.132, InnoDBParser version 1.58
+ * Handle yet more types of FK error... it will never end!
+ * Handle some special cases when DEADLOCK info truncated
+ * Add a bit more FK info to F mode in innotop
+ * More tests added to the test suite
+
+2006-08-07: innotop 0.1.131, InnoDBParser version 1.55
+ * Fix another issue with configuration
+ * Handle another type of FK error
+
+2006-08-03: innotop 0.1.130, InnoDBParser version 1.54
+ * Fix an issue loading config file
+ * Add heap_no to 'D' (InnoDB Deadlock) mode to ease deadlock debugging.
+
+2006-08-02: innotop 0.1.128, InnoDBParser version 1.54
+ * Parse lock wait information from the TRANSACTION section.
+ * Even more OS-specific parsing... pain in the butt...
+ * Add 'W' (InnoDB Lock Wait) mode.
+ * Fix some minor display issues with statusbar.
+
+2006-08-02: innotop 0.1.125, InnoDBParser version 1.50
+ * Don't try to get references to Perl built-in functions like time()
+ * Handle more OS-specific variations of InnoDB status text
+ * Add some more information to various places in innotop
+
+2006-08-01: innotop 0.1.123, InnoDBParser version 1.47
+
+ * Enhance S and G modes: clear screen and re-print headers
+ * Don't crash when deadlock data is truncated
+ * Make Analyze mode say how to get back to whatever you came from
+ * Display 'nothing to display' when there is nothing
+ * Add ability to read InnoDB status text from a file (mostly helps test)
+ * Add table of Wait Array Information in Row Op/Semaphore mode
+ * Add table of lock information in InnoDB deadlock mode
+ * Ensure new features in upgrades don't get masked by existing config files
+ * Tweak default column choices for T mode
+ * Enhance foreign key parsing
+ * Enhance physical record and data tuple parsing
+ * Enhance lock parsing (handle old-style and new-style formats)
+
+2006-07-24: innotop 0.1.112, InnoDBParser version 1.36
+
+ * InnoDBParser enhancements for FK error messages.
+ * A fix to innotop to prevent it from crashing while trying to display a FK
+ error message.
+ * Some minor cosmetic changes to number formatting in innotop.
+
+2006-07-22: innotop 0.1.106, InnoDBParser version 1.35
+
+ * InnoDBParser is much more complete and accurate.
+ * Tons of bug fixes.
+ * Add partitions to EXPLAIN mode.
+ * Enhance Q mode header, add T mode header.
+ * Share some configuration variables across modes.
+ * Add formatted time columns to Q, T modes.
+ * Add command-line argument parsing.
+ * Turn off echo when asking for password.
+ * Add option to specify port when connecting.
+ * Let display-optimized-query display multiple notes.
+ * Lots of small improvements, such as showing more info in statusbar.
+
+2006-07-02: innotop 0.1.74, InnoDBParser version 1.24
+
+ * Initial release for public consumption.
diff --git a/storage/xtradb/build/debian/additions/innotop/innotop b/storage/xtradb/build/debian/additions/innotop/innotop
new file mode 100644
index 00000000000..e2bfc1bd965
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/innotop
@@ -0,0 +1,9485 @@
+#!/usr/bin/perl
+
+# vim: tw=160:nowrap:expandtab:tabstop=3:shiftwidth=3:softtabstop=3
+
+use strict;
+use warnings FATAL => 'all';
+use sigtrap qw(handler finish untrapped normal-signals);
+
+use Data::Dumper;
+use DBI;
+use English qw(-no_match_vars);
+use File::Basename qw(dirname);
+use Getopt::Long;
+use List::Util qw(max min maxstr sum);
+use InnoDBParser;
+use POSIX qw(ceil);
+use Time::HiRes qw(time sleep);
+use Term::ReadKey qw(ReadMode ReadKey);
+
+# Version, license and warranty information. {{{1
+# ###########################################################################
+our $VERSION = '1.6.0';
+our $SVN_REV = sprintf("%d", q$Revision: 383 $ =~ m/(\d+)/g);
+our $SVN_URL = sprintf("%s", q$URL: https://innotop.svn.sourceforge.net/svnroot/innotop/trunk/innotop $ =~ m$svnroot/innotop/(\S+)$g);
+
+my $innotop_license = <<"LICENSE";
+
+This is innotop version $VERSION, a MySQL and InnoDB monitor.
+
+This program is copyright (c) 2006 Baron Schwartz.
+Feedback and improvements are welcome.
+
+THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA.
+LICENSE
+
+# Configuration information and global setup {{{1
+# ###########################################################################
+
+# Really, really, super-global variables.
+my @config_versions = (
+ "000-000-000", "001-003-000", # config file was one big name-value hash.
+ "001-003-000", "001-004-002", # config file contained non-user-defined stuff.
+);
+
+my $clear_screen_sub;
+
+# This defines expected properties and defaults for the column definitions that
+# eventually end up in tbl_meta.
+my %col_props = (
+ hdr => '',
+ just => '-',
+ dec => 0, # Whether to align the column on the decimal point
+ num => 0,
+ label => '',
+ user => 0,
+ src => '',
+ tbl => '', # Helps when writing/reading custom columns in config files
+ minw => 0,
+ maxw => 0,
+ trans => [],
+ agg => 'first', # Aggregate function
+ aggonly => 0, # Whether to show only when tbl_meta->{aggregate} is true
+);
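+
+# As a rough illustration (the actual merge into tbl_meta happens later in
+# this file): a column that %columns below defines with only hdr, num and
+# label, such as
+#
+#    active_secs => { hdr => 'SecsActive', num => 1, label => '...' },
+#
+# is expected to pick up the rest from these defaults -- trans => [],
+# agg => 'first', minw/maxw => 0 -- while its 'just' alignment is derived
+# from 'num' by the post-processing loop that follows %columns.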
+
+# Actual DBI connections to MySQL servers.
+my %dbhs;
+
+# Command-line parameters {{{2
+# ###########################################################################
+
+my @opt_spec = (
+ { s => 'help', d => 'Show this help message' },
+ { s => 'color|C!', d => 'Use terminal coloring (default)', c => 'color' },
+ { s => 'config|c=s', d => 'Config file to read' },
+ { s => 'nonint|n', d => 'Non-interactive, output tab-separated fields' },
+ { s => 'count=i', d => 'Number of updates before exiting' },
+ { s => 'delay|d=f', d => 'Delay between updates in seconds', c => 'interval' },
+ { s => 'mode|m=s', d => 'Operating mode to start in', c => 'mode' },
+ { s => 'inc|i!', d => 'Measure incremental differences', c => 'status_inc' },
+ { s => 'version', d => 'Output version information and exit' },
+);
+
+# This is the container for the command-line options' values to be stored in
+# after processing. Initial values are defaults.
+my %opts = (
+ n => !( -t STDIN && -t STDOUT ), # Default to non-interactive if in/out are not both terminals
+);
+# Post-process...
+my %opt_seen;
+foreach my $spec ( @opt_spec ) {
+ my ( $long, $short ) = $spec->{s} =~ m/^(\w+)(?:\|([^!+=]*))?/;
+ $spec->{k} = $short || $long;
+ $spec->{l} = $long;
+ $spec->{t} = $short;
+ $spec->{n} = $spec->{s} =~ m/!/;
+ $opts{$spec->{k}} = undef unless defined $opts{$spec->{k}};
+ die "Duplicate option $spec->{k}" if $opt_seen{$spec->{k}}++;
+}
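+
+# For example, the spec 'color|C!' above is split by the regex into
+# l => 'color', t => 'C', k => 'C' (the short name wins when present), and
+# n => 1 because the trailing '!' makes it negatable, so --nocolor on the
+# command line ends up as $opts{C} = 0.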
+
+Getopt::Long::Configure('no_ignore_case', 'bundling');
+GetOptions( map { $_->{s} => \$opts{$_->{k}} } @opt_spec) or $opts{help} = 1;
+
+if ( $opts{version} ) {
+ print "innotop Ver $VERSION Changeset $SVN_REV from $SVN_URL\n";
+ exit(0);
+}
+
+if ( $opts{'help'} ) {
+ print "Usage: innotop <options> <innodb-status-file>\n\n";
+ my $maxw = max(map { length($_->{l}) + ($_->{n} ? 4 : 0)} @opt_spec);
+ foreach my $spec ( sort { $a->{l} cmp $b->{l} } @opt_spec ) {
+ my $long = $spec->{n} ? "[no]$spec->{l}" : $spec->{l};
+ my $short = $spec->{t} ? "-$spec->{t}" : '';
+ printf(" --%-${maxw}s %-4s %s\n", $long, $short, $spec->{d});
+ }
+ print <<USAGE;
+
+innotop is a MySQL and InnoDB transaction/status monitor, like 'top' for
+MySQL. It displays queries, InnoDB transactions, lock waits, deadlocks,
+foreign key errors, open tables, replication status, buffer information,
+row operations, logs, I/O operations, load graph, and more. You can
+monitor many servers at once with innotop.
+
+USAGE
+ exit(1);
+}
+
+# Meta-data (table definitions etc) {{{2
+# ###########################################################################
+
+# Expressions {{{3
+# Convenience so I can copy/paste these in several places...
+# ###########################################################################
+my %exprs = (
+ Host => q{my $host = host || hostname || ''; ($host) = $host =~ m/^((?:[\d.]+(?=:))|(?:[a-zA-Z]\w+))/; return $host || ''},
+ Port => q{my ($p) = host =~ m/:(.*)$/; return $p || 0},
+ OldVersions => q{dulint_to_int(IB_tx_trx_id_counter) - dulint_to_int(IB_tx_purge_done_for)},
+ MaxTxnTime => q/max(map{ $_->{active_secs} } @{ IB_tx_transactions }) || 0/,
+ NumTxns => q{scalar @{ IB_tx_transactions } },
+ DirtyBufs => q{ $cur->{IB_bp_pages_modified} / ($cur->{IB_bp_buf_pool_size} || 1) },
+ BufPoolFill => q{ $cur->{IB_bp_pages_total} / ($cur->{IB_bp_buf_pool_size} || 1) },
+ ServerLoad => q{ $cur->{Threads_connected}/(Questions||1)/Uptime_hires },
+ TxnTimeRemain => q{ defined undo_log_entries && defined $pre->{undo_log_entries} && undo_log_entries < $pre->{undo_log_entries} ? undo_log_entries / (($pre->{undo_log_entries} - undo_log_entries)/((active_secs-$pre->{active_secs})||1))||1 : 0},
+ SlaveCatchupRate => ' defined $cur->{seconds_behind_master} && defined $pre->{seconds_behind_master} && $cur->{seconds_behind_master} < $pre->{seconds_behind_master} ? ($pre->{seconds_behind_master}-$cur->{seconds_behind_master})/($cur->{Uptime_hires}-$pre->{Uptime_hires}) : 0',
+ QcacheHitRatio => q{(Qcache_hits||0)/(((Com_select||0)+(Qcache_hits||0))||1)},
+);
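+
+# Note on the expressions above: they are compiled into Perl subroutines
+# elsewhere in this file. Roughly, bare status/variable names such as
+# Qcache_hits resolve against the current data set, while $cur and $pre give
+# explicit access to the current and previous samples, which is what makes
+# incremental calculations such as SlaveCatchupRate possible.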
+
+# ###########################################################################
+# Column definitions {{{3
+# Defines every column in every table. A named column has the following
+# properties:
+# * hdr Column header/title
+# * label Documentation for humans.
+# * num Whether it's numeric (for sorting).
+# * just Alignment; generated from num, user-overridable in tbl_meta
+# * minw, maxw Auto-generated, user-overridable.
+# Values from this hash are just copied to tbl_meta, which is where everything
+# else in the program should read from.
+# ###########################################################################
+
+my %columns = (
+ active_secs => { hdr => 'SecsActive', num => 1, label => 'Seconds transaction has been active', },
+ add_pool_alloc => { hdr => 'Add\'l Pool', num => 1, label => 'Additional pool allocated' },
+ attempted_op => { hdr => 'Action', num => 0, label => 'The action that caused the error' },
+ awe_mem_alloc => { hdr => 'AWE Memory', num => 1, label => '[Windows] AWE memory allocated' },
+ binlog_cache_overflow => { hdr => 'Binlog Cache', num => 1, label => 'Transactions too big for binlog cache that went to disk' },
+ binlog_do_db => { hdr => 'Binlog Do DB', num => 0, label => 'binlog-do-db setting' },
+ binlog_ignore_db => { hdr => 'Binlog Ignore DB', num => 0, label => 'binlog-ignore-db setting' },
+ bps_in => { hdr => 'BpsIn', num => 1, label => 'Bytes per second received by the server', },
+ bps_out => { hdr => 'BpsOut', num => 1, label => 'Bytes per second sent by the server', },
+ buf_free => { hdr => 'Free Bufs', num => 1, label => 'Buffers free in the buffer pool' },
+ buf_pool_hit_rate => { hdr => 'Hit Rate', num => 0, label => 'Buffer pool hit rate' },
+ buf_pool_hits => { hdr => 'Hits', num => 1, label => 'Buffer pool hits' },
+ buf_pool_reads => { hdr => 'Reads', num => 1, label => 'Buffer pool reads' },
+ buf_pool_size => { hdr => 'Size', num => 1, label => 'Buffer pool size' },
+ bufs_in_node_heap => { hdr => 'Node Heap Bufs', num => 1, label => 'Buffers in buffer pool node heap' },
+ bytes_behind_master => { hdr => 'ByteLag', num => 1, label => 'Bytes the slave lags the master in binlog' },
+ cell_event_set => { hdr => 'Ending?', num => 1, label => 'Whether the cell event is set' },
+ cell_waiting => { hdr => 'Waiting?', num => 1, label => 'Whether the cell is waiting' },
+ child_db => { hdr => 'Child DB', num => 0, label => 'The database of the child table' },
+ child_index => { hdr => 'Child Index', num => 0, label => 'The index in the child table' },
+ child_table => { hdr => 'Child Table', num => 0, label => 'The child table' },
+ cmd => { hdr => 'Cmd', num => 0, label => 'Type of command being executed', },
+ cnt => { hdr => 'Cnt', num => 0, label => 'Count', agg => 'count', aggonly => 1 },
+ connect_retry => { hdr => 'Connect Retry', num => 1, label => 'Slave connect-retry timeout' },
+ cxn => { hdr => 'CXN', num => 0, label => 'Connection from which the data came', },
+ db => { hdr => 'DB', num => 0, label => 'Current database', },
+ dict_mem_alloc => { hdr => 'Dict Mem', num => 1, label => 'Dictionary memory allocated' },
+ dirty_bufs => { hdr => 'Dirty Buf', num => 1, label => 'Dirty buffer pool pages' },
+ dl_txn_num => { hdr => 'Num', num => 0, label => 'Deadlocked transaction number', },
+ event_set => { hdr => 'Evt Set?', num => 1, label => '[Win32] if a wait event is set', },
+ exec_master_log_pos => { hdr => 'Exec Master Log Pos', num => 1, label => 'Exec Master Log Position' },
+ fk_name => { hdr => 'Constraint', num => 0, label => 'The name of the FK constraint' },
+ free_list_len => { hdr => 'Free List Len', num => 1, label => 'Length of the free list' },
+ has_read_view => { hdr => 'Rd View', num => 1, label => 'Whether the transaction has a read view' },
+ hash_searches_s => { hdr => 'Hash/Sec', num => 1, label => 'Number of hash searches/sec' },
+ hash_table_size => { hdr => 'Size', num => 1, label => 'Hash table size' },
+ heap_no => { hdr => 'Heap', num => 1, label => 'Heap number' },
+ heap_size => { hdr => 'Heap', num => 1, label => 'Heap size' },
+ history_list_len => { hdr => 'History', num => 1, label => 'History list length' },
+ host_and_domain => { hdr => 'Host', num => 0, label => 'Hostname/IP and domain' },
+ host_and_port => { hdr => 'Host/IP', num => 0, label => 'Hostname or IP address, and port number', },
+ hostname => { hdr => 'Host', num => 0, label => 'Hostname' },
+ index => { hdr => 'Index', num => 0, label => 'The index involved' },
+ index_ref => { hdr => 'Index Ref', num => 0, label => 'Index referenced' },
+ info => { hdr => 'Query', num => 0, label => 'Info or the current query', },
+ insert_intention => { hdr => 'Ins Intent', num => 1, label => 'Whether the thread was trying to insert' },
+ inserts => { hdr => 'Inserts', num => 1, label => 'Inserts' },
+ io_bytes_s => { hdr => 'Bytes/Sec', num => 1, label => 'Average I/O bytes/sec' },
+ io_flush_type => { hdr => 'Flush Type', num => 0, label => 'I/O Flush Type' },
+ io_fsyncs_s => { hdr => 'fsyncs/sec', num => 1, label => 'I/O fsyncs/sec' },
+ io_reads_s => { hdr => 'Reads/Sec', num => 1, label => 'Average I/O reads/sec' },
+ io_writes_s => { hdr => 'Writes/Sec', num => 1, label => 'Average I/O writes/sec' },
+ ip => { hdr => 'IP', num => 0, label => 'IP address' },
+ is_name_locked => { hdr => 'Locked', num => 1, label => 'Whether table is name locked', },
+ key_buffer_hit => { hdr => 'KCacheHit', num => 1, label => 'Key cache hit ratio', },
+ key_len => { hdr => 'Key Length', num => 1, label => 'Number of bytes used in the key' },
+ last_chkp => { hdr => 'Last Checkpoint', num => 0, label => 'Last log checkpoint' },
+ last_errno => { hdr => 'Last Errno', num => 1, label => 'Last error number' },
+ last_error => { hdr => 'Last Error', num => 0, label => 'Last error' },
+ last_s_file_name => { hdr => 'S-File', num => 0, label => 'Filename where last read locked' },
+ last_s_line => { hdr => 'S-Line', num => 1, label => 'Line where last read locked' },
+ last_x_file_name => { hdr => 'X-File', num => 0, label => 'Filename where last write locked' },
+ last_x_line => { hdr => 'X-Line', num => 1, label => 'Line where last write locked' },
+ last_pct => { hdr => 'Pct', num => 1, label => 'Last Percentage' },
+ last_total => { hdr => 'Last Total', num => 1, label => 'Last Total' },
+ last_value => { hdr => 'Last Incr', num => 1, label => 'Last Value' },
+ load => { hdr => 'Load', num => 1, label => 'Server load' },
+ lock_cfile_name => { hdr => 'Crtd File', num => 0, label => 'Filename where lock created' },
+ lock_cline => { hdr => 'Crtd Line', num => 1, label => 'Line where lock created' },
+ lock_mem_addr => { hdr => 'Addr', num => 0, label => 'The lock memory address' },
+ lock_mode => { hdr => 'Mode', num => 0, label => 'The lock mode' },
+ lock_structs => { hdr => 'LStrcts', num => 1, label => 'Number of lock structs' },
+ lock_type => { hdr => 'Type', num => 0, label => 'The lock type' },
+ lock_var => { hdr => 'Lck Var', num => 1, label => 'The lock variable' },
+ lock_wait_time => { hdr => 'Wait', num => 1, label => 'How long txn has waited for a lock' },
+ log_flushed_to => { hdr => 'Flushed To', num => 0, label => 'Log position flushed to' },
+ log_ios_done => { hdr => 'IO Done', num => 1, label => 'Log I/Os done' },
+ log_ios_s => { hdr => 'IO/Sec', num => 1, label => 'Average log I/Os per sec' },
+ log_seq_no => { hdr => 'Sequence No.', num => 0, label => 'Log sequence number' },
+ main_thread_id => { hdr => 'Main Thread ID', num => 1, label => 'Main thread ID' },
+ main_thread_proc_no => { hdr => 'Main Thread Proc', num => 1, label => 'Main thread process number' },
+ main_thread_state => { hdr => 'Main Thread State', num => 0, label => 'Main thread state' },
+ master_file => { hdr => 'File', num => 0, label => 'Master file' },
+ master_host => { hdr => 'Master', num => 0, label => 'Master server hostname' },
+ master_log_file => { hdr => 'Master Log File', num => 0, label => 'Master log file' },
+ master_port => { hdr => 'Master Port', num => 1, label => 'Master port' },
+ master_pos => { hdr => 'Position', num => 1, label => 'Master position' },
+ master_ssl_allowed => { hdr => 'Master SSL Allowed', num => 0, label => 'Master SSL Allowed' },
+ master_ssl_ca_file => { hdr => 'Master SSL CA File', num => 0, label => 'Master SSL Cert Auth File' },
+ master_ssl_ca_path => { hdr => 'Master SSL CA Path', num => 0, label => 'Master SSL Cert Auth Path' },
+ master_ssl_cert => { hdr => 'Master SSL Cert', num => 0, label => 'Master SSL Cert' },
+ master_ssl_cipher => { hdr => 'Master SSL Cipher', num => 0, label => 'Master SSL Cipher' },
+ master_ssl_key => { hdr => 'Master SSL Key', num => 0, label => 'Master SSL Key' },
+ master_user => { hdr => 'Master User', num => 0, label => 'Master username' },
+ max_txn => { hdr => 'MaxTxnTime', num => 1, label => 'MaxTxn' },
+ merged_recs => { hdr => 'Merged Recs', num => 1, label => 'Merged records' },
+ merges => { hdr => 'Merges', num => 1, label => 'Merges' },
+ mutex_os_waits => { hdr => 'Waits', num => 1, label => 'Mutex OS Waits' },
+ mutex_spin_rounds => { hdr => 'Rounds', num => 1, label => 'Mutex Spin Rounds' },
+ mutex_spin_waits => { hdr => 'Spins', num => 1, label => 'Mutex Spin Waits' },
+ mysql_thread_id => { hdr => 'ID', num => 1, label => 'MySQL connection (thread) ID', },
+ name => { hdr => 'Name', num => 0, label => 'Variable Name' },
+ n_bits => { hdr => '# Bits', num => 1, label => 'Number of bits' },
+ non_hash_searches_s => { hdr => 'Non-Hash/Sec', num => 1, label => 'Non-hash searches/sec' },
+ num_deletes => { hdr => 'Del', num => 1, label => 'Number of deletes' },
+ num_deletes_sec => { hdr => 'Del/Sec', num => 1, label => 'Number of deletes per second' },
+ num_inserts => { hdr => 'Ins', num => 1, label => 'Number of inserts' },
+ num_inserts_sec => { hdr => 'Ins/Sec', num => 1, label => 'Number of inserts per second' },
+ num_readers => { hdr => 'Readers', num => 1, label => 'Number of readers' },
+ num_reads => { hdr => 'Read', num => 1, label => 'Number of reads' },
+ num_reads_sec => { hdr => 'Read/Sec', num => 1, label => 'Number of reads per second' },
+ num_res_ext => { hdr => 'BTree Extents', num => 1, label => 'Number of extents reserved for B-Tree' },
+ num_rows => { hdr => 'Row Count', num => 1, label => 'Number of rows estimated to examine' },
+ num_times_open => { hdr => 'In Use', num => 1, label => '# times table is opened', },
+ num_txns => { hdr => 'Txns', num => 1, label => 'Number of transactions' },
+ num_updates => { hdr => 'Upd', num => 1, label => 'Number of updates' },
+ num_updates_sec => { hdr => 'Upd/Sec', num => 1, label => 'Number of updates per second' },
+ os_file_reads => { hdr => 'OS Reads', num => 1, label => 'OS file reads' },
+ os_file_writes => { hdr => 'OS Writes', num => 1, label => 'OS file writes' },
+ os_fsyncs => { hdr => 'OS fsyncs', num => 1, label => 'OS fsyncs' },
+ os_thread_id => { hdr => 'OS Thread', num => 1, label => 'The operating system thread ID' },
+ p_aio_writes => { hdr => 'Async Wrt', num => 1, label => 'Pending asynchronous I/O writes' },
+ p_buf_pool_flushes => { hdr => 'Buffer Pool Flushes', num => 1, label => 'Pending buffer pool flushes' },
+ p_ibuf_aio_reads => { hdr => 'IBuf Async Rds', num => 1, label => 'Pending insert buffer asynch I/O reads' },
+ p_log_flushes => { hdr => 'Log Flushes', num => 1, label => 'Pending log flushes' },
+ p_log_ios => { hdr => 'Log I/Os', num => 1, label => 'Pending log I/O operations' },
+ p_normal_aio_reads => { hdr => 'Async Rds', num => 1, label => 'Pending asynchronous I/O reads' },
+ p_preads => { hdr => 'preads', num => 1, label => 'Pending p-reads' },
+ p_pwrites => { hdr => 'pwrites', num => 1, label => 'Pending p-writes' },
+ p_sync_ios => { hdr => 'Sync I/Os', num => 1, label => 'Pending synchronous I/O operations' },
+ page_creates_sec => { hdr => 'Creates/Sec', num => 1, label => 'Page creates/sec' },
+ page_no => { hdr => 'Page', num => 1, label => 'Page number' },
+ page_reads_sec => { hdr => 'Reads/Sec', num => 1, label => 'Page reads per second' },
+ page_writes_sec => { hdr => 'Writes/Sec', num => 1, label => 'Page writes per second' },
+ pages_created => { hdr => 'Created', num => 1, label => 'Pages created' },
+ pages_modified => { hdr => 'Dirty Pages', num => 1, label => 'Pages modified (dirty)' },
+ pages_read => { hdr => 'Reads', num => 1, label => 'Pages read' },
+ pages_total => { hdr => 'Pages', num => 1, label => 'Pages total' },
+ pages_written => { hdr => 'Writes', num => 1, label => 'Pages written' },
+ parent_col => { hdr => 'Parent Column', num => 0, label => 'The referred column in the parent table', },
+ parent_db => { hdr => 'Parent DB', num => 0, label => 'The database of the parent table' },
+ parent_index => { hdr => 'Parent Index', num => 0, label => 'The referred index in the parent table' },
+ parent_table => { hdr => 'Parent Table', num => 0, label => 'The parent table' },
+ part_id => { hdr => 'Part ID', num => 1, label => 'Sub-part ID of the query' },
+ partitions => { hdr => 'Partitions', num => 0, label => 'Query partitions used' },
+ pct => { hdr => 'Pct', num => 1, label => 'Percentage' },
+ pending_chkp_writes => { hdr => 'Chkpt Writes', num => 1, label => 'Pending log checkpoint writes' },
+ pending_log_writes => { hdr => 'Log Writes', num => 1, label => 'Pending log writes' },
+ port => { hdr => 'Port', num => 1, label => 'Client port number', },
+ possible_keys => { hdr => 'Poss. Keys', num => 0, label => 'Possible keys' },
+ proc_no => { hdr => 'Proc', num => 1, label => 'Process number' },
+ q_cache_hit => { hdr => 'QCacheHit', num => 1, label => 'Query cache hit ratio', },
+ qps => { hdr => 'QPS', num => 1, label => 'How many queries/sec', },
+ queries_in_queue => { hdr => 'Queries Queued', num => 1, label => 'Queries in queue' },
+ queries_inside => { hdr => 'Queries Inside', num => 1, label => 'Queries inside InnoDB' },
+ query_id => { hdr => 'Query ID', num => 1, label => 'Query ID' },
+ query_status => { hdr => 'Query Status', num => 0, label => 'The query status' },
+ query_text => { hdr => 'Query Text', num => 0, label => 'The query text' },
+ questions => { hdr => 'Questions', num => 1, label => 'How many queries the server has gotten', },
+ read_master_log_pos => { hdr => 'Read Master Pos', num => 1, label => 'Read master log position' },
+ read_views_open => { hdr => 'Rd Views', num => 1, label => 'Number of read views open' },
+ reads_pending => { hdr => 'Pending Reads', num => 1, label => 'Reads pending' },
+ relay_log_file => { hdr => 'Relay File', num => 0, label => 'Relay log file' },
+ relay_log_pos => { hdr => 'Relay Pos', num => 1, label => 'Relay log position' },
+ relay_log_size => { hdr => 'Relay Size', num => 1, label => 'Relay log size' },
+ relay_master_log_file => { hdr => 'Relay Master File', num => 0, label => 'Relay master log file' },
+ replicate_do_db => { hdr => 'Do DB', num => 0, label => 'Replicate-do-db setting' },
+ replicate_do_table => { hdr => 'Do Table', num => 0, label => 'Replicate-do-table setting' },
+ replicate_ignore_db => { hdr => 'Ignore DB', num => 0, label => 'Replicate-ignore-db setting' },
+ replicate_ignore_table => { hdr => 'Ignore Table', num => 0, label => 'Replicate-ignore-table setting' },
+ replicate_wild_do_table => { hdr => 'Wild Do Table', num => 0, label => 'Replicate-wild-do-table setting' },
+ replicate_wild_ignore_table => { hdr => 'Wild Ignore Table', num => 0, label => 'Replicate-wild-ignore-table setting' },
+ request_type => { hdr => 'Type', num => 0, label => 'Type of lock the thread waits for' },
+ reservation_count => { hdr => 'ResCnt', num => 1, label => 'Reservation Count' },
+ row_locks => { hdr => 'RLocks', num => 1, label => 'Number of row locks' },
+ rw_excl_os_waits => { hdr => 'RW Waits', num => 1, label => 'R/W Excl. OS Waits' },
+ rw_excl_spins => { hdr => 'RW Spins', num => 1, label => 'R/W Excl. Spins' },
+ rw_shared_os_waits => { hdr => 'Sh Waits', num => 1, label => 'R/W Shared OS Waits' },
+ rw_shared_spins => { hdr => 'Sh Spins', num => 1, label => 'R/W Shared Spins' },
+ scan_type => { hdr => 'Type', num => 0, label => 'Scan type chosen' },
+ seg_size => { hdr => 'Seg. Size', num => 1, label => 'Segment size' },
+ select_type => { hdr => 'Select Type', num => 0, label => 'Type of select used' },
+ signal_count => { hdr => 'Signals', num => 1, label => 'Signal Count' },
+ size => { hdr => 'Size', num => 1, label => 'Size of the tablespace' },
+ skip_counter => { hdr => 'Skip Counter', num => 1, label => 'Skip counter' },
+ slave_catchup_rate => { hdr => 'Catchup', num => 1, label => 'How fast the slave is catching up in the binlog' },
+ slave_io_running => { hdr => 'Slave-IO', num => 0, label => 'Whether the slave I/O thread is running' },
+ slave_io_state => { hdr => 'Slave IO State', num => 0, label => 'Slave I/O thread state' },
+ slave_open_temp_tables => { hdr => 'Temp', num => 1, label => 'Slave open temp tables' },
+ slave_sql_running => { hdr => 'Slave-SQL', num => 0, label => 'Whether the slave SQL thread is running' },
+ slow => { hdr => 'Slow', num => 1, label => 'How many slow queries', },
+ space_id => { hdr => 'Space', num => 1, label => 'Tablespace ID' },
+ special => { hdr => 'Special', num => 0, label => 'Special/Other info' },
+ state => { hdr => 'State', num => 0, label => 'Connection state', maxw => 18, },
+ tables_in_use => { hdr => 'Tbl Used', num => 1, label => 'Number of tables in use' },
+ tables_locked => { hdr => 'Tbl Lck', num => 1, label => 'Number of tables locked' },
+ tbl => { hdr => 'Table', num => 0, label => 'Table', },
+ thread => { hdr => 'Thread', num => 1, label => 'Thread number' },
+ thread_decl_inside => { hdr => 'Thread Inside', num => 0, label => 'What the thread is declared inside' },
+ thread_purpose => { hdr => 'Purpose', num => 0, label => "The thread's purpose" },
+ thread_status => { hdr => 'Thread Status', num => 0, label => 'The thread status' },
+ time => { hdr => 'Time', num => 1, label => 'Time since the last event', },
+ time_behind_master => { hdr => 'TimeLag', num => 1, label => 'Time slave lags master' },
+ timestring => { hdr => 'Timestring', num => 0, label => 'Time the event occurred' },
+ total => { hdr => 'Total', num => 1, label => 'Total' },
+ total_mem_alloc => { hdr => 'Memory', num => 1, label => 'Total memory allocated' },
+ truncates => { hdr => 'Trunc', num => 0, label => 'Whether the deadlock is truncating InnoDB status' },
+ txn_doesnt_see_ge => { hdr => "Txn Won't See", num => 0, label => 'Where txn read view is limited' },
+ txn_id => { hdr => 'ID', num => 0, label => 'Transaction ID' },
+ txn_sees_lt => { hdr => 'Txn Sees', num => 1, label => 'Where txn read view is limited' },
+ txn_status => { hdr => 'Txn Status', num => 0, label => 'Transaction status' },
+ txn_time_remain => { hdr => 'Remaining', num => 1, label => 'Time until txn rollback/commit completes' },
+ undo_log_entries => { hdr => 'Undo', num => 1, label => 'Number of undo log entries' },
+ undo_for => { hdr => 'Undo', num => 0, label => 'Undo for' },
+ until_condition => { hdr => 'Until Condition', num => 0, label => 'Slave until condition' },
+ until_log_file => { hdr => 'Until Log File', num => 0, label => 'Slave until log file' },
+ until_log_pos => { hdr => 'Until Log Pos', num => 1, label => 'Slave until log position' },
+ used_cells => { hdr => 'Cells Used', num => 1, label => 'Number of cells used' },
+ used_bufs => { hdr => 'Used Bufs', num => 1, label => 'Number of buffer pool pages used' },
+ user => { hdr => 'User', num => 0, label => 'Database username', },
+ value => { hdr => 'Value', num => 1, label => 'Value' },
+ versions => { hdr => 'Versions', num => 1, label => 'Number of InnoDB MVCC versions unpurged' },
+ victim => { hdr => 'Victim', num => 0, label => 'Whether this txn was the deadlock victim' },
+ wait_array_size => { hdr => 'Wait Array Size', num => 1, label => 'Wait Array Size' },
+ wait_status => { hdr => 'Lock Status', num => 0, label => 'Status of txn locks' },
+ waited_at_filename => { hdr => 'File', num => 0, label => 'Filename at which thread waits' },
+ waited_at_line => { hdr => 'Line', num => 1, label => 'Line at which thread waits' },
+ waiters_flag => { hdr => 'Waiters', num => 1, label => 'Waiters Flag' },
+ waiting => { hdr => 'Waiting', num => 1, label => 'Whether lock is being waited for' },
+ when => { hdr => 'When', num => 0, label => 'Time scale' },
+ writer_lock_mode => { hdr => 'Wrtr Lck Mode', num => 0, label => 'Writer lock mode' },
+ writer_thread => { hdr => 'Wrtr Thread', num => 1, label => 'Writer thread ID' },
+ writes_pending => { hdr => 'Writes', num => 1, label => 'Number of writes pending' },
+ writes_pending_flush_list => { hdr => 'Flush List Writes', num => 1, label => 'Number of flush list writes pending' },
+ writes_pending_lru => { hdr => 'LRU Writes', num => 1, label => 'Number of LRU writes pending' },
+ writes_pending_single_page => { hdr => '1-Page Writes', num => 1, label => 'Number of 1-page writes pending' },
+);
+
+# Apply a default property or three. By default, columns are not width-constrained,
+# aligned left, and sorted alphabetically, not numerically.
+foreach my $col ( values %columns ) {
+ map { $col->{$_} ||= 0 } qw(num minw maxw);
+ $col->{just} = $col->{num} ? '' : '-';
+}
+
+# Filters {{{3
+# This hash defines every filter that can be applied to a table. These
+# become part of tbl_meta as well. Each filter is just an expression that
+# returns true or false.
+# Properties of each entry:
+# * func: the subroutine
+# * name: the name, repeated
+# * user: whether it's a user-defined filter (saved in config)
+# * text: text of the subroutine
+# * note: explanation
+my %filters = ();
+
+# These are pre-processed to live in %filters above, by compiling them.
+my %builtin_filters = (
+ hide_self => {
+ text => <<' END',
+ return ( !$set->{info} || $set->{info} ne 'SHOW FULL PROCESSLIST' )
+ && ( !$set->{query_text} || $set->{query_text} !~ m/INNODB STATUS$/ );
+ END
+ note => 'Removes the innotop processes from the list',
+ tbls => [qw(innodb_transactions processlist)],
+ },
+ hide_inactive => {
+ text => <<' END',
+ return ( !defined($set->{txn_status}) || $set->{txn_status} ne 'not started' )
+ && ( !defined($set->{cmd}) || $set->{cmd} !~ m/Sleep|Binlog Dump/ )
+ && ( !defined($set->{info}) || $set->{info} =~ m/\S/ );
+ END
+ note => 'Removes processes which are not doing anything',
+ tbls => [qw(innodb_transactions processlist)],
+ },
+ hide_slave_io => {
+ text => <<' END',
+ return !$set->{state} || $set->{state} !~ m/^(?:Waiting for master|Has read all relay)/;
+ END
+ note => 'Removes slave I/O threads from the list',
+ tbls => [qw(processlist slave_io_status)],
+ },
+ table_is_open => {
+ text => <<' END',
+ return $set->{num_times_open} + $set->{is_name_locked};
+ END
+ note => 'Removes tables that are not in use or locked',
+ tbls => [qw(open_tables)],
+ },
+ cxn_is_master => {
+ text => <<' END',
+ return $set->{master_file} ? 1 : 0;
+ END
+ note => 'Removes servers that are not masters',
+ tbls => [qw(master_status)],
+ },
+ cxn_is_slave => {
+ text => <<' END',
+ return $set->{master_host} ? 1 : 0;
+ END
+ note => 'Removes servers that are not slaves',
+ tbls => [qw(slave_io_status slave_sql_status)],
+ },
+ thd_is_not_waiting => {
+ text => <<' END',
+ return $set->{thread_status} !~ m#waiting for i/o request#;
+ END
+ note => 'Removes idle I/O threads',
+ tbls => [qw(io_threads)],
+ },
+);
+foreach my $key ( keys %builtin_filters ) {
+ my ( $sub, $err ) = compile_filter($builtin_filters{$key}->{text});
+ $filters{$key} = {
+ func => $sub,
+ text => $builtin_filters{$key}->{text},
+ user => 0,
+ name => $key, # useful for later
+ note => $builtin_filters{$key}->{note},
+ tbls => $builtin_filters{$key}->{tbls},
+ }
+}
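+
+# A user-defined filter (user => 1, saved in the config file) goes through
+# the same compile_filter() step; as a sketch, with a made-up filter body:
+#
+#   my ( $sub, $err ) = compile_filter('return $set->{user} ne "root";');
+#   $filters{hide_root_user} = {
+#      func => $sub,
+#      text => 'return $set->{user} ne "root";',
+#      user => 1,
+#      name => 'hide_root_user',
+#      note => 'Hides connections from the root user',
+#      tbls => [qw(processlist)],
+#   } unless $err;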
+
+# Variable sets {{{3
+# Sets (arrayrefs) of variables that are used in S mode. They are read/written to
+# the config file.
+my %var_sets = (
+ general => {
+ text => join(
+ ', ',
+ 'set_precision(Questions/Uptime_hires) as QPS',
+ 'set_precision(Com_commit/Uptime_hires) as Commit_PS',
+ 'set_precision((Com_rollback||0)/(Com_commit||1)) as Rollback_Commit',
+ 'set_precision(('
+ . join('+', map { "($_||0)" }
+ qw(Com_delete Com_delete_multi Com_insert Com_insert_select Com_replace
+ Com_replace_select Com_select Com_update Com_update_multi))
+ . ')/(Com_commit||1)) as Write_Commit',
+ 'set_precision((Com_select+(Qcache_hits||0))/(('
+ . join('+', map { "($_||0)" }
+ qw(Com_delete Com_delete_multi Com_insert Com_insert_select Com_replace
+ Com_replace_select Com_select Com_update Com_update_multi))
+ . ')||1)) as R_W_Ratio',
+ 'set_precision(Opened_tables/Uptime_hires) as Opens_PS',
+ 'percent($cur->{Open_tables}/($cur->{table_cache})) as Table_Cache_Used',
+ 'set_precision(Threads_created/Uptime_hires) as Threads_PS',
+ 'percent($cur->{Threads_cached}/($cur->{thread_cache_size}||1)) as Thread_Cache_Used',
+ 'percent($cur->{Max_used_connections}/($cur->{max_connections}||1)) as CXN_Used_Ever',
+ 'percent($cur->{Threads_connected}/($cur->{max_connections}||1)) as CXN_Used_Now',
+ ),
+ },
+ commands => {
+ text => join(
+ ', ',
+ qw(Uptime Questions Com_delete Com_delete_multi Com_insert
+ Com_insert_select Com_replace Com_replace_select Com_select Com_update
+ Com_update_multi)
+ ),
+ },
+ query_status => {
+ text => join(
+ ',',
+ qw( Uptime Select_full_join Select_full_range_join Select_range
+ Select_range_check Select_scan Slow_queries Sort_merge_passes
+ Sort_range Sort_rows Sort_scan)
+ ),
+ },
+ innodb => {
+ text => join(
+ ',',
+ qw( Uptime Innodb_row_lock_current_waits Innodb_row_lock_time
+ Innodb_row_lock_time_avg Innodb_row_lock_time_max Innodb_row_lock_waits
+ Innodb_rows_deleted Innodb_rows_inserted Innodb_rows_read
+ Innodb_rows_updated)
+ ),
+ },
+ txn => {
+ text => join(
+ ',',
+ qw( Uptime Com_begin Com_commit Com_rollback Com_savepoint
+ Com_xa_commit Com_xa_end Com_xa_prepare Com_xa_recover Com_xa_rollback
+ Com_xa_start)
+ ),
+ },
+ key_cache => {
+ text => join(
+ ',',
+ qw( Uptime Key_blocks_not_flushed Key_blocks_unused Key_blocks_used
+ Key_read_requests Key_reads Key_write_requests Key_writes )
+ ),
+ },
+ query_cache => {
+ text => join(
+ ',',
+ "percent($exprs{QcacheHitRatio}) as Hit_Pct",
+ 'set_precision((Qcache_hits||0)/(Qcache_inserts||1)) as Hit_Ins',
+ 'set_precision((Qcache_lowmem_prunes||0)/Uptime_hires) as Lowmem_Prunes_sec',
+ 'percent(1-((Qcache_free_blocks||0)/(Qcache_total_blocks||1))) as Blocks_used',
+ qw( Qcache_free_blocks Qcache_free_memory Qcache_not_cached Qcache_queries_in_cache)
+ ),
+ },
+ handler => {
+ text => join(
+ ',',
+ qw( Uptime Handler_read_key Handler_read_first Handler_read_next
+ Handler_read_prev Handler_read_rnd Handler_read_rnd_next Handler_delete
+ Handler_update Handler_write)
+ ),
+ },
+ cxns_files_threads => {
+ text => join(
+ ',',
+ qw( Uptime Aborted_clients Aborted_connects Bytes_received Bytes_sent
+ Compression Connections Created_tmp_disk_tables Created_tmp_files
+ Created_tmp_tables Max_used_connections Open_files Open_streams
+ Open_tables Opened_tables Table_locks_immediate Table_locks_waited
+ Threads_cached Threads_connected Threads_created Threads_running)
+ ),
+ },
+ prep_stmt => {
+ text => join(
+ ',',
+ qw( Uptime Com_dealloc_sql Com_execute_sql Com_prepare_sql Com_reset
+ Com_stmt_close Com_stmt_execute Com_stmt_fetch Com_stmt_prepare
+ Com_stmt_reset Com_stmt_send_long_data )
+ ),
+ },
+ innodb_health => {
+ text => join(
+ ',',
+ "$exprs{OldVersions} as OldVersions",
+ qw(IB_sm_mutex_spin_waits IB_sm_mutex_spin_rounds IB_sm_mutex_os_waits),
+ "$exprs{NumTxns} as NumTxns",
+ "$exprs{MaxTxnTime} as MaxTxnTime",
+ qw(IB_ro_queries_inside IB_ro_queries_in_queue),
+ "set_precision($exprs{DirtyBufs} * 100) as dirty_bufs",
+ "set_precision($exprs{BufPoolFill} * 100) as buf_fill",
+ qw(IB_bp_pages_total IB_bp_pages_read IB_bp_pages_written IB_bp_pages_created)
+ ),
+ },
+ innodb_health2 => {
+ text => join(
+ ', ',
+ 'percent(1-((Innodb_buffer_pool_pages_free||0)/($cur->{Innodb_buffer_pool_pages_total}||1))) as BP_page_cache_usage',
+ 'percent(1-((Innodb_buffer_pool_reads||0)/(Innodb_buffer_pool_read_requests||1))) as BP_cache_hit_ratio',
+ 'Innodb_buffer_pool_wait_free',
+ 'Innodb_log_waits',
+ ),
+ },
+ slow_queries => {
+ text => join(
+ ', ',
+ 'set_precision(Slow_queries/Uptime_hires) as Slow_PS',
+ 'set_precision(Select_full_join/Uptime_hires) as Full_Join_PS',
+ 'percent(Select_full_join/(Com_select||1)) as Full_Join_Ratio',
+ ),
+ },
+);
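+
+# Each set's 'text' above is simply a comma-separated list of expressions,
+# each optionally aliased with 'as Name'; a user-defined set added to the
+# config file follows the same form, for example (made up for illustration):
+#
+#   my_quick_check => {
+#      text => 'Uptime, set_precision(Questions/Uptime_hires) as QPS',
+#   },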
+
+# Server sets {{{3
+# Defines sets of servers between which the user can quickly switch.
+my %server_groups;
+
+# Connections {{{3
+# This hash defines server connections. Each connection is a string that can be passed to
+# the DBI connection. These are saved in the connections section in the config file.
+my %connections;
+# Defines the parts of connections.
+my @conn_parts = qw(user have_user pass have_pass dsn savepass dl_table);
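+
+# As an illustration only (the shape is inferred from @conn_parts above; real
+# entries are created interactively or loaded from the config file), a
+# connection entry might look roughly like:
+#
+#   $connections{localhost} = {
+#      dsn      => 'DBI:mysql:;host=127.0.0.1;port=3306',  # a standard DBD::mysql DSN
+#      user     => 'monitor', have_user => 1,
+#      pass     => '',        have_pass => 0,
+#      savepass => 0,
+#   };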
+
+# Graph widths {{{3
+# This hash defines the max values seen for various status/variable values, for graphing.
+# These are stored in their own section in the config file. These are just initial values:
+my %mvs = (
+ Com_select => 50,
+ Com_insert => 50,
+ Com_update => 50,
+ Com_delete => 50,
+ Questions => 100,
+);
+
+# ###########################################################################
+# Valid Term::ANSIColor color strings.
+# ###########################################################################
+my %ansicolors = map { $_ => 1 }
+ qw( black blink blue bold clear concealed cyan dark green magenta on_black
+ on_blue on_cyan on_green on_magenta on_red on_white on_yellow red reset
+ reverse underline underscore white yellow);
+
+# ###########################################################################
+# Valid comparison operators for color rules
+# ###########################################################################
+my %comp_ops = (
+ '==' => 'Numeric equality',
+ '>' => 'Numeric greater-than',
+ '<' => 'Numeric less-than',
+ '>=' => 'Numeric greater-than/equal',
+ '<=' => 'Numeric less-than/equal',
+ '!=' => 'Numeric not-equal',
+ 'eq' => 'String equality',
+ 'gt' => 'String greater-than',
+ 'lt' => 'String less-than',
+ 'ge' => 'String greater-than/equal',
+ 'le' => 'String less-than/equal',
+ 'ne' => 'String not-equal',
+ '=~' => 'Pattern match',
+ '!~' => 'Negated pattern match',
+);
+
+# ###########################################################################
+# Valid aggregate functions.
+# ###########################################################################
+my %agg_funcs = (
+ first => sub {
+ return $_[0]
+ },
+ count => sub {
+ return 0 + @_;
+ },
+ avg => sub {
+ my @args = grep { defined $_ } @_;
+ return (sum(map { m/([\d\.-]+)/g } @args) || 0) / (scalar(@args) || 1);
+ },
+ sum => \&sum,
+);
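+
+# For example (illustration only), aggregating one column across three
+# grouped rows:
+#
+#   $agg_funcs{first}->( 10, 20, 30 );   # 10
+#   $agg_funcs{count}->( 10, 20, 30 );   # 3
+#   $agg_funcs{avg}->( 10, 20, 30 );     # 20
+#   $agg_funcs{sum}->( 10, 20, 30 );     # 60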
+
+# ###########################################################################
+# Valid functions for transformations.
+# ###########################################################################
+my %trans_funcs = (
+ shorten => \&shorten,
+ secs_to_time => \&secs_to_time,
+ no_ctrl_char => \&no_ctrl_char,
+ percent => \&percent,
+ commify => \&commify,
+ dulint_to_int => \&dulint_to_int,
+ set_precision => \&set_precision,
+);
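+
+# These are referenced by name from the 'trans' property of column
+# definitions (e.g. trans => [qw(shorten)] in %tbl_meta below); as the names
+# suggest, each listed function transforms the column's value for display.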
+
+# Table definitions {{{3
+# This hash defines every table that can get displayed in every mode. Each
+# table specifies columns and column data sources. The column is
+# defined by the %columns hash.
+#
+# Example: foo => { src => 'bar' } means the foo column (look at
+# $columns{foo} for its definition) gets its data from the 'bar' element of
+# the current data set, whatever that is.
+#
+# These columns are post-processed after being defined, because they get stuff
+# from %columns. After all the config is loaded for columns, there's more
+# post-processing too; the subroutines compiled from src get added to
+# the hash elements for extract_values to use.
+# ###########################################################################
+
+my %tbl_meta = (
+ adaptive_hash_index => {
+ capt => 'Adaptive Hash Index',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ hash_table_size => { src => 'IB_ib_hash_table_size', trans => [qw(shorten)], },
+ used_cells => { src => 'IB_ib_used_cells' },
+ bufs_in_node_heap => { src => 'IB_ib_bufs_in_node_heap' },
+ hash_searches_s => { src => 'IB_ib_hash_searches_s' },
+ non_hash_searches_s => { src => 'IB_ib_non_hash_searches_s' },
+ },
+ visible => [ qw(cxn hash_table_size used_cells bufs_in_node_heap hash_searches_s non_hash_searches_s) ],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'ib',
+ group_by => [],
+ aggregate => 0,
+ },
+ buffer_pool => {
+ capt => 'Buffer Pool',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ total_mem_alloc => { src => 'IB_bp_total_mem_alloc', trans => [qw(shorten)], },
+ awe_mem_alloc => { src => 'IB_bp_awe_mem_alloc', trans => [qw(shorten)], },
+ add_pool_alloc => { src => 'IB_bp_add_pool_alloc', trans => [qw(shorten)], },
+ buf_pool_size => { src => 'IB_bp_buf_pool_size', trans => [qw(shorten)], },
+ buf_free => { src => 'IB_bp_buf_free' },
+ buf_pool_hit_rate => { src => 'IB_bp_buf_pool_hit_rate' },
+ buf_pool_reads => { src => 'IB_bp_buf_pool_reads' },
+ buf_pool_hits => { src => 'IB_bp_buf_pool_hits' },
+ dict_mem_alloc => { src => 'IB_bp_dict_mem_alloc' },
+ pages_total => { src => 'IB_bp_pages_total' },
+ pages_modified => { src => 'IB_bp_pages_modified' },
+ reads_pending => { src => 'IB_bp_reads_pending' },
+ writes_pending => { src => 'IB_bp_writes_pending' },
+ writes_pending_lru => { src => 'IB_bp_writes_pending_lru' },
+ writes_pending_flush_list => { src => 'IB_bp_writes_pending_flush_list' },
+ writes_pending_single_page => { src => 'IB_bp_writes_pending_single_page' },
+ page_creates_sec => { src => 'IB_bp_page_creates_sec' },
+ page_reads_sec => { src => 'IB_bp_page_reads_sec' },
+ page_writes_sec => { src => 'IB_bp_page_writes_sec' },
+ pages_created => { src => 'IB_bp_pages_created' },
+ pages_read => { src => 'IB_bp_pages_read' },
+ pages_written => { src => 'IB_bp_pages_written' },
+ },
+ visible => [ qw(cxn buf_pool_size buf_free pages_total pages_modified buf_pool_hit_rate total_mem_alloc add_pool_alloc)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'bp',
+ group_by => [],
+ aggregate => 0,
+ },
+ # TODO: a new step in set_to_tbl: join result to itself, grouped?
+ # TODO: this would also enable pulling Q and T data together.
+ # TODO: using a SQL-ish language would also allow pivots to be easier -- treat the pivoted data as a view and SELECT from it.
+ cmd_summary => {
+ capt => 'Command Summary',
+ cust => {},
+ cols => {
+ name => { src => 'name' },
+ total => { src => 'total' },
+ value => { src => 'value', agg => 'sum'},
+ pct => { src => 'value/total', trans => [qw(percent)] },
+ last_total => { src => 'last_total' },
+ last_value => { src => 'last_value', agg => 'sum'},
+ last_pct => { src => 'last_value/last_total', trans => [qw(percent)] },
+ },
+ visible => [qw(name value pct last_value last_pct)],
+ filters => [qw()],
+ sort_cols => '-value',
+ sort_dir => '1',
+ innodb => '',
+ group_by => [qw(name)],
+ aggregate => 1,
+ },
+ deadlock_locks => {
+ capt => 'Deadlock Locks',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ mysql_thread_id => { src => 'mysql_thread_id' },
+ dl_txn_num => { src => 'dl_txn_num' },
+ lock_type => { src => 'lock_type' },
+ space_id => { src => 'space_id' },
+ page_no => { src => 'page_no' },
+ heap_no => { src => 'heap_no' },
+ n_bits => { src => 'n_bits' },
+ index => { src => 'index' },
+ db => { src => 'db' },
+ tbl => { src => 'table' },
+ lock_mode => { src => 'lock_mode' },
+ special => { src => 'special' },
+ insert_intention => { src => 'insert_intention' },
+ waiting => { src => 'waiting' },
+ },
+ visible => [ qw(cxn mysql_thread_id waiting lock_mode db tbl index special insert_intention)],
+ filters => [],
+ sort_cols => 'cxn mysql_thread_id',
+ sort_dir => '1',
+ innodb => 'dl',
+ group_by => [],
+ aggregate => 0,
+ },
+ deadlock_transactions => {
+ capt => 'Deadlock Transactions',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ active_secs => { src => 'active_secs' },
+ dl_txn_num => { src => 'dl_txn_num' },
+ has_read_view => { src => 'has_read_view' },
+ heap_size => { src => 'heap_size' },
+ host_and_domain => { src => 'hostname' },
+ hostname => { src => $exprs{Host} },
+ ip => { src => 'ip' },
+ lock_structs => { src => 'lock_structs' },
+ lock_wait_time => { src => 'lock_wait_time', trans => [ qw(secs_to_time) ] },
+ mysql_thread_id => { src => 'mysql_thread_id' },
+ os_thread_id => { src => 'os_thread_id' },
+ proc_no => { src => 'proc_no' },
+ query_id => { src => 'query_id' },
+ query_status => { src => 'query_status' },
+ query_text => { src => 'query_text', trans => [ qw(no_ctrl_char) ] },
+ row_locks => { src => 'row_locks' },
+ tables_in_use => { src => 'tables_in_use' },
+ tables_locked => { src => 'tables_locked' },
+ thread_decl_inside => { src => 'thread_decl_inside' },
+ thread_status => { src => 'thread_status' },
+ 'time' => { src => 'active_secs', trans => [ qw(secs_to_time) ] },
+ timestring => { src => 'timestring' },
+ txn_doesnt_see_ge => { src => 'txn_doesnt_see_ge' },
+ txn_id => { src => 'txn_id' },
+ txn_sees_lt => { src => 'txn_sees_lt' },
+ txn_status => { src => 'txn_status' },
+ truncates => { src => 'truncates' },
+ undo_log_entries => { src => 'undo_log_entries' },
+ user => { src => 'user' },
+ victim => { src => 'victim' },
+ wait_status => { src => 'lock_wait_status' },
+ },
+ visible => [ qw(cxn mysql_thread_id timestring user hostname victim time undo_log_entries lock_structs query_text)],
+ filters => [],
+ sort_cols => 'cxn mysql_thread_id',
+ sort_dir => '1',
+ innodb => 'dl',
+ group_by => [],
+ aggregate => 0,
+ },
+ explain => {
+ capt => 'EXPLAIN Results',
+ cust => {},
+ cols => {
+ part_id => { src => 'id' },
+ select_type => { src => 'select_type' },
+ tbl => { src => 'table' },
+ partitions => { src => 'partitions' },
+ scan_type => { src => 'type' },
+ possible_keys => { src => 'possible_keys' },
+ index => { src => 'key' },
+ key_len => { src => 'key_len' },
+ index_ref => { src => 'ref' },
+ num_rows => { src => 'rows' },
+ special => { src => 'extra' },
+ },
+ visible => [ qw(select_type tbl partitions scan_type possible_keys index key_len index_ref num_rows special)],
+ filters => [],
+ sort_cols => '',
+ sort_dir => '1',
+ innodb => '',
+ group_by => [],
+ aggregate => 0,
+ },
+ file_io_misc => {
+ capt => 'File I/O Misc',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ io_bytes_s => { src => 'IB_io_avg_bytes_s' },
+ io_flush_type => { src => 'IB_io_flush_type' },
+ io_fsyncs_s => { src => 'IB_io_fsyncs_s' },
+ io_reads_s => { src => 'IB_io_reads_s' },
+ io_writes_s => { src => 'IB_io_writes_s' },
+ os_file_reads => { src => 'IB_io_os_file_reads' },
+ os_file_writes => { src => 'IB_io_os_file_writes' },
+ os_fsyncs => { src => 'IB_io_os_fsyncs' },
+ },
+ visible => [ qw(cxn os_file_reads os_file_writes os_fsyncs io_reads_s io_writes_s io_bytes_s)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'io',
+ group_by => [],
+ aggregate => 0,
+ },
+ fk_error => {
+ capt => 'Foreign Key Error Info',
+ cust => {},
+ cols => {
+ timestring => { src => 'IB_fk_timestring' },
+ child_db => { src => 'IB_fk_child_db' },
+ child_table => { src => 'IB_fk_child_table' },
+ child_index => { src => 'IB_fk_child_index' },
+ fk_name => { src => 'IB_fk_fk_name' },
+ parent_db => { src => 'IB_fk_parent_db' },
+ parent_table => { src => 'IB_fk_parent_table' },
+ parent_col => { src => 'IB_fk_parent_col' },
+ parent_index => { src => 'IB_fk_parent_index' },
+ attempted_op => { src => 'IB_fk_attempted_op' },
+ },
+ visible => [ qw(timestring child_db child_table child_index parent_db parent_table parent_col parent_index fk_name attempted_op)],
+ filters => [],
+ sort_cols => '',
+ sort_dir => '1',
+ innodb => 'fk',
+ group_by => [],
+ aggregate => 0,
+ },
+ insert_buffers => {
+ capt => 'Insert Buffers',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ inserts => { src => 'IB_ib_inserts' },
+ merged_recs => { src => 'IB_ib_merged_recs' },
+ merges => { src => 'IB_ib_merges' },
+ size => { src => 'IB_ib_size' },
+ free_list_len => { src => 'IB_ib_free_list_len' },
+ seg_size => { src => 'IB_ib_seg_size' },
+ },
+ visible => [ qw(cxn inserts merged_recs merges size free_list_len seg_size)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'ib',
+ group_by => [],
+ aggregate => 0,
+ },
+ innodb_locks => {
+ capt => 'InnoDB Locks',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ db => { src => 'db' },
+ index => { src => 'index' },
+ insert_intention => { src => 'insert_intention' },
+ lock_mode => { src => 'lock_mode' },
+ lock_type => { src => 'lock_type' },
+ lock_wait_time => { src => 'lock_wait_time', trans => [ qw(secs_to_time) ] },
+ mysql_thread_id => { src => 'mysql_thread_id' },
+ n_bits => { src => 'n_bits' },
+ page_no => { src => 'page_no' },
+ space_id => { src => 'space_id' },
+ special => { src => 'special' },
+ tbl => { src => 'table' },
+ 'time' => { src => 'active_secs', hdr => 'Active', trans => [ qw(secs_to_time) ] },
+ txn_id => { src => 'txn_id' },
+ waiting => { src => 'waiting' },
+ },
+ visible => [ qw(cxn mysql_thread_id lock_type waiting lock_wait_time time lock_mode db tbl index insert_intention special)],
+ filters => [],
+ sort_cols => 'cxn -lock_wait_time',
+ sort_dir => '1',
+ innodb => 'tx',
+ colors => [
+ { col => 'lock_wait_time', op => '>', arg => 60, color => 'red' },
+ { col => 'lock_wait_time', op => '>', arg => 30, color => 'yellow' },
+ { col => 'lock_wait_time', op => '>', arg => 10, color => 'green' },
+ ],
+ group_by => [],
+ aggregate => 0,
+ },
+ innodb_transactions => {
+ capt => 'InnoDB Transactions',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ active_secs => { src => 'active_secs' },
+ has_read_view => { src => 'has_read_view' },
+ heap_size => { src => 'heap_size' },
+ hostname => { src => $exprs{Host} },
+ ip => { src => 'ip' },
+ wait_status => { src => 'lock_wait_status' },
+ lock_wait_time => { src => 'lock_wait_time', trans => [ qw(secs_to_time) ] },
+ lock_structs => { src => 'lock_structs' },
+ mysql_thread_id => { src => 'mysql_thread_id' },
+ os_thread_id => { src => 'os_thread_id' },
+ proc_no => { src => 'proc_no' },
+ query_id => { src => 'query_id' },
+ query_status => { src => 'query_status' },
+ query_text => { src => 'query_text', trans => [ qw(no_ctrl_char) ] },
+ txn_time_remain => { src => $exprs{TxnTimeRemain}, trans => [ qw(secs_to_time) ] },
+ row_locks => { src => 'row_locks' },
+ tables_in_use => { src => 'tables_in_use' },
+ tables_locked => { src => 'tables_locked' },
+ thread_decl_inside => { src => 'thread_decl_inside' },
+ thread_status => { src => 'thread_status' },
+ 'time' => { src => 'active_secs', trans => [ qw(secs_to_time) ], agg => 'sum' },
+ txn_doesnt_see_ge => { src => 'txn_doesnt_see_ge' },
+ txn_id => { src => 'txn_id' },
+ txn_sees_lt => { src => 'txn_sees_lt' },
+ txn_status => { src => 'txn_status', minw => 10, maxw => 10 },
+ undo_log_entries => { src => 'undo_log_entries' },
+ user => { src => 'user', maxw => 10 },
+ cnt => { src => 'mysql_thread_id', minw => 0 },
+ },
+ visible => [ qw(cxn cnt mysql_thread_id user hostname txn_status time undo_log_entries query_text)],
+ filters => [ qw( hide_self hide_inactive ) ],
+ sort_cols => '-active_secs txn_status cxn mysql_thread_id',
+ sort_dir => '1',
+ innodb => 'tx',
+ hide_caption => 1,
+ colors => [
+ { col => 'wait_status', op => 'eq', arg => 'LOCK WAIT', color => 'black on_red' },
+ { col => 'time', op => '>', arg => 600, color => 'red' },
+ { col => 'time', op => '>', arg => 300, color => 'yellow' },
+ { col => 'time', op => '>', arg => 60, color => 'green' },
+ { col => 'time', op => '>', arg => 30, color => 'cyan' },
+ { col => 'txn_status', op => 'eq', arg => 'not started', color => 'white' },
+ ],
+ group_by => [ qw(cxn txn_status) ],
+ aggregate => 0,
+ },
+ io_threads => {
+ capt => 'I/O Threads',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ thread => { src => 'thread' },
+ thread_purpose => { src => 'purpose' },
+ event_set => { src => 'event_set' },
+ thread_status => { src => 'state' },
+ },
+ visible => [ qw(cxn thread thread_purpose thread_status)],
+ filters => [ qw() ],
+ sort_cols => 'cxn thread',
+ sort_dir => '1',
+ innodb => 'io',
+ group_by => [],
+ aggregate => 0,
+ },
+ log_statistics => {
+ capt => 'Log Statistics',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ last_chkp => { src => 'IB_lg_last_chkp' },
+ log_flushed_to => { src => 'IB_lg_log_flushed_to' },
+ log_ios_done => { src => 'IB_lg_log_ios_done' },
+ log_ios_s => { src => 'IB_lg_log_ios_s' },
+ log_seq_no => { src => 'IB_lg_log_seq_no' },
+ pending_chkp_writes => { src => 'IB_lg_pending_chkp_writes' },
+ pending_log_writes => { src => 'IB_lg_pending_log_writes' },
+ },
+ visible => [ qw(cxn log_seq_no log_flushed_to last_chkp log_ios_done log_ios_s)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'lg',
+ group_by => [],
+ aggregate => 0,
+ },
+ master_status => {
+ capt => 'Master Status',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ binlog_do_db => { src => 'binlog_do_db' },
+ binlog_ignore_db => { src => 'binlog_ignore_db' },
+ master_file => { src => 'file' },
+ master_pos => { src => 'position' },
+ binlog_cache_overflow => { src => '(Binlog_cache_disk_use||0)/(Binlog_cache_use||1)', trans => [ qw(percent) ] },
+ },
+ visible => [ qw(cxn master_file master_pos binlog_cache_overflow)],
+ filters => [ qw(cxn_is_master) ],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => '',
+ group_by => [],
+ aggregate => 0,
+ },
+ pending_io => {
+ capt => 'Pending I/O',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ p_normal_aio_reads => { src => 'IB_io_pending_normal_aio_reads' },
+ p_aio_writes => { src => 'IB_io_pending_aio_writes' },
+ p_ibuf_aio_reads => { src => 'IB_io_pending_ibuf_aio_reads' },
+ p_sync_ios => { src => 'IB_io_pending_sync_ios' },
+ p_buf_pool_flushes => { src => 'IB_io_pending_buffer_pool_flushes' },
+ p_log_flushes => { src => 'IB_io_pending_log_flushes' },
+ p_log_ios => { src => 'IB_io_pending_log_ios' },
+ p_preads => { src => 'IB_io_pending_preads' },
+ p_pwrites => { src => 'IB_io_pending_pwrites' },
+ },
+ visible => [ qw(cxn p_normal_aio_reads p_aio_writes p_ibuf_aio_reads p_sync_ios p_log_flushes p_log_ios)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'io',
+ group_by => [],
+ aggregate => 0,
+ },
+ open_tables => {
+ capt => 'Open Tables',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ db => { src => 'database' },
+ tbl => { src => 'table' },
+ num_times_open => { src => 'in_use' },
+ is_name_locked => { src => 'name_locked' },
+ },
+ visible => [ qw(cxn db tbl num_times_open is_name_locked)],
+ filters => [ qw(table_is_open) ],
+ sort_cols => '-num_times_open cxn db tbl',
+ sort_dir => '1',
+ innodb => '',
+ group_by => [],
+ aggregate => 0,
+ },
+ page_statistics => {
+ capt => 'Page Statistics',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ pages_read => { src => 'IB_bp_pages_read' },
+ pages_written => { src => 'IB_bp_pages_written' },
+ pages_created => { src => 'IB_bp_pages_created' },
+ page_reads_sec => { src => 'IB_bp_page_reads_sec' },
+ page_writes_sec => { src => 'IB_bp_page_writes_sec' },
+ page_creates_sec => { src => 'IB_bp_page_creates_sec' },
+ },
+ visible => [ qw(cxn pages_read pages_written pages_created page_reads_sec page_writes_sec page_creates_sec)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'bp',
+ group_by => [],
+ aggregate => 0,
+ },
+ processlist => {
+ capt => 'MySQL Process List',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn', minw => 6, maxw => 10 },
+ mysql_thread_id => { src => 'id', minw => 6, maxw => 0 },
+ user => { src => 'user', minw => 5, maxw => 8 },
+ hostname => { src => $exprs{Host}, minw => 13, maxw => 8, },
+ port => { src => $exprs{Port}, minw => 0, maxw => 0, },
+ host_and_port => { src => 'host', minw => 0, maxw => 0 },
+ db => { src => 'db', minw => 6, maxw => 12 },
+ cmd => { src => 'command', minw => 5, maxw => 0 },
+ time => { src => 'time', minw => 5, maxw => 0, trans => [ qw(secs_to_time) ], agg => 'sum' },
+ state => { src => 'state', minw => 0, maxw => 0 },
+ info => { src => 'info', minw => 0, maxw => 0, trans => [ qw(no_ctrl_char) ] },
+ cnt => { src => 'id', minw => 0, maxw => 0 },
+ },
+ visible => [ qw(cxn cmd cnt mysql_thread_id user hostname db time info)],
+ filters => [ qw(hide_self hide_inactive hide_slave_io) ],
+ sort_cols => '-time cxn hostname mysql_thread_id',
+ sort_dir => '1',
+ innodb => '',
+ hide_caption => 1,
+ colors => [
+ { col => 'state', op => 'eq', arg => 'Locked', color => 'black on_red' },
+ { col => 'cmd', op => 'eq', arg => 'Sleep', color => 'white' },
+ { col => 'user', op => 'eq', arg => 'system user', color => 'white' },
+ { col => 'cmd', op => 'eq', arg => 'Connect', color => 'white' },
+ { col => 'cmd', op => 'eq', arg => 'Binlog Dump', color => 'white' },
+ { col => 'time', op => '>', arg => 600, color => 'red' },
+ { col => 'time', op => '>', arg => 120, color => 'yellow' },
+ { col => 'time', op => '>', arg => 60, color => 'green' },
+ { col => 'time', op => '>', arg => 30, color => 'cyan' },
+ ],
+ group_by => [qw(cxn cmd)],
+ aggregate => 0,
+ },
+
+ # TODO: some more columns:
+ # kb_used=hdr='BufUsed' minw='0' num='0' src='percent(1 - ((Key_blocks_unused * key_cache_block_size) / (key_buffer_size||1)))' dec='0' trans='' tbl='q_header' just='-' user='1' maxw='0' label='User-defined'
+ # retries=hdr='Retries' minw='0' num='0' src='Slave_retried_transactions' dec='0' trans='' tbl='slave_sql_status' just='-' user='1' maxw='0' label='User-defined'
+ # thd=hdr='Thd' minw='0' num='0' src='Threads_connected' dec='0' trans='' tbl='slave_sql_status' just='-' user='1' maxw='0' label='User-defined'
+
+ q_header => {
+ capt => 'Q-mode Header',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ questions => { src => 'Questions' },
+ qps => { src => 'Questions/Uptime_hires', dec => 1, trans => [qw(shorten)] },
+ load => { src => $exprs{ServerLoad}, dec => 1, trans => [qw(shorten)] },
+ slow => { src => 'Slow_queries', dec => 1, trans => [qw(shorten)] },
+ q_cache_hit => { src => $exprs{QcacheHitRatio}, dec => 1, trans => [qw(percent)] },
+ key_buffer_hit => { src => '1-(Key_reads/(Key_read_requests||1))', dec => 1, trans => [qw(percent)] },
+ bps_in => { src => 'Bytes_received/Uptime_hires', dec => 1, trans => [qw(shorten)] },
+ bps_out => { src => 'Bytes_sent/Uptime_hires', dec => 1, trans => [qw(shorten)] },
+ when => { src => 'when' },
+ },
+ visible => [ qw(cxn when load qps slow q_cache_hit key_buffer_hit bps_in bps_out)],
+ filters => [],
+ sort_cols => 'when cxn',
+ sort_dir => '1',
+ innodb => '',
+ hide_caption => 1,
+ group_by => [],
+ aggregate => 0,
+ },
+ row_operations => {
+ capt => 'InnoDB Row Operations',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ num_inserts => { src => 'IB_ro_num_rows_ins' },
+ num_updates => { src => 'IB_ro_num_rows_upd' },
+ num_reads => { src => 'IB_ro_num_rows_read' },
+ num_deletes => { src => 'IB_ro_num_rows_del' },
+ num_inserts_sec => { src => 'IB_ro_ins_sec' },
+ num_updates_sec => { src => 'IB_ro_upd_sec' },
+ num_reads_sec => { src => 'IB_ro_read_sec' },
+ num_deletes_sec => { src => 'IB_ro_del_sec' },
+ },
+ visible => [ qw(cxn num_inserts num_updates num_reads num_deletes num_inserts_sec
+ num_updates_sec num_reads_sec num_deletes_sec)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'ro',
+ group_by => [],
+ aggregate => 0,
+ },
+ row_operation_misc => {
+ capt => 'Row Operation Misc',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ queries_in_queue => { src => 'IB_ro_queries_in_queue' },
+ queries_inside => { src => 'IB_ro_queries_inside' },
+ read_views_open => { src => 'IB_ro_read_views_open' },
+ main_thread_id => { src => 'IB_ro_main_thread_id' },
+ main_thread_proc_no => { src => 'IB_ro_main_thread_proc_no' },
+ main_thread_state => { src => 'IB_ro_main_thread_state' },
+ num_res_ext => { src => 'IB_ro_n_reserved_extents' },
+ },
+ visible => [ qw(cxn queries_in_queue queries_inside read_views_open main_thread_state)],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'ro',
+ group_by => [],
+ aggregate => 0,
+ },
+ semaphores => {
+ capt => 'InnoDB Semaphores',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ mutex_os_waits => { src => 'IB_sm_mutex_os_waits' },
+ mutex_spin_rounds => { src => 'IB_sm_mutex_spin_rounds' },
+ mutex_spin_waits => { src => 'IB_sm_mutex_spin_waits' },
+ reservation_count => { src => 'IB_sm_reservation_count' },
+ rw_excl_os_waits => { src => 'IB_sm_rw_excl_os_waits' },
+ rw_excl_spins => { src => 'IB_sm_rw_excl_spins' },
+ rw_shared_os_waits => { src => 'IB_sm_rw_shared_os_waits' },
+ rw_shared_spins => { src => 'IB_sm_rw_shared_spins' },
+ signal_count => { src => 'IB_sm_signal_count' },
+ wait_array_size => { src => 'IB_sm_wait_array_size' },
+ },
+ visible => [ qw(cxn mutex_os_waits mutex_spin_waits mutex_spin_rounds
+ rw_excl_os_waits rw_excl_spins rw_shared_os_waits rw_shared_spins
+ signal_count reservation_count )],
+ filters => [],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => 'sm',
+ group_by => [],
+ aggregate => 0,
+ },
+ slave_io_status => {
+ capt => 'Slave I/O Status',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ connect_retry => { src => 'connect_retry' },
+ master_host => { src => 'master_host', hdr => 'Master'},
+ master_log_file => { src => 'master_log_file', hdr => 'File' },
+ master_port => { src => 'master_port' },
+ master_ssl_allowed => { src => 'master_ssl_allowed' },
+ master_ssl_ca_file => { src => 'master_ssl_ca_file' },
+ master_ssl_ca_path => { src => 'master_ssl_ca_path' },
+ master_ssl_cert => { src => 'master_ssl_cert' },
+ master_ssl_cipher => { src => 'master_ssl_cipher' },
+ master_ssl_key => { src => 'master_ssl_key' },
+ master_user => { src => 'master_user' },
+ read_master_log_pos => { src => 'read_master_log_pos', hdr => 'Pos' },
+ relay_log_size => { src => 'relay_log_space', trans => [qw(shorten)] },
+ slave_io_running => { src => 'slave_io_running', hdr => 'On?' },
+ slave_io_state => { src => 'slave_io_state', hdr => 'State' },
+ },
+ visible => [ qw(cxn master_host slave_io_running master_log_file relay_log_size read_master_log_pos slave_io_state)],
+ filters => [ qw( cxn_is_slave ) ],
+ sort_cols => 'slave_io_running cxn',
+ colors => [
+ { col => 'slave_io_running', op => 'ne', arg => 'Yes', color => 'black on_red' },
+ ],
+ sort_dir => '1',
+ innodb => '',
+ group_by => [],
+ aggregate => 0,
+ },
+ slave_sql_status => {
+ capt => 'Slave SQL Status',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ exec_master_log_pos => { src => 'exec_master_log_pos', hdr => 'Master Pos' },
+ last_errno => { src => 'last_errno' },
+ last_error => { src => 'last_error' },
+ master_host => { src => 'master_host', hdr => 'Master' },
+ relay_log_file => { src => 'relay_log_file' },
+ relay_log_pos => { src => 'relay_log_pos' },
+ relay_log_size => { src => 'relay_log_space', trans => [qw(shorten)] },
+ relay_master_log_file => { src => 'relay_master_log_file', hdr => 'Master File' },
+ replicate_do_db => { src => 'replicate_do_db' },
+ replicate_do_table => { src => 'replicate_do_table' },
+ replicate_ignore_db => { src => 'replicate_ignore_db' },
+ replicate_ignore_table => { src => 'replicate_ignore_table' },
+ replicate_wild_do_table => { src => 'replicate_wild_do_table' },
+ replicate_wild_ignore_table => { src => 'replicate_wild_ignore_table' },
+ skip_counter => { src => 'skip_counter' },
+ slave_sql_running => { src => 'slave_sql_running', hdr => 'On?' },
+ until_condition => { src => 'until_condition' },
+ until_log_file => { src => 'until_log_file' },
+ until_log_pos => { src => 'until_log_pos' },
+ time_behind_master => { src => 'seconds_behind_master', trans => [ qw(secs_to_time) ] },
+ bytes_behind_master => { src => 'master_log_file && master_log_file eq relay_master_log_file ? read_master_log_pos - exec_master_log_pos : 0', trans => [qw(shorten)] },
+ slave_catchup_rate => { src => $exprs{SlaveCatchupRate}, trans => [ qw(set_precision) ] },
+ slave_open_temp_tables => { src => 'Slave_open_temp_tables' },
+ },
+ visible => [ qw(cxn master_host slave_sql_running time_behind_master slave_catchup_rate slave_open_temp_tables relay_log_pos last_error)],
+ filters => [ qw( cxn_is_slave ) ],
+ sort_cols => 'slave_sql_running cxn',
+ sort_dir => '1',
+ innodb => '',
+ colors => [
+ { col => 'slave_sql_running', op => 'ne', arg => 'Yes', color => 'black on_red' },
+ { col => 'time_behind_master', op => '>', arg => 600, color => 'red' },
+ { col => 'time_behind_master', op => '>', arg => 60, color => 'yellow' },
+ { col => 'time_behind_master', op => '==', arg => 0, color => 'white' },
+ ],
+ group_by => [],
+ aggregate => 0,
+ },
+ t_header => {
+ capt => 'T-Mode Header',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ dirty_bufs => { src => $exprs{DirtyBufs}, trans => [qw(percent)] },
+ history_list_len => { src => 'IB_tx_history_list_len' },
+ lock_structs => { src => 'IB_tx_num_lock_structs' },
+ num_txns => { src => $exprs{NumTxns} },
+ max_txn => { src => $exprs{MaxTxnTime}, trans => [qw(secs_to_time)] },
+ undo_for => { src => 'IB_tx_purge_undo_for' },
+ used_bufs => { src => $exprs{BufPoolFill}, trans => [qw(percent)]},
+ versions => { src => $exprs{OldVersions} },
+ },
+ visible => [ qw(cxn history_list_len versions undo_for dirty_bufs used_bufs num_txns max_txn lock_structs)],
+ filters => [ ],
+ sort_cols => 'cxn',
+ sort_dir => '1',
+ innodb => '',
+ colors => [],
+ hide_caption => 1,
+ group_by => [],
+ aggregate => 0,
+ },
+ var_status => {
+ capt => 'Variables & Status',
+ cust => {},
+ cols => {}, # Generated from current varset
+ visible => [], # Generated from current varset
+ filters => [],
+ sort_cols => '',
+ sort_dir => 1,
+ innodb => '',
+ temp => 1, # Do not persist to config file.
+ hide_caption => 1,
+ pivot => 0,
+ group_by => [],
+ aggregate => 0,
+ },
+ wait_array => {
+ capt => 'InnoDB Wait Array',
+ cust => {},
+ cols => {
+ cxn => { src => 'cxn' },
+ thread => { src => 'thread' },
+ waited_at_filename => { src => 'waited_at_filename' },
+ waited_at_line => { src => 'waited_at_line' },
+ 'time' => { src => 'waited_secs', trans => [ qw(secs_to_time) ] },
+ request_type => { src => 'request_type' },
+ lock_mem_addr => { src => 'lock_mem_addr' },
+ lock_cfile_name => { src => 'lock_cfile_name' },
+ lock_cline => { src => 'lock_cline' },
+ writer_thread => { src => 'writer_thread' },
+ writer_lock_mode => { src => 'writer_lock_mode' },
+ num_readers => { src => 'num_readers' },
+ lock_var => { src => 'lock_var' },
+ waiters_flag => { src => 'waiters_flag' },
+ last_s_file_name => { src => 'last_s_file_name' },
+ last_s_line => { src => 'last_s_line' },
+ last_x_file_name => { src => 'last_x_file_name' },
+ last_x_line => { src => 'last_x_line' },
+ cell_waiting => { src => 'cell_waiting' },
+ cell_event_set => { src => 'cell_event_set' },
+ },
+ visible => [ qw(cxn thread time waited_at_filename waited_at_line request_type num_readers lock_var waiters_flag cell_waiting cell_event_set)],
+ filters => [],
+ sort_cols => 'cxn -time',
+ sort_dir => '1',
+ innodb => 'sm',
+ group_by => [],
+ aggregate => 0,
+ },
+);
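+
+# All of the entries above share one shape. The commented sketch below is
+# illustrative only (it is not a real innotop table): 'cols' maps column
+# names to a 'src' expression plus optional 'trans' transformations and an
+# 'agg' aggregate, 'visible' lists the displayed columns in order,
+# 'sort_cols' names the sort columns (a leading '-' appears to mean
+# descending), and 'innodb' seems to name the SHOW INNODB STATUS section the
+# data is parsed from.
+#
+#    example_table => {
+#       capt      => 'Example Table',
+#       cust      => {},
+#       cols      => {
+#          cxn  => { src => 'cxn' },
+#          time => { src => 'active_secs', trans => [qw(secs_to_time)], agg => 'sum' },
+#       },
+#       visible   => [ qw(cxn time) ],
+#       filters   => [],
+#       sort_cols => '-time cxn',
+#       sort_dir  => '1',
+#       innodb    => 'tx',
+#       group_by  => [],
+#       aggregate => 0,
+#    },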
+
+# Initialize %tbl_meta from %columns and do some checks.
+foreach my $table_name ( keys %tbl_meta ) {
+ my $table = $tbl_meta{$table_name};
+ my $cols = $table->{cols};
+
+ foreach my $col_name ( keys %$cols ) {
+ my $col_def = $table->{cols}->{$col_name};
+ die "I can't find a column named '$col_name' for '$table_name'" unless $columns{$col_name};
+ $columns{$col_name}->{referenced} = 1;
+
+ foreach my $prop ( keys %col_props ) {
+         # Fill in any property the column definition leaves unset, using %columns when it defines the property and the default in %col_props otherwise.
+ if ( !$col_def->{$prop} ) {
+ $col_def->{$prop}
+ = defined($columns{$col_name}->{$prop})
+ ? $columns{$col_name}->{$prop}
+ : $col_props{$prop};
+ }
+ }
+
+ # Ensure transformations and aggregate functions are valid
+ die "Unknown aggregate function '$col_def->{agg}' "
+ . "for column '$col_name' in table '$table_name'"
+ unless exists $agg_funcs{$col_def->{agg}};
+ foreach my $trans ( @{$col_def->{trans}} ) {
+ die "Unknown transformation '$trans' "
+ . "for column '$col_name' in table '$table_name'"
+ unless exists $trans_funcs{$trans};
+ }
+ }
+
+ # Ensure each column in visible and group_by exists in cols
+ foreach my $place ( qw(visible group_by) ) {
+ foreach my $col_name ( @{$table->{$place}} ) {
+ if ( !exists $cols->{$col_name} ) {
+ die "Column '$col_name' is listed in '$place' for '$table_name', but doesn't exist";
+ }
+ }
+ }
+
+ # Compile sort and color subroutines
+ $table->{sort_func} = make_sort_func($table);
+ $table->{color_func} = make_color_func($table);
+}
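+
+# A worked example of the defaulting above (illustrative, not executed): a
+# column declared only as  cxn => { src => 'cxn' }  ends up with every
+# property in %col_props populated -- values come from the global %columns
+# entry for 'cxn' where that entry defines them, and from the generic
+# defaults in %col_props otherwise. The effect is roughly
+#
+#    $cols->{cxn} = { %col_props, %{ $columns{cxn} }, %{ $cols->{cxn} } };
+#
+# except that the loop also replaces values that are defined but false
+# (0 or ''), which a plain hash merge would keep.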
+
+# Development sanity check: fail loudly if %columns defines any column that no table references.
+{
+ my @unused_cols = grep { !$columns{$_}->{referenced} } sort keys %columns;
+ if ( @unused_cols ) {
+ die "The following columns are not used: "
+ . join(' ', @unused_cols);
+ }
+}
+
+# ###########################################################################
+# Operating modes {{{3
+# ###########################################################################
+my %modes = (
+ B => {
+ hdr => 'InnoDB Buffers',
+ cust => {},
+ note => 'Shows buffer info from InnoDB',
+ action_for => {
+ i => {
+ action => sub { toggle_config('status_inc') },
+ label => 'Toggle incremental status display',
+ },
+ },
+ display_sub => \&display_B,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(buffer_pool page_statistics insert_buffers adaptive_hash_index)],
+ visible_tables => [qw(buffer_pool page_statistics insert_buffers adaptive_hash_index)],
+ },
+ C => {
+ hdr => 'Command Summary',
+ cust => {},
+ note => 'Shows relative magnitude of variables',
+ action_for => {
+ s => {
+ action => sub { get_config_interactive('cmd_filter') },
+ label => 'Choose variable prefix',
+ },
+ },
+ display_sub => \&display_C,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(cmd_summary)],
+ visible_tables => [qw(cmd_summary)],
+ },
+ D => {
+ hdr => 'InnoDB Deadlocks',
+ cust => {},
+ note => 'View InnoDB deadlock information',
+ action_for => {
+ c => {
+ action => sub { edit_table('deadlock_transactions') },
+ label => 'Choose visible columns',
+ },
+ w => {
+ action => \&create_deadlock,
+ label => 'Wipe deadlock status info by creating a deadlock',
+ },
+ },
+ display_sub => \&display_D,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(deadlock_transactions deadlock_locks)],
+ visible_tables => [qw(deadlock_transactions deadlock_locks)],
+ },
+ F => {
+ hdr => 'InnoDB FK Err',
+ cust => {},
+ note => 'View the latest InnoDB foreign key error',
+ action_for => {},
+ display_sub => \&display_F,
+ connections => [],
+ server_group => '',
+ one_connection => 1,
+ tables => [qw(fk_error)],
+ visible_tables => [qw(fk_error)],
+ },
+ I => {
+ hdr => 'InnoDB I/O Info',
+ cust => {},
+ note => 'Shows I/O info (i/o, log...) from InnoDB',
+ action_for => {
+ i => {
+ action => sub { toggle_config('status_inc') },
+ label => 'Toggle incremental status display',
+ },
+ },
+ display_sub => \&display_I,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(io_threads pending_io file_io_misc log_statistics)],
+ visible_tables => [qw(io_threads pending_io file_io_misc log_statistics)],
+ },
+ L => {
+ hdr => 'Locks',
+ cust => {},
+ note => 'Shows transaction locks',
+ action_for => {
+ a => {
+ action => sub { send_cmd_to_servers('CREATE TABLE IF NOT EXISTS test.innodb_lock_monitor(a int) ENGINE=InnoDB', 0, '', []); },
+ label => 'Start the InnoDB Lock Monitor',
+ },
+ o => {
+ action => sub { send_cmd_to_servers('DROP TABLE IF EXISTS test.innodb_lock_monitor', 0, '', []); },
+ label => 'Stop the InnoDB Lock Monitor',
+ },
+ },
+ display_sub => \&display_L,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(innodb_locks)],
+ visible_tables => [qw(innodb_locks)],
+ },
+ M => {
+ hdr => 'Replication Status',
+ cust => {},
+ note => 'Shows replication (master and slave) status',
+ action_for => {
+ a => {
+ action => sub { send_cmd_to_servers('START SLAVE', 0, 'START SLAVE SQL_THREAD UNTIL MASTER_LOG_FILE = ?, MASTER_LOG_POS = ?', []); },
+ label => 'Start slave(s)',
+ },
+ i => {
+ action => sub { toggle_config('status_inc') },
+ label => 'Toggle incremental status display',
+ },
+ o => {
+ action => sub { send_cmd_to_servers('STOP SLAVE', 0, '', []); },
+ label => 'Stop slave(s)',
+ },
+ b => {
+ action => sub { purge_master_logs() },
+ label => 'Purge unused master logs',
+ },
+ },
+ display_sub => \&display_M,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(slave_sql_status slave_io_status master_status)],
+ visible_tables => [qw(slave_sql_status slave_io_status master_status)],
+ },
+ O => {
+ hdr => 'Open Tables',
+ cust => {},
+ note => 'Shows open tables in MySQL',
+ action_for => {
+ r => {
+ action => sub { reverse_sort('open_tables'); },
+ label => 'Reverse sort order',
+ },
+ s => {
+ action => sub { choose_sort_cols('open_tables'); },
+ label => "Choose sort column",
+ },
+ },
+ display_sub => \&display_O,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(open_tables)],
+ visible_tables => [qw(open_tables)],
+ },
+ Q => {
+ hdr => 'Query List',
+ cust => {},
+ note => 'Shows queries from SHOW FULL PROCESSLIST',
+ action_for => {
+ a => {
+ action => sub { toggle_filter('processlist', 'hide_self') },
+ label => 'Toggle the innotop process',
+ },
+ c => {
+ action => sub { edit_table('processlist') },
+ label => 'Choose visible columns',
+ },
+ e => {
+ action => sub { analyze_query('e'); },
+ label => "Explain a thread's query",
+ },
+ f => {
+ action => sub { analyze_query('f'); },
+ label => "Show a thread's full query",
+ },
+ h => {
+ action => sub { toggle_visible_table('Q', 'q_header') },
+ label => 'Toggle the header on and off',
+ },
+ i => {
+ action => sub { toggle_filter('processlist', 'hide_inactive') },
+ label => 'Toggle idle processes',
+ },
+ k => {
+ action => sub { kill_query('CONNECTION') },
+ label => "Kill a query's connection",
+ },
+ r => {
+ action => sub { reverse_sort('processlist'); },
+ label => 'Reverse sort order',
+ },
+ s => {
+ action => sub { choose_sort_cols('processlist'); },
+ label => "Change the display's sort column",
+ },
+ x => {
+ action => sub { kill_query('QUERY') },
+ label => "Kill a query",
+ },
+ },
+ display_sub => \&display_Q,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(q_header processlist)],
+ visible_tables => [qw(q_header processlist)],
+ },
+ R => {
+ hdr => 'InnoDB Row Ops',
+ cust => {},
+ note => 'Shows InnoDB row operation and semaphore info',
+ action_for => {
+ i => {
+ action => sub { toggle_config('status_inc') },
+ label => 'Toggle incremental status display',
+ },
+ },
+ display_sub => \&display_R,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(row_operations row_operation_misc semaphores wait_array)],
+ visible_tables => [qw(row_operations row_operation_misc semaphores wait_array)],
+ },
+ S => {
+ hdr => 'Variables & Status',
+ cust => {},
+ note => 'Shows query load statistics a la vmstat',
+ action_for => {
+ '>' => {
+ action => sub { switch_var_set('S_set', 1) },
+ label => 'Switch to next variable set',
+ },
+ '<' => {
+ action => sub { switch_var_set('S_set', -1) },
+ label => 'Switch to prev variable set',
+ },
+ c => {
+ action => sub {
+ choose_var_set('S_set');
+ start_S_mode();
+ },
+ label => "Choose which set to display",
+ },
+ e => {
+ action => \&edit_current_var_set,
+ label => 'Edit the current set of variables',
+ },
+ i => {
+ action => sub { $clear_screen_sub->(); toggle_config('status_inc') },
+ label => 'Toggle incremental status display',
+ },
+ '-' => {
+ action => sub { set_display_precision(-1) },
+ label => 'Decrease fractional display precision',
+ },
+ '+' => {
+ action => sub { set_display_precision(1) },
+ label => 'Increase fractional display precision',
+ },
+ g => {
+ action => sub { set_s_mode('g') },
+ label => 'Switch to graph (tload) view',
+ },
+ s => {
+ action => sub { set_s_mode('s') },
+ label => 'Switch to standard (vmstat) view',
+ },
+ v => {
+ action => sub { set_s_mode('v') },
+ label => 'Switch to pivoted view',
+ },
+ },
+ display_sub => \&display_S,
+ no_clear_screen => 1,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(var_status)],
+ visible_tables => [qw(var_status)],
+ },
+ T => {
+ hdr => 'InnoDB Txns',
+ cust => {},
+ note => 'Shows InnoDB transactions in top-like format',
+ action_for => {
+ a => {
+ action => sub { toggle_filter('innodb_transactions', 'hide_self') },
+ label => 'Toggle the innotop process',
+ },
+ c => {
+ action => sub { edit_table('innodb_transactions') },
+ label => 'Choose visible columns',
+ },
+ e => {
+ action => sub { analyze_query('e'); },
+ label => "Explain a thread's query",
+ },
+ f => {
+ action => sub { analyze_query('f'); },
+ label => "Show a thread's full query",
+ },
+ h => {
+ action => sub { toggle_visible_table('T', 't_header') },
+ label => 'Toggle the header on and off',
+ },
+ i => {
+ action => sub { toggle_filter('innodb_transactions', 'hide_inactive') },
+ label => 'Toggle inactive transactions',
+ },
+ k => {
+ action => sub { kill_query('CONNECTION') },
+ label => "Kill a transaction's connection",
+ },
+ r => {
+ action => sub { reverse_sort('innodb_transactions'); },
+ label => 'Reverse sort order',
+ },
+ s => {
+ action => sub { choose_sort_cols('innodb_transactions'); },
+ label => "Change the display's sort column",
+ },
+ x => {
+ action => sub { kill_query('QUERY') },
+ label => "Kill a query",
+ },
+ },
+ display_sub => \&display_T,
+ connections => [],
+ server_group => '',
+ one_connection => 0,
+ tables => [qw(t_header innodb_transactions)],
+ visible_tables => [qw(t_header innodb_transactions)],
+ },
+);
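+
+# How a mode record is used (sketch; the display_* subroutines are defined
+# later in this file): once per refresh the main loop calls the current
+# mode's display_sub, which renders the tables named in visible_tables, and
+# any key in the mode's action_for takes precedence over the same key in the
+# global %action_for defined below. Roughly, per refresh:
+#
+#    $modes{$mode}->{display_sub}->();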
+
+# ###########################################################################
+# Global key mappings {{{3
+# Keyed on a single character, which is read from the keyboard. Uppercase
+# letters switch modes. Lowercase letters access commands when in a mode.
+# These can be overridden by action_for in %modes.
+# ###########################################################################
+my %action_for = (
+ '$' => {
+ action => \&edit_configuration,
+ label => 'Edit configuration settings',
+ },
+ '?' => {
+ action => \&display_help,
+ label => 'Show help',
+ },
+ '!' => {
+ action => \&display_license,
+ label => 'Show license and warranty',
+ },
+ '^' => {
+ action => \&edit_table,
+ label => "Edit the displayed table(s)",
+ },
+ '#' => {
+ action => \&choose_server_groups,
+ label => 'Select/create server groups',
+ },
+ '@' => {
+ action => \&choose_servers,
+ label => 'Select/create server connections',
+ },
+ '/' => {
+ action => \&add_quick_filter,
+ label => 'Quickly filter what you see',
+ },
+ '\\' => {
+ action => \&clear_quick_filters,
+ label => 'Clear quick-filters',
+ },
+ '%' => {
+ action => \&choose_filters,
+ label => 'Choose and edit table filters',
+ },
+ "\t" => {
+ action => \&next_server_group,
+ label => 'Switch to the next server group',
+ key => 'TAB',
+ },
+ '=' => {
+ action => \&toggle_aggregate,
+ label => 'Toggle aggregation',
+ },
+ # TODO: can these be auto-generated from %modes?
+ B => {
+ action => sub { switch_mode('B') },
+ label => '',
+ },
+ C => {
+ action => sub { switch_mode('C') },
+ label => '',
+ },
+ D => {
+ action => sub { switch_mode('D') },
+ label => '',
+ },
+ F => {
+ action => sub { switch_mode('F') },
+ label => '',
+ },
+ I => {
+ action => sub { switch_mode('I') },
+ label => '',
+ },
+ L => {
+ action => sub { switch_mode('L') },
+ label => '',
+ },
+ M => {
+ action => sub { switch_mode('M') },
+ label => '',
+ },
+ O => {
+ action => sub { switch_mode('O') },
+ label => '',
+ },
+ Q => {
+ action => sub { switch_mode('Q') },
+ label => '',
+ },
+ R => {
+ action => sub { switch_mode('R') },
+ label => '',
+ },
+ S => {
+ action => \&start_S_mode,
+ label => '',
+ },
+ T => {
+ action => sub { switch_mode('T') },
+ label => '',
+ },
+ d => {
+ action => sub { get_config_interactive('interval') },
+ label => 'Change refresh interval',
+ },
+ n => { action => \&next_server, label => 'Switch to the next connection' },
+ p => { action => \&pause, label => 'Pause innotop', },
+ q => { action => \&finish, label => 'Quit innotop', },
+);
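+
+# Sketch of how these mappings are presumably dispatched by do_key_action()
+# (defined elsewhere; the real implementation may differ): the key read
+# during the sleep interval is looked up first in the current mode's
+# action_for, then here, and the matching 'action' coderef is invoked.
+#
+#    if ( defined $char ) {
+#       my $handler = $modes{$mode}->{action_for}->{$char} || $action_for{$char};
+#       $handler->{action}->() if $handler;
+#    }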
+
+# ###########################################################################
+# Sleep times after certain statements {{{3
+# ###########################################################################
+my %stmt_sleep_time_for = ();
+
+# ###########################################################################
+# Config editor key mappings {{{3
+# ###########################################################################
+my %cfg_editor_action = (
+ c => {
+ note => 'Edit columns, etc in the displayed table(s)',
+ func => \&edit_table,
+ },
+ g => {
+ note => 'Edit general configuration',
+ func => \&edit_configuration_variables,
+ },
+ k => {
+ note => 'Edit row-coloring rules',
+ func => \&edit_color_rules,
+ },
+ p => {
+ note => 'Manage plugins',
+ func => \&edit_plugins,
+ },
+ s => {
+ note => 'Edit server groups',
+ func => \&edit_server_groups,
+ },
+ S => {
+ note => 'Edit SQL statement sleep delays',
+ func => \&edit_stmt_sleep_times,
+ },
+ t => {
+ note => 'Choose which table(s) to display in this mode',
+ func => \&choose_mode_tables,
+ },
+);
+
+# ###########################################################################
+# Color editor key mappings {{{3
+# ###########################################################################
+my %color_editor_action = (
+ n => {
+ note => 'Create a new color rule',
+ func => sub {
+ my ( $tbl, $idx ) = @_;
+ my $meta = $tbl_meta{$tbl};
+
+ $clear_screen_sub->();
+ my $col;
+ do {
+ $col = prompt_list(
+ 'Choose the target column for the rule',
+ '',
+ sub { return keys %{$meta->{cols}} },
+ { map { $_ => $meta->{cols}->{$_}->{label} } keys %{$meta->{cols}} });
+ } while ( !$col );
+ ( $col ) = grep { $_ } split(/\W+/, $col);
+ return $idx unless $col && exists $meta->{cols}->{$col};
+
+ $clear_screen_sub->();
+ my $op;
+ do {
+ $op = prompt_list(
+ 'Choose the comparison operator for the rule',
+ '',
+ sub { return keys %comp_ops },
+ { map { $_ => $comp_ops{$_} } keys %comp_ops } );
+ } until ( $op );
+ $op =~ s/\s+//g;
+ return $idx unless $op && exists $comp_ops{$op};
+
+ my $arg;
+ do {
+ $arg = prompt('Specify an argument for the comparison');
+ } until defined $arg;
+
+ my $color;
+ do {
+ $color = prompt_list(
+ 'Choose the color(s) the row should be when the rule matches',
+ '',
+ sub { return keys %ansicolors },
+ { map { $_ => $_ } keys %ansicolors } );
+ } until defined $color;
+ $color = join(' ', unique(grep { exists $ansicolors{$_} } split(/\W+/, $color)));
+ return $idx unless $color;
+
+ push @{$tbl_meta{$tbl}->{colors}}, {
+ col => $col,
+ op => $op,
+ arg => $arg,
+ color => $color
+ };
+ $tbl_meta{$tbl}->{cust}->{colors} = 1;
+
+ return $idx;
+ },
+ },
+ d => {
+ note => 'Remove the selected rule',
+ func => sub {
+ my ( $tbl, $idx ) = @_;
+ my @rules = @{ $tbl_meta{$tbl}->{colors} };
+ return 0 unless @rules > 0 && $idx < @rules && $idx >= 0;
+ splice(@{$tbl_meta{$tbl}->{colors}}, $idx, 1);
+ $tbl_meta{$tbl}->{cust}->{colors} = 1;
+ return $idx == @rules ? $#rules : $idx;
+ },
+ },
+ j => {
+ note => 'Move highlight down one',
+ func => sub {
+ my ( $tbl, $idx ) = @_;
+ my $num_rules = scalar @{$tbl_meta{$tbl}->{colors}};
+ return ($idx + 1) % $num_rules;
+ },
+ },
+ k => {
+ note => 'Move highlight up one',
+ func => sub {
+ my ( $tbl, $idx ) = @_;
+ my $num_rules = scalar @{$tbl_meta{$tbl}->{colors}};
+ return ($idx - 1) % $num_rules;
+ },
+ },
+ '+' => {
+ note => 'Move selected rule up one',
+ func => sub {
+ my ( $tbl, $idx ) = @_;
+ my $meta = $tbl_meta{$tbl};
+ my $dest = $idx == 0 ? scalar(@{$meta->{colors}} - 1) : $idx - 1;
+ my $temp = $meta->{colors}->[$idx];
+ $meta->{colors}->[$idx] = $meta->{colors}->[$dest];
+ $meta->{colors}->[$dest] = $temp;
+ $meta->{cust}->{colors} = 1;
+ return $dest;
+ },
+ },
+ '-' => {
+ note => 'Move selected rule down one',
+ func => sub {
+ my ( $tbl, $idx ) = @_;
+ my $meta = $tbl_meta{$tbl};
+ my $dest = $idx == scalar(@{$meta->{colors}} - 1) ? 0 : $idx + 1;
+ my $temp = $meta->{colors}->[$idx];
+ $meta->{colors}->[$idx] = $meta->{colors}->[$dest];
+ $meta->{colors}->[$dest] = $temp;
+ $meta->{cust}->{colors} = 1;
+ return $dest;
+ },
+ },
+);
+
+# ###########################################################################
+# Plugin editor key mappings {{{3
+# ###########################################################################
+my %plugin_editor_action = (
+ '*' => {
+ note => 'Toggle selected plugin active/inactive',
+ func => sub {
+ my ( $plugins, $idx ) = @_;
+ my $plugin = $plugins->[$idx];
+ $plugin->{active} = $plugin->{active} ? 0 : 1;
+ return $idx;
+ },
+ },
+ j => {
+ note => 'Move highlight down one',
+ func => sub {
+ my ( $plugins, $idx ) = @_;
+ return ($idx + 1) % scalar(@$plugins);
+ },
+ },
+ k => {
+ note => 'Move highlight up one',
+ func => sub {
+ my ( $plugins, $idx ) = @_;
+ return $idx == 0 ? @$plugins - 1 : $idx - 1;
+ },
+ },
+);
+
+# ###########################################################################
+# Table editor key mappings {{{3
+# ###########################################################################
+my %tbl_editor_action = (
+ a => {
+ note => 'Add a column to the table',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+ my %all_cols = %{ $tbl_meta{$tbl}->{cols} };
+ delete @all_cols{@visible_cols};
+ my $choice = prompt_list(
+ 'Choose a column',
+ '',
+ sub { return keys %all_cols; },
+ { map { $_ => $all_cols{$_}->{label} || $all_cols{$_}->{hdr} } keys %all_cols });
+ if ( $all_cols{$choice} ) {
+ push @{$tbl_meta{$tbl}->{visible}}, $choice;
+ $tbl_meta{$tbl}->{cust}->{visible} = 1;
+ return $choice;
+ }
+ return $col;
+ },
+ },
+ n => {
+ note => 'Create a new column and add it to the table',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+
+ $clear_screen_sub->();
+ print word_wrap("Choose a name for the column. This name is not displayed, and is used only "
+ . "for internal reference. It can contain only lowercase letters, numbers, "
+ . "and underscores.");
+ print "\n\n";
+ do {
+ $col = prompt("Enter column name");
+ $col = '' if $col =~ m/[^a-z0-9_]/;
+ } while ( !$col );
+
+ $clear_screen_sub->();
+ my $hdr;
+ do {
+ $hdr = prompt("Enter column header");
+ } while ( !$hdr );
+
+ $clear_screen_sub->();
+ print "Choose a source for the column's data\n\n";
+ my ( $src, $sub, $err );
+ do {
+ if ( $err ) {
+ print "Error: $err\n\n";
+ }
+ $src = prompt("Enter column source");
+ if ( $src ) {
+ ( $sub, $err ) = compile_expr($src);
+ }
+ } until ( !$err);
+
+ # TODO: this duplicates %col_props.
+ $tbl_meta{$tbl}->{cols}->{$col} = {
+ hdr => $hdr,
+ src => $src,
+ just => '-',
+ num => 0,
+ label => 'User-defined',
+ user => 1,
+ tbl => $tbl,
+ minw => 0,
+ maxw => 0,
+ trans => [],
+ func => $sub,
+ dec => 0,
+ agg => 0,
+ aggonly => 0,
+ };
+
+ $tbl_meta{$tbl}->{visible} = [ unique(@{$tbl_meta{$tbl}->{visible}}, $col) ];
+ $tbl_meta{$tbl}->{cust}->{visible} = 1;
+ return $col;
+ },
+ },
+ d => {
+ note => 'Remove selected column',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+ my $idx = 0;
+ return $col unless @visible_cols > 1;
+ while ( $visible_cols[$idx] ne $col ) {
+ $idx++;
+ }
+ $tbl_meta{$tbl}->{visible} = [ grep { $_ ne $col } @visible_cols ];
+ $tbl_meta{$tbl}->{cust}->{visible} = 1;
+ return $idx == $#visible_cols ? $visible_cols[$idx - 1] : $visible_cols[$idx + 1];
+ },
+ },
+ e => {
+ note => 'Edit selected column',
+ func => sub {
+ # TODO: make this editor hotkey-driven and give readline support.
+ my ( $tbl, $col ) = @_;
+ $clear_screen_sub->();
+ my $meta = $tbl_meta{$tbl}->{cols}->{$col};
+ my @prop = qw(hdr label src just num minw maxw trans agg); # TODO redundant
+
+ my $answer;
+ do {
+ # Do what the user asked...
+ if ( $answer && grep { $_ eq $answer } @prop ) {
+ # Some properties are arrays, others scalars.
+ my $ini = ref $col_props{$answer} ? join(' ', @{$meta->{$answer}}) : $meta->{$answer};
+ my $val = prompt("New value for $answer", undef, $ini);
+ $val = [ split(' ', $val) ] if ref($col_props{$answer});
+ if ( $answer eq 'trans' ) {
+ $val = [ unique(grep{ exists $trans_funcs{$_} } @$val) ];
+ }
+ @{$meta}{$answer, 'user', 'tbl' } = ( $val, 1, $tbl );
+ }
+
+ my @display_lines = (
+ '',
+ "You are editing column $tbl.$col.\n",
+ );
+
+ push @display_lines, create_table2(
+ \@prop,
+ { map { $_ => $_ } @prop },
+ { map { $_ => ref $meta->{$_} eq 'ARRAY' ? join(' ', @{$meta->{$_}})
+ : ref $meta->{$_} ? '[expression code]'
+ : $meta->{$_}
+ } @prop
+ },
+ { sep => ' ' });
+ draw_screen(\@display_lines, { raw => 1 });
+ print "\n\n"; # One to add space, one to clear readline artifacts
+ $answer = prompt('Edit what? (q to quit)');
+ } while ( $answer ne 'q' );
+
+ return $col;
+ },
+ },
+ j => {
+ note => 'Move highlight down one',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+ my $idx = 0;
+ while ( $visible_cols[$idx] ne $col ) {
+ $idx++;
+ }
+ return $visible_cols[ ($idx + 1) % @visible_cols ];
+ },
+ },
+ k => {
+ note => 'Move highlight up one',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+ my $idx = 0;
+ while ( $visible_cols[$idx] ne $col ) {
+ $idx++;
+ }
+ return $visible_cols[ $idx - 1 ];
+ },
+ },
+ '+' => {
+ note => 'Move selected column up one',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ my $meta = $tbl_meta{$tbl};
+ my @visible_cols = @{$meta->{visible}};
+ my $idx = 0;
+ while ( $visible_cols[$idx] ne $col ) {
+ $idx++;
+ }
+ if ( $idx ) {
+ $visible_cols[$idx] = $visible_cols[$idx - 1];
+ $visible_cols[$idx - 1] = $col;
+ $meta->{visible} = \@visible_cols;
+ }
+ else {
+ shift @{$meta->{visible}};
+ push @{$meta->{visible}}, $col;
+ }
+ $meta->{cust}->{visible} = 1;
+ return $col;
+ },
+ },
+ '-' => {
+ note => 'Move selected column down one',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ my $meta = $tbl_meta{$tbl};
+ my @visible_cols = @{$meta->{visible}};
+ my $idx = 0;
+ while ( $visible_cols[$idx] ne $col ) {
+ $idx++;
+ }
+ if ( $idx == $#visible_cols ) {
+ unshift @{$meta->{visible}}, $col;
+ pop @{$meta->{visible}};
+ }
+ else {
+ $visible_cols[$idx] = $visible_cols[$idx + 1];
+ $visible_cols[$idx + 1] = $col;
+ $meta->{visible} = \@visible_cols;
+ }
+ $meta->{cust}->{visible} = 1;
+ return $col;
+ },
+ },
+ f => {
+ note => 'Choose filters',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ choose_filters($tbl);
+ return $col;
+ },
+ },
+ o => {
+ note => 'Edit color rules',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ edit_color_rules($tbl);
+ return $col;
+ },
+ },
+ s => {
+ note => 'Choose sort columns',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ choose_sort_cols($tbl);
+ return $col;
+ },
+ },
+ g => {
+ note => 'Choose group-by (aggregate) columns',
+ func => sub {
+ my ( $tbl, $col ) = @_;
+ choose_group_cols($tbl);
+ return $col;
+ },
+ },
+);
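+
+# All three editor key maps above follow the same protocol: each 'func'
+# receives the object being edited plus the currently highlighted item (a
+# color-rule index, a plugin index, or a column name) and returns the item to
+# highlight next, which lets the editor loops elsewhere stay generic, e.g.
+#
+#    $highlighted = $tbl_editor_action{$key}->{func}->( $tbl, $highlighted );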
+
+# ###########################################################################
+# Global variables and environment {{{2
+# ###########################################################################
+
+my @this_term_size; # w_chars, h_chars, w_pix, h_pix
+my @last_term_size; # w_chars, h_chars, w_pix, h_pix
+my $char;
+my $windows = $OSNAME =~ m/MSWin/;
+my $have_color = 0;
+my $MAX_ULONG = 4294967295; # 2^32-1
+my $num_regex = qr/^[+-]?(?=\d|\.)\d*(?:\.\d+)?(?:E[+-]?\d+|)$/i;
+my $int_regex = qr/^\d+$/;
+my $bool_regex = qr/^[01]$/;
+my $term = undef;
+my $file = undef; # File to watch for InnoDB monitor output
+my $file_mtime = undef; # Status of watched file
+my $file_data = undef; # Last chunk of text read from file
+my $innodb_parser = InnoDBParser->new;
+
+my $nonfatal_errs = join('|',
+ 'Access denied for user',
+ 'Unknown MySQL server host',
+ 'Unknown database',
+ 'Can\'t connect to local MySQL server through socket',
+ 'Can\'t connect to MySQL server on',
+ 'MySQL server has gone away',
+ 'Cannot call SHOW INNODB STATUS',
+ 'Access denied',
+ 'AutoCommit',
+);
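+
+# $nonfatal_errs is an alternation of message fragments intended for matching
+# against error text; the sketch below shows the general idea (the actual
+# handling lives in the connection code and may differ): errors that match
+# are treated as recoverable instead of aborting innotop.
+#
+#    die $EVAL_ERROR unless $EVAL_ERROR =~ m/$nonfatal_errs/;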
+
+if ( !$opts{n} ) {
+ require Term::ReadLine;
+ $term = Term::ReadLine->new('innotop');
+}
+
+# Stores status, variables, innodb status, master/slave status etc.
+# Keyed on connection name. Each entry is a hashref of current and past data sets,
+# keyed on clock tick.
+my %vars;
+my %info_gotten = (); # Which things have been retrieved for the current clock tick.
+
+# Stores info on currently displayed queries: cxn, connection ID, query text.
+my @current_queries;
+
+my $lines_printed = 0;
+my $clock = 0; # Incremented with every wake-sleep cycle
+my $clearing_deadlocks = 0;
+
+# Find the home directory; it's different on different OSes.
+my $homepath = $ENV{HOME} || $ENV{HOMEPATH} || $ENV{USERPROFILE} || '.';
+
+# If terminal coloring is available, use it. The only function I want from
+# the module is the colored() function.
+eval {
+ if ( !$opts{n} ) {
+ if ( $windows ) {
+ require Win32::Console::ANSI;
+ }
+ require Term::ANSIColor;
+ import Term::ANSIColor qw(colored);
+ $have_color = 1;
+ }
+};
+if ( $EVAL_ERROR || $opts{n} ) {
+ # If there was an error, manufacture my own colored() function that does no
+ # coloring.
+ *colored = sub { pop @_; @_; };
+}
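+
+# Note on the fallback above: the stand-in colored() simply pops off its last
+# argument (expected to be the color specification) and returns the remaining
+# arguments unchanged, so callers get their text back without any coloring.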
+
+if ( $opts{n} ) {
+ $clear_screen_sub = sub {};
+}
+elsif ( $windows ) {
+ $clear_screen_sub = sub { $lines_printed = 0; system("cls") };
+}
+else {
+ my $clear = `clear`;
+ $clear_screen_sub = sub { $lines_printed = 0; print $clear };
+}
+
+# ###########################################################################
+# Config storage. {{{2
+# ###########################################################################
+my %config = (
+ color => {
+ val => $have_color,
+ note => 'Whether to use terminal coloring',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ cmd_filter => {
+ val => 'Com_',
+ note => 'Prefix for values in C mode',
+ conf => [qw(C)],
+ },
+ plugin_dir => {
+ val => "$homepath/.innotop/plugins",
+ note => 'Directory where plugins can be found',
+ conf => 'ALL',
+ },
+ show_percent => {
+ val => 1,
+ note => 'Show the % symbol after percentages',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ skip_innodb => {
+ val => 0,
+ note => 'Disable SHOW INNODB STATUS',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ S_func => {
+ val => 's',
+ note => 'What to display in S mode: graph, status, pivoted status',
+ conf => [qw(S)],
+ pat => qr/^[gsv]$/,
+ },
+ cxn_timeout => {
+ val => 28800,
+ note => 'Connection timeout for keeping unused connections alive',
+ conf => 'ALL',
+ pat => $int_regex,
+ },
+ graph_char => {
+ val => '*',
+ note => 'Character for drawing graphs',
+ conf => [ qw(S) ],
+ pat => qr/^.$/,
+ },
+ show_cxn_errors_in_tbl => {
+ val => 1,
+ note => 'Whether to display connection errors as rows in the table',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ hide_hdr => {
+ val => 0,
+ note => 'Whether to show column headers',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ show_cxn_errors => {
+ val => 1,
+ note => 'Whether to print connection errors to STDOUT',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ readonly => {
+ val => 0,
+ note => 'Whether the config file is read-only',
+ conf => [ qw() ],
+ pat => $bool_regex,
+ },
+ global => {
+ val => 1,
+ note => 'Whether to show GLOBAL variables and status',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ header_highlight => {
+ val => 'bold',
+ note => 'How to highlight table column headers',
+ conf => 'ALL',
+ pat => qr/^(?:bold|underline)$/,
+ },
+ display_table_captions => {
+ val => 1,
+ note => 'Whether to put captions on tables',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ charset => {
+ val => 'ascii',
+ note => 'What type of characters should be displayed in queries (ascii, unicode, none)',
+ conf => 'ALL',
+ pat => qr/^(?:ascii|unicode|none)$/,
+ },
+ auto_wipe_dl => {
+ val => 0,
+ note => 'Whether to auto-wipe InnoDB deadlocks',
+ conf => 'ALL',
+ pat => $bool_regex,
+ },
+ max_height => {
+ val => 30,
+ note => '[Win32] Max window height',
+ conf => 'ALL',
+ },
+ debug => {
+ val => 0,
+ pat => $bool_regex,
+ note => 'Debug mode (more verbose errors, uses more memory)',
+ conf => 'ALL',
+ },
+ num_digits => {
+ val => 2,
+ pat => $int_regex,
+ note => 'How many digits to show in fractional numbers and percents',
+ conf => 'ALL',
+ },
+ debugfile => {
+ val => "$homepath/.innotop/core_dump",
+ note => 'A debug file in case you are interested in error output',
+ },
+ show_statusbar => {
+ val => 1,
+ pat => $bool_regex,
+ note => 'Whether to show the status bar in the display',
+ conf => 'ALL',
+ },
+ mode => {
+ val => "T",
+ note => "Which mode to start in",
+ cmdline => 1,
+ },
+ status_inc => {
+ val => 0,
+ note => 'Whether to show raw or incremental values for status variables',
+ pat => $bool_regex,
+ },
+ interval => {
+ val => 10,
+ pat => qr/^(?:(?:\d*?[1-9]\d*(?:\.\d*)?)|(?:\d*\.\d*?[1-9]\d*))$/,
+ note => "The interval at which the display will be refreshed. Fractional values allowed.",
+ },
+ num_status_sets => {
+ val => 9,
+ pat => $int_regex,
+ note => 'How many sets of STATUS and VARIABLES values to show',
+ conf => [ qw(S) ],
+ },
+ S_set => {
+ val => 'general',
+ pat => qr/^\w+$/,
+ note => 'Which set of variables to display in S (Variables & Status) mode',
+ conf => [ qw(S) ],
+ },
+);
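+
+# Settings are read throughout the program as $config{<name>}->{val} (see the
+# main loop below); the 'pat' regexes appear to be what new values are
+# validated against when a setting is edited interactively. A minimal sketch,
+# illustrative only:
+#
+#    my $interval = $config{interval}->{val};
+#    $config{interval}->{val} = $new_value
+#       if defined $new_value && $new_value =~ $config{interval}->{pat};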
+
+# ###########################################################################
+# Config file sections {{{2
+# The configuration file is broken up into sections like a .ini file. This
+# variable defines those sections and the subroutines responsible for reading
+# and writing them.
+# ###########################################################################
+my %config_file_sections = (
+ plugins => {
+ reader => \&load_config_plugins,
+ writer => \&save_config_plugins,
+ },
+ group_by => {
+ reader => \&load_config_group_by,
+ writer => \&save_config_group_by,
+ },
+ filters => {
+ reader => \&load_config_filters,
+ writer => \&save_config_filters,
+ },
+ active_filters => {
+ reader => \&load_config_active_filters,
+ writer => \&save_config_active_filters,
+ },
+ visible_tables => {
+ reader => \&load_config_visible_tables,
+ writer => \&save_config_visible_tables,
+ },
+ sort_cols => {
+ reader => \&load_config_sort_cols,
+ writer => \&save_config_sort_cols,
+ },
+ active_columns => {
+ reader => \&load_config_active_columns,
+ writer => \&save_config_active_columns,
+ },
+ tbl_meta => {
+ reader => \&load_config_tbl_meta,
+ writer => \&save_config_tbl_meta,
+ },
+ general => {
+ reader => \&load_config_config,
+ writer => \&save_config_config,
+ },
+ connections => {
+ reader => \&load_config_connections,
+ writer => \&save_config_connections,
+ },
+ active_connections => {
+ reader => \&load_config_active_connections,
+ writer => \&save_config_active_connections,
+ },
+ server_groups => {
+ reader => \&load_config_server_groups,
+ writer => \&save_config_server_groups,
+ },
+ active_server_groups => {
+ reader => \&load_config_active_server_groups,
+ writer => \&save_config_active_server_groups,
+ },
+ max_values_seen => {
+ reader => \&load_config_mvs,
+ writer => \&save_config_mvs,
+ },
+ varsets => {
+ reader => \&load_config_varsets,
+ writer => \&save_config_varsets,
+ },
+ colors => {
+ reader => \&load_config_colors,
+ writer => \&save_config_colors,
+ },
+ stmt_sleep_times => {
+ reader => \&load_config_stmt_sleep_times,
+ writer => \&save_config_stmt_sleep_times,
+ },
+);
+
+# Config file sections have some dependencies, so they have to be read/written in order.
+my @ordered_config_file_sections = qw(general plugins filters active_filters tbl_meta
+ connections active_connections server_groups active_server_groups max_values_seen
+ active_columns sort_cols visible_tables varsets colors stmt_sleep_times
+ group_by);
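+
+# Sketch of how the two structures above fit together (load_config and
+# save_config are defined elsewhere and may differ in detail): the sections
+# are walked in @ordered_config_file_sections order and each one's reader or
+# writer coderef is invoked for its part of the config file.
+#
+#    foreach my $section ( @ordered_config_file_sections ) {
+#       $config_file_sections{$section}->{reader}->( ... );
+#    }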
+
+# All events for which plugins may register themselves. Entries are arrayrefs.
+my %event_listener_for = map { $_ => [] }
+ qw(
+ extract_values
+ set_to_tbl_pre_filter set_to_tbl_pre_sort set_to_tbl_pre_group
+ set_to_tbl_pre_colorize set_to_tbl_pre_transform set_to_tbl_pre_pivot
+ set_to_tbl_pre_create set_to_tbl_post_create
+ draw_screen
+ );
+
+# All variables to which plugins have access.
+my %pluggable_vars = (
+ action_for => \%action_for,
+ agg_funcs => \%agg_funcs,
+ config => \%config,
+ connections => \%connections,
+ dbhs => \%dbhs,
+ filters => \%filters,
+ modes => \%modes,
+ server_groups => \%server_groups,
+ tbl_meta => \%tbl_meta,
+ trans_funcs => \%trans_funcs,
+ var_sets => \%var_sets,
+);
+
+# ###########################################################################
+# Builders for the prepared statements innotop uses. Each entry takes a DB
+# connection handle, chooses version-appropriate SQL where needed, and returns a $sth.
+# ###########################################################################
+my %stmt_maker_for = (
+ INNODB_STATUS => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare(version_ge( $dbh, '5.0.0' )
+ ? 'SHOW ENGINE INNODB STATUS'
+ : 'SHOW INNODB STATUS');
+ },
+ SHOW_VARIABLES => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare($config{global}->{val} && version_ge( $dbh, '4.0.3' )
+ ? 'SHOW GLOBAL VARIABLES'
+ : 'SHOW VARIABLES');
+ },
+ SHOW_STATUS => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare($config{global}->{val} && version_ge( $dbh, '5.0.2' )
+ ? 'SHOW GLOBAL STATUS'
+ : 'SHOW STATUS');
+ },
+ KILL_QUERY => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare(version_ge( $dbh, '5.0.0' )
+ ? 'KILL QUERY ?'
+ : 'KILL ?');
+ },
+ SHOW_MASTER_LOGS => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare('SHOW MASTER LOGS');
+ },
+ SHOW_MASTER_STATUS => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare('SHOW MASTER STATUS');
+ },
+ SHOW_SLAVE_STATUS => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare('SHOW SLAVE STATUS');
+ },
+ KILL_CONNECTION => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare(version_ge( $dbh, '5.0.0' )
+ ? 'KILL CONNECTION ?'
+ : 'KILL ?');
+ },
+ OPEN_TABLES => sub {
+ my ( $dbh ) = @_;
+ return version_ge($dbh, '4.0.0')
+ ? $dbh->prepare('SHOW OPEN TABLES')
+ : undef;
+ },
+ PROCESSLIST => sub {
+ my ( $dbh ) = @_;
+ return $dbh->prepare('SHOW FULL PROCESSLIST');
+ },
+);
+
+# Plugins!
+my %plugins = (
+);
+
+# ###########################################################################
+# Run the program {{{1
+# ###########################################################################
+
+# This config variable is only useful for MS Windows because its terminal
+# can't tell how tall it is.
+if ( !$windows ) {
+ delete $config{max_height};
+}
+
+# Try to lower my priority.
+eval { setpriority(0, 0, getpriority(0, 0) + 10); };
+
+# Print stuff to the screen immediately, don't wait for a newline.
+$OUTPUT_AUTOFLUSH = 1;
+
+# Clear the screen and load the configuration.
+$clear_screen_sub->();
+load_config();
+post_process_tbl_meta();
+
+# Make sure no changes are written to config file in non-interactive mode.
+if ( $opts{n} ) {
+ $config{readonly}->{val} = 1;
+}
+
+eval {
+
+ # Open the file for InnoDB status
+ if ( @ARGV ) {
+ my $filename = shift @ARGV;
+ open $file, "<", $filename
+ or die "Cannot open '$filename': $OS_ERROR";
+ }
+
+ # In certain modes we might have to collect data for two cycles
+   # before printing anything out, so we need to bump up the count by one.
+ if ( $opts{n} && $opts{count} && $config{status_inc}->{val}
+ && $config{mode}->{val} =~ m/[S]/ )
+ {
+ $opts{count}++;
+ }
+
+ while (++$clock) {
+
+ my $mode = $config{mode}->{val} || 'T';
+ if ( !$modes{$mode} ) {
+ die "Mode '$mode' doesn't exist; try one of these:\n"
+ . join("\n", map { " $_ $modes{$_}->{hdr}" } sort keys %modes)
+ . "\n";
+ }
+
+ if ( !$opts{n} ) {
+ @last_term_size = @this_term_size;
+ @this_term_size = Term::ReadKey::GetTerminalSize(\*STDOUT);
+ if ( $windows ) {
+ $this_term_size[0]--;
+ $this_term_size[1]
+ = min($this_term_size[1], $config{max_height}->{val});
+ }
+ die("Can't read terminal size") unless @this_term_size;
+ }
+
+ # If there's no connection to a database server, we need to fix that...
+ if ( !%connections ) {
+ print "You have not defined any database connections.\n\n";
+ add_new_dsn();
+ }
+
+ # See whether there are any connections defined for this mode. If there's only one
+ # connection total, assume the user wants to just use innotop for a single server
+ # and don't ask which server to connect to. Also, if we're monitoring from a file,
+      # we just use whatever connections are already defined.
+ if ( !get_connections() ) {
+ if ( $file || 1 == scalar keys %connections ) {
+ $modes{$config{mode}->{val}}->{connections} = [ keys %connections ];
+ }
+ else {
+ choose_connections();
+ }
+ }
+
+ # Term::ReadLine might have re-set $OUTPUT_AUTOFLUSH.
+ $OUTPUT_AUTOFLUSH = 1;
+
+ # Prune old data
+ my $sets = $config{num_status_sets}->{val};
+ foreach my $store ( values %vars ) {
+ delete @{$store}{ grep { $_ < $clock - $sets } keys %$store };
+ }
+ %info_gotten = ();
+
+ # Call the subroutine to display this mode.
+ $modes{$mode}->{display_sub}->();
+
+ # It may be time to quit now.
+ if ( $opts{count} && $clock >= $opts{count} ) {
+ finish();
+ }
+
+ # Wait for a bit.
+ if ( $opts{n} ) {
+ sleep($config{interval}->{val});
+ }
+ else {
+ ReadMode('cbreak');
+ $char = ReadKey($config{interval}->{val});
+ ReadMode('normal');
+ }
+
+ # Handle whatever action the key indicates.
+ do_key_action();
+
+ }
+};
+if ( $EVAL_ERROR ) {
+ core_dump( $EVAL_ERROR );
+}
+finish();
+
+# Subroutines {{{1
+# Mode functions{{{2
+# switch_mode {{{3
+sub switch_mode {
+ my $mode = shift;
+ $config{mode}->{val} = $mode;
+}
+
+# Prompting functions {{{2
+# prompt_list {{{3
+# Prompts the user for a value, given a question, initial value,
+# a completion function and a hashref of hints.
+sub prompt_list {
+ die "Can't call in non-interactive mode" if $opts{n};
+ my ( $question, $init, $completion, $hints ) = @_;
+ if ( $hints ) {
+ # Figure out how wide the table will be
+ my $max_name = max(map { length($_) } keys %$hints );
+ $max_name ||= 0;
+ $max_name += 3;
+ my @meta_rows = create_table2(
+ [ sort keys %$hints ],
+ { map { $_ => $_ } keys %$hints },
+ { map { $_ => trunc($hints->{$_}, $this_term_size[0] - $max_name) } keys %$hints },
+ { sep => ' ' });
+ if (@meta_rows > 10) {
+ # Try to split and stack the meta rows next to each other
+ my $split = int(@meta_rows / 2);
+ @meta_rows = stack_next(
+ [@meta_rows[0..$split - 1]],
+ [@meta_rows[$split..$#meta_rows]],
+ { pad => ' | '},
+ );
+ }
+ print join( "\n",
+ '',
+ map { ref $_ ? colored(@$_) : $_ } create_caption('Choose from', @meta_rows), ''),
+ "\n";
+ }
+ $term->Attribs->{completion_function} = $completion;
+ my $answer = $term->readline("$question: ", $init);
+ $OUTPUT_AUTOFLUSH = 1;
+ $answer = '' if !defined($answer);
+ $answer =~ s/\s+$//;
+ return $answer;
+}
+
+# prompt {{{3
+# Prints out a prompt and reads from the keyboard, then validates with the
+# validation regex until the input is correct.
+sub prompt {
+ die "Can't call in non-interactive mode" if $opts{n};
+ my ( $prompt, $regex, $init, $completion ) = @_;
+ my $response;
+ my $success = 0;
+ do {
+ if ( $completion ) {
+ $term->Attribs->{completion_function} = $completion;
+ }
+ $response = $term->readline("$prompt: ", $init);
+ if ( $regex && $response !~ m/$regex/ ) {
+ print "Invalid response.\n\n";
+ }
+ else {
+ $success = 1;
+ }
+ } while ( !$success );
+ $OUTPUT_AUTOFLUSH = 1;
+ $response =~ s/\s+$//;
+ return $response;
+}
+
+# prompt_noecho {{{3
+# Unfortunately, suppressing echo with Term::ReadLine isn't reliable; the user might not
+# have that library, or it might not support that feature.
+sub prompt_noecho {
+ my ( $prompt ) = @_;
+ print colored("$prompt: ", 'underline');
+ my $response;
+ ReadMode('noecho');
+ $response = <STDIN>;
+ chomp($response);
+ ReadMode('normal');
+ return $response;
+}
+
+# do_key_action {{{3
+# Depending on whether a key was read, do something. Keys have certain
+# actions defined in lookup tables. Each mode may have its own lookup table,
+# which trumps the global table -- so keys can be context-sensitive. The key
+# may be read and written in a subroutine, so it's a global.
+sub do_key_action {
+ if ( defined $char ) {
+ my $mode = $config{mode}->{val};
+ my $action
+ = defined($modes{$mode}->{action_for}->{$char})
+ ? $modes{$mode}->{action_for}->{$char}->{action}
+ : defined($action_for{$char})
+ ? $action_for{$char}->{action}
+ : sub{};
+ $action->();
+ }
+}
+
+# pause {{{3
+sub pause {
+ die "Can't call in non-interactive mode" if $opts{n};
+ my $msg = shift;
+ print defined($msg) ? "\n$msg" : "\nPress any key to continue";
+ ReadMode('cbreak');
+ my $char = ReadKey(0);
+ ReadMode('normal');
+ return $char;
+}
+
+# reverse_sort {{{3
+sub reverse_sort {
+ my $tbl = shift;
+ $tbl_meta{$tbl}->{sort_dir} *= -1;
+}
+
+# select_cxn {{{3
+# Selects connection(s). If the mode (or argument list) has only one, returns
+# it without prompt.
+sub select_cxn {
+ my ( $prompt, @cxns ) = @_;
+ if ( !@cxns ) {
+ @cxns = get_connections();
+ }
+ if ( @cxns == 1 ) {
+ return $cxns[0];
+ }
+ my $choices = prompt_list(
+ $prompt,
+ $cxns[0],
+ sub{ return @cxns },
+ { map { $_ => $connections{$_}->{dsn} } @cxns });
+ my @result = unique(grep { my $a = $_; grep { $_ eq $a } @cxns } split(/\s+/, $choices));
+ return @result;
+}
+
+# kill_query {{{3
+# Kills a connection or, on newer servers, optionally just a query (leaving the connection alive).
+sub kill_query {
+ my ( $q_or_c ) = @_;
+
+ my $info = choose_thread(
+ sub { 1 },
+ 'Select a thread to kill the ' . $q_or_c,
+ );
+ return unless $info;
+ return unless pause("Kill $info->{id}?") =~ m/y/i;
+
+ eval {
+ do_stmt($info->{cxn}, $q_or_c eq 'QUERY' ? 'KILL_QUERY' : 'KILL_CONNECTION', $info->{id} );
+ };
+
+ if ( $EVAL_ERROR ) {
+ print "\nError: $EVAL_ERROR";
+ pause();
+ }
+}
+
+# set_display_precision {{{3
+sub set_display_precision {
+ my $dir = shift;
+ $config{num_digits}->{val} = min(9, max(0, $config{num_digits}->{val} + $dir));
+}
+
+sub toggle_visible_table {
+ my ( $mode, $table ) = @_;
+ my $visible = $modes{$mode}->{visible_tables};
+ if ( grep { $_ eq $table } @$visible ) {
+ $modes{$mode}->{visible_tables} = [ grep { $_ ne $table } @$visible ];
+ }
+ else {
+ unshift @$visible, $table;
+ }
+ $modes{$mode}->{cust}->{visible_tables} = 1;
+}
+
+# toggle_filter{{{3
+sub toggle_filter {
+ my ( $tbl, $filter ) = @_;
+ my $filters = $tbl_meta{$tbl}->{filters};
+ if ( grep { $_ eq $filter } @$filters ) {
+ $tbl_meta{$tbl}->{filters} = [ grep { $_ ne $filter } @$filters ];
+ }
+ else {
+ push @$filters, $filter;
+ }
+ $tbl_meta{$tbl}->{cust}->{filters} = 1;
+}
+
+# toggle_config {{{3
+sub toggle_config {
+ my ( $key ) = @_;
+ $config{$key}->{val} ^= 1;
+}
+
+# create_deadlock {{{3
+sub create_deadlock {
+ $clear_screen_sub->();
+
+ print "This function will deliberately cause a small deadlock, "
+ . "clearing deadlock information from the InnoDB monitor.\n\n";
+
+ my $answer = prompt("Are you sure you want to proceed? Say 'y' if you do");
+ return 0 unless $answer eq 'y';
+
+ my ( $cxn ) = select_cxn('Clear on which server? ');
+ return unless $cxn && exists($connections{$cxn});
+
+ clear_deadlock($cxn);
+}
+
+# deadlock_thread {{{3
+sub deadlock_thread {
+ my ( $id, $tbl, $cxn ) = @_;
+
+ eval {
+ my $dbh = get_new_db_connection($cxn, 1);
+ my @stmts = (
+ "set transaction isolation level serializable",
+ (version_ge($dbh, '4.0.11') ? "start transaction" : 'begin'),
+ "select * from $tbl where a = $id",
+ "update $tbl set a = $id where a <> $id",
+ );
+
+ foreach my $stmt (@stmts[0..2]) {
+ $dbh->do($stmt);
+ }
+ sleep(1 + $id);
+ $dbh->do($stmts[-1]);
+ };
+ if ( $EVAL_ERROR ) {
+ if ( $EVAL_ERROR !~ m/Deadlock found/ ) {
+ die $EVAL_ERROR;
+ }
+ }
+ exit(0);
+}
+
+# Purges unused binlogs on the master, up to but not including the latest log.
+# TODO: guess which connections are slaves of a given master.
+sub purge_master_logs {
+ my @cxns = get_connections();
+
+ get_master_slave_status(@cxns);
+
+ # Toss out the rows that don't have master/slave status...
+ my @vars =
+ grep { $_ && ($_->{file} || $_->{master_host}) }
+ map { $vars{$_}->{$clock} } @cxns;
+ @cxns = map { $_->{cxn} } @vars;
+
+   # Figure out which master to purge logs on.
+ my @masters = map { $_->{cxn} } grep { $_->{file} } @vars;
+ my ( $master ) = select_cxn('Which master?', @masters );
+ return unless $master;
+ my ($master_status) = grep { $_->{cxn} eq $master } @vars;
+
+ # Figure out the result order (not lexical order) of master logs.
+ my @master_logs = get_master_logs($master);
+ my $i = 0;
+ my %master_logs = map { $_->{log_name} => $i++ } @master_logs;
+
+ # Ask which slave(s) are reading from this master.
+ my @slave_status = grep { $_->{master_host} } @vars;
+ my @slaves = map { $_->{cxn} } @slave_status;
+ @slaves = select_cxn("Which slaves are reading from $master?", @slaves);
+ @slave_status = grep { my $item = $_; grep { $item->{cxn} eq $_ } @slaves } @slave_status;
+ return unless @slave_status;
+
+ # Find the minimum binary log in use.
+ my $min_log = min(map { $master_logs{$_->{master_log_file}} } @slave_status);
+ my $log_name = $master_logs[$min_log]->{log_name};
+
+ my $stmt = "PURGE MASTER LOGS TO '$log_name'";
+ send_cmd_to_servers($stmt, 0, 'PURGE {MASTER | BINARY} LOGS {TO "log_name" | BEFORE "date"}', [$master]);
+}
+
+sub send_cmd_to_servers {
+ my ( $cmd, $all, $hint, $cxns ) = @_;
+ if ( $all ) {
+ @$cxns = get_connections();
+ }
+ elsif ( !@$cxns ) {
+ @$cxns = select_cxn('Which servers?', @$cxns);
+ }
+ if ( $hint ) {
+ print "\nHint: $hint\n";
+ }
+ $cmd = prompt('Command to send', undef, $cmd);
+ foreach my $cxn ( @$cxns ) {
+ eval {
+ my $sth = do_query($cxn, $cmd);
+ };
+ if ( $EVAL_ERROR ) {
+ print "Error from $cxn: $EVAL_ERROR\n";
+ }
+ else {
+ print "Success on $cxn\n";
+ }
+ }
+ pause();
+}
+
+# Display functions {{{2
+
+sub set_s_mode {
+ my ( $func ) = @_;
+ $clear_screen_sub->();
+ $config{S_func}->{val} = $func;
+}
+
+# start_S_mode {{{3
+sub start_S_mode {
+ $clear_screen_sub->();
+ switch_mode('S');
+}
+
+# display_B {{{3
+sub display_B {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_innodb_status(\@cxns);
+
+ my @buffer_pool;
+ my @page_statistics;
+ my @insert_buffers;
+ my @adaptive_hash_index;
+ my %rows_for = (
+ buffer_pool => \@buffer_pool,
+ page_statistics => \@page_statistics,
+ insert_buffers => \@insert_buffers,
+ adaptive_hash_index => \@adaptive_hash_index,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ foreach my $cxn ( @cxns ) {
+ my $set = $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+
+ if ( $set->{IB_bp_complete} ) {
+ if ( $wanted{buffer_pool} ) {
+ push @buffer_pool, extract_values($set, $set, $pre, 'buffer_pool');
+ }
+ if ( $wanted{page_statistics} ) {
+ push @page_statistics, extract_values($set, $set, $pre, 'page_statistics');
+ }
+ }
+ if ( $set->{IB_ib_complete} ) {
+ if ( $wanted{insert_buffers} ) {
+ push @insert_buffers, extract_values(
+ $config{status_inc}->{val} ? inc(0, $cxn) : $set, $set, $pre,
+ 'insert_buffers');
+ }
+ if ( $wanted{adaptive_hash_index} ) {
+ push @adaptive_hash_index, extract_values($set, $set, $pre, 'adaptive_hash_index');
+ }
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_C {{{3
+sub display_C {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_status_info(@cxns);
+
+ my @cmd_summary;
+ my %rows_for = (
+ cmd_summary => \@cmd_summary,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ # For now, I'm manually pulling these variables out and pivoting. Eventually a SQL-ish
+ # dialect should let me join a table to a grouped and pivoted table and do this more easily.
+ # TODO: make it so.
+ my $prefix = qr/^$config{cmd_filter}->{val}/; # TODO: this is a total hack
+ my @values;
+ my ($total, $last_total) = (0, 0);
+ foreach my $cxn ( @cxns ) {
+ my $set = $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+ foreach my $key ( keys %$set ) {
+ next unless $key =~ m/$prefix/i;
+ my $val = $set->{$key};
+ next unless defined $val && $val =~ m/^\d+$/;
+ my $last_val = $val - ($pre->{$key} || 0);
+ $total += $val;
+ $last_total += $last_val;
+ push @values, {
+ name => $key,
+ value => $val,
+ last_value => $last_val,
+ };
+ }
+ }
+
+   # Add aggregation and turn into a real set. TODO: this is a total hack.
+ if ( $wanted{cmd_summary} ) {
+ foreach my $value ( @values ) {
+ @{$value}{qw(total last_total)} = ($total, $last_total);
+ push @cmd_summary, extract_values($value, $value, $value, 'cmd_summary');
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_D {{{3
+sub display_D {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_innodb_status(\@cxns);
+
+ my @deadlock_transactions;
+ my @deadlock_locks;
+ my %rows_for = (
+ deadlock_transactions => \@deadlock_transactions,
+ deadlock_locks => \@deadlock_locks,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ foreach my $cxn ( @cxns ) {
+ my $innodb_status = $vars{$cxn}->{$clock};
+ my $prev_status = $vars{$cxn}->{$clock-1} || $innodb_status;
+
+ if ( $innodb_status->{IB_dl_timestring} ) {
+
+ my $victim = $innodb_status->{IB_dl_rolled_back} || 0;
+
+ if ( %wanted ) {
+ foreach my $txn_id ( keys %{$innodb_status->{IB_dl_txns}} ) {
+ my $txn = $innodb_status->{IB_dl_txns}->{$txn_id};
+ my $pre = $prev_status->{IB_dl_txns}->{$txn_id} || $txn;
+
+ if ( $wanted{deadlock_transactions} ) {
+ my $hash = extract_values($txn->{tx}, $txn->{tx}, $pre->{tx}, 'deadlock_transactions');
+ $hash->{cxn} = $cxn;
+ $hash->{dl_txn_num} = $txn_id;
+ $hash->{victim} = $txn_id == $victim ? 'Yes' : 'No';
+ $hash->{timestring} = $innodb_status->{IB_dl_timestring};
+ $hash->{truncates} = $innodb_status->{IB_dl_complete} ? 'No' : 'Yes';
+ push @deadlock_transactions, $hash;
+ }
+
+ if ( $wanted{deadlock_locks} ) {
+ foreach my $lock ( @{$txn->{locks}} ) {
+ my $hash = extract_values($lock, $lock, $lock, 'deadlock_locks');
+ $hash->{dl_txn_num} = $txn_id;
+ $hash->{cxn} = $cxn;
+ $hash->{mysql_thread_id} = $txn->{tx}->{mysql_thread_id};
+ push @deadlock_locks, $hash;
+ }
+ }
+
+ }
+ }
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_F {{{3
+sub display_F {
+ my @display_lines;
+ my ( $cxn ) = get_connections();
+ get_innodb_status([$cxn]);
+ my $innodb_status = $vars{$cxn}->{$clock};
+
+ if ( $innodb_status->{IB_fk_timestring} ) {
+
+ push @display_lines, 'Reason: ' . $innodb_status->{IB_fk_reason};
+
+ # Display FK errors caused by invalid DML.
+ if ( $innodb_status->{IB_fk_txn} ) {
+ my $txn = $innodb_status->{IB_fk_txn};
+ push @display_lines,
+ '',
+ "User $txn->{user} from $txn->{hostname}, thread $txn->{mysql_thread_id} was executing:",
+ '', no_ctrl_char($txn->{query_text});
+ }
+
+ my @fk_table = create_table2(
+ $tbl_meta{fk_error}->{visible},
+ meta_to_hdr('fk_error'),
+ extract_values($innodb_status, $innodb_status, $innodb_status, 'fk_error'),
+ { just => '-', sep => ' '});
+ push @display_lines, '', @fk_table;
+
+ }
+ else {
+ push @display_lines, '', 'No foreign key error data.';
+ }
+ draw_screen(\@display_lines, { raw => 1 } );
+}
+
+# display_I {{{3
+sub display_I {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_innodb_status(\@cxns);
+
+ my @io_threads;
+ my @pending_io;
+ my @file_io_misc;
+ my @log_statistics;
+ my %rows_for = (
+ io_threads => \@io_threads,
+ pending_io => \@pending_io,
+ file_io_misc => \@file_io_misc,
+ log_statistics => \@log_statistics,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ foreach my $cxn ( @cxns ) {
+ my $set = $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+
+ if ( $set->{IB_io_complete} ) {
+ if ( $wanted{io_threads} ) {
+ my $cur_threads = $set->{IB_io_threads};
+ my $pre_threads = $pre->{IB_io_threads} || $cur_threads;
+ foreach my $key ( sort keys %$cur_threads ) {
+ my $cur_thd = $cur_threads->{$key};
+ my $pre_thd = $pre_threads->{$key} || $cur_thd;
+ my $hash = extract_values($cur_thd, $cur_thd, $pre_thd, 'io_threads');
+ $hash->{cxn} = $cxn;
+ push @io_threads, $hash;
+ }
+ }
+ if ( $wanted{pending_io} ) {
+ push @pending_io, extract_values($set, $set, $pre, 'pending_io');
+ }
+ if ( $wanted{file_io_misc} ) {
+ push @file_io_misc, extract_values(
+ $config{status_inc}->{val} ? inc(0, $cxn) : $set,
+ $set, $pre, 'file_io_misc');
+ }
+ }
+ if ( $set->{IB_lg_complete} && $wanted{log_statistics} ) {
+ push @log_statistics, extract_values($set, $set, $pre, 'log_statistics');
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_L {{{3
+sub display_L {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_innodb_status(\@cxns);
+
+ my @innodb_locks;
+ my %rows_for = (
+ innodb_locks => \@innodb_locks,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ # Get info on locks
+ foreach my $cxn ( @cxns ) {
+ my $set = $vars{$cxn}->{$clock} or next;
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+
+ if ( $wanted{innodb_locks} && defined $set->{IB_tx_transactions} && @{$set->{IB_tx_transactions}} ) {
+
+ my $cur_txns = $set->{IB_tx_transactions};
+ my $pre_txns = $pre->{IB_tx_transactions} || $cur_txns;
+ my %cur_txns = map { $_->{mysql_thread_id} => $_ } @$cur_txns;
+ my %pre_txns = map { $_->{mysql_thread_id} => $_ } @$pre_txns;
+ foreach my $txn ( @$cur_txns ) {
+ foreach my $lock ( @{$txn->{locks}} ) {
+ my %hash = map { $_ => $txn->{$_} } qw(txn_id mysql_thread_id lock_wait_time active_secs);
+ map { $hash{$_} = $lock->{$_} } qw(lock_type space_id page_no n_bits index db table txn_id lock_mode special insert_intention waiting);
+ $hash{cxn} = $cxn;
+ push @innodb_locks, extract_values(\%hash, \%hash, \%hash, 'innodb_locks');
+ }
+ }
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_M {{{3
+sub display_M {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_master_slave_status(@cxns);
+ get_status_info(@cxns);
+
+ my @slave_sql_status;
+ my @slave_io_status;
+ my @master_status;
+ my %rows_for = (
+ slave_sql_status => \@slave_sql_status,
+ slave_io_status => \@slave_io_status,
+ master_status => \@master_status,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ foreach my $cxn ( @cxns ) {
+ my $set = $config{status_inc}->{val} ? inc(0, $cxn) : $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock - 1} || $set;
+ if ( $wanted{slave_sql_status} ) {
+ push @slave_sql_status, extract_values($set, $set, $pre, 'slave_sql_status');
+ }
+ if ( $wanted{slave_io_status} ) {
+ push @slave_io_status, extract_values($set, $set, $pre, 'slave_io_status');
+ }
+ if ( $wanted{master_status} ) {
+ push @master_status, extract_values($set, $set, $pre, 'master_status');
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_O {{{3
+sub display_O {
+ my @display_lines = ('');
+ my @cxns = get_connections();
+ my @open_tables = get_open_tables(@cxns);
+ my @tables = map { extract_values($_, $_, $_, 'open_tables') } @open_tables;
+ push @display_lines, set_to_tbl(\@tables, 'open_tables'), get_cxn_errors(@cxns);
+ draw_screen(\@display_lines);
+}
+
+# display_Q {{{3
+sub display_Q {
+ my @display_lines;
+
+ my @q_header;
+ my @processlist;
+ my %rows_for = (
+ q_header => \@q_header,
+ processlist => \@processlist,
+ );
+
+ my @visible = $opts{n} ? 'processlist' : get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ # Get the data
+ my @cxns = get_connections();
+ my @full_processlist = get_full_processlist(@cxns);
+
+ # Create header
+ if ( $wanted{q_header} ) {
+ get_status_info(@cxns);
+ foreach my $cxn ( @cxns ) {
+ my $set = $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+ my $hash = extract_values($set, $set, $pre, 'q_header');
+ $hash->{cxn} = $cxn;
+ $hash->{when} = 'Total';
+ push @q_header, $hash;
+
+ if ( exists $vars{$cxn}->{$clock - 1} ) {
+ my $inc = inc(0, $cxn);
+ my $hash = extract_values($inc, $set, $pre, 'q_header');
+ $hash->{cxn} = $cxn;
+ $hash->{when} = 'Now';
+ push @q_header, $hash;
+ }
+ }
+ }
+
+ if ( $wanted{processlist} ) {
+ # TODO: save prev values
+ push @processlist, map { extract_values($_, $_, $_, 'processlist') } @full_processlist;
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ next unless $wanted{$tbl};
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ # Save queries in global variable for analysis. The rows in %rows_for have been
+ # filtered, etc as a side effect of set_to_tbl(), so they are the same as the rows
+ # that get pushed to the screen.
+ @current_queries = map {
+ my %hash;
+ @hash{ qw(cxn id db query secs) } = @{$_}{ qw(cxn mysql_thread_id db info secs) };
+ \%hash;
+ } @{$rows_for{processlist}};
+
+ draw_screen(\@display_lines);
+}
+
+# display_R {{{3
+sub display_R {
+ my @display_lines;
+ my @cxns = get_connections();
+ get_innodb_status(\@cxns);
+
+ my @row_operations;
+ my @row_operation_misc;
+ my @semaphores;
+ my @wait_array;
+ my %rows_for = (
+ row_operations => \@row_operations,
+ row_operation_misc => \@row_operation_misc,
+ semaphores => \@semaphores,
+ wait_array => \@wait_array,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+ my $incvar = $config{status_inc}->{val};
+
+ foreach my $cxn ( @cxns ) {
+ my $set = $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+ my $inc; # Only assigned to if wanted
+
+ if ( $set->{IB_ro_complete} ) {
+ if ( $wanted{row_operations} ) {
+ $inc ||= $incvar ? inc(0, $cxn) : $set;
+ push @row_operations, extract_values($inc, $set, $pre, 'row_operations');
+ }
+ if ( $wanted{row_operation_misc} ) {
+            push @row_operation_misc, extract_values($set, $set, $pre, 'row_operation_misc');
+ }
+ }
+
+ if ( $set->{IB_sm_complete} && $wanted{semaphores} ) {
+ $inc ||= $incvar ? inc(0, $cxn) : $set;
+ push @semaphores, extract_values($inc, $set, $pre, 'semaphores');
+ }
+
+ if ( $set->{IB_sm_wait_array_size} && $wanted{wait_array} ) {
+ foreach my $wait ( @{$set->{IB_sm_waits}} ) {
+ my $hash = extract_values($wait, $wait, $wait, 'wait_array');
+ $hash->{cxn} = $cxn;
+ push @wait_array, $hash;
+ }
+ }
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ draw_screen(\@display_lines);
+}
+
+# display_T {{{3
+sub display_T {
+ my @display_lines;
+
+ my @t_header;
+ my @innodb_transactions;
+ my %rows_for = (
+ t_header => \@t_header,
+ innodb_transactions => \@innodb_transactions,
+ );
+
+ my @visible = $opts{n} ? 'innodb_transactions' : get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+
+ my @cxns = get_connections();
+
+ # If the header is to be shown, buffer pool data is required.
+ get_innodb_status( \@cxns, [ $wanted{t_header} ? qw(bp) : () ] );
+
+ foreach my $cxn ( get_connections() ) {
+ my $set = $vars{$cxn}->{$clock};
+ my $pre = $vars{$cxn}->{$clock-1} || $set;
+
+ next unless $set->{IB_tx_transactions};
+
+ if ( $wanted{t_header} ) {
+ my $hash = extract_values($set, $set, $pre, 't_header');
+ push @t_header, $hash;
+ }
+
+ if ( $wanted{innodb_transactions} ) {
+ my $cur_txns = $set->{IB_tx_transactions};
+ my $pre_txns = $pre->{IB_tx_transactions} || $cur_txns;
+ my %cur_txns = map { $_->{mysql_thread_id} => $_ } @$cur_txns;
+ my %pre_txns = map { $_->{mysql_thread_id} => $_ } @$pre_txns;
+ foreach my $thd_id ( sort keys %cur_txns ) {
+ my $cur_txn = $cur_txns{$thd_id};
+ my $pre_txn = $pre_txns{$thd_id} || $cur_txn;
+ my $hash = extract_values($cur_txn, $cur_txn, $pre_txn, 'innodb_transactions');
+ $hash->{cxn} = $cxn;
+ push @innodb_transactions, $hash;
+ }
+ }
+
+ }
+
+ my $first_table = 0;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+
+ # Save queries in global variable for analysis. The rows in %rows_for have been
+ # filtered, etc as a side effect of set_to_tbl(), so they are the same as the rows
+ # that get pushed to the screen.
+ @current_queries = map {
+ my %hash;
+ @hash{ qw(cxn id db query secs) } = @{$_}{ qw(cxn mysql_thread_id db query_text active_secs) };
+ \%hash;
+ } @{$rows_for{innodb_transactions}};
+
+ draw_screen(\@display_lines);
+}
+
+# display_S {{{3
+sub display_S {
+ my $fmt = get_var_set('S_set');
+ my $func = $config{S_func}->{val};
+ my $inc = $func eq 'g' || $config{status_inc}->{val};
+
+ # The table's meta-data is generated from the compiled var_set.
+ my ( $cols, $visible );
+ if ( $tbl_meta{var_status}->{fmt} && $fmt eq $tbl_meta{var_status}->{fmt} ) {
+ ( $cols, $visible ) = @{$tbl_meta{var_status}}{qw(cols visible)};
+ }
+ else {
+ ( $cols, $visible ) = compile_select_stmt($fmt);
+
+ # Apply missing values to columns. Always apply averages across all connections.
+ map {
+ $_->{agg} = 'avg';
+ $_->{label} = $_->{hdr};
+ } values %$cols;
+
+ $tbl_meta{var_status}->{cols} = $cols;
+ $tbl_meta{var_status}->{visible} = $visible;
+ $tbl_meta{var_status}->{fmt} = $fmt;
+ map { $tbl_meta{var_status}->{cols}->{$_}->{just} = ''} @$visible;
+ }
+
+ my @var_status;
+ my %rows_for = (
+ var_status => \@var_status,
+ );
+
+ my @visible = get_visible_tables();
+ my %wanted = map { $_ => 1 } @visible;
+ my @cxns = get_connections();
+
+ get_status_info(@cxns);
+ get_innodb_status(\@cxns);
+
+ # Set up whether to pivot and how many sets to extract.
+ $tbl_meta{var_status}->{pivot} = $func eq 'v';
+
+ my $num_sets
+ = $func eq 'v'
+ ? $config{num_status_sets}->{val}
+ : 0;
+ foreach my $set ( 0 .. $num_sets ) {
+ my @rows;
+ foreach my $cxn ( @cxns ) {
+ my $vars = $inc ? inc($set, $cxn) : $vars{$cxn}->{$clock - $set};
+ my $cur = $vars{$cxn}->{$clock-$set};
+ my $pre = $vars{$cxn}->{$clock-$set-1} || $cur;
+ next unless $vars && %$vars;
+ my $hash = extract_values($vars, $cur, $pre, 'var_status');
+ push @rows, $hash;
+ }
+ @rows = apply_group_by('var_status', [], @rows);
+ push @var_status, @rows;
+ }
+
+ # Recompile the sort func. TODO: avoid recompiling at every refresh.
+ # Figure out whether the data is all numeric and decide on a sort type.
+ # my $cmp
+ # = scalar(
+ # grep { !defined $_ || $_ !~ m/^\d+$/ }
+ # map { my $col = $_; map { $_->{$col} } @var_status }
+ # $tbl_meta{var_status}->{sort_cols} =~ m/(\w+)/g)
+ # ? 'cmp'
+ # : '<=>';
+ $tbl_meta{var_status}->{sort_func} = make_sort_func($tbl_meta{var_status});
+
+ # ################################################################
+ # Now there is specific display code based on $config{S_func}
+ # ################################################################
+ if ( $func =~ m/s|g/ ) {
+ my $min_width = 4;
+
+ # Clear the screen if the display width changed.
+ if ( @last_term_size && $this_term_size[0] != $last_term_size[0] ) {
+ $lines_printed = 0;
+ $clear_screen_sub->();
+ }
+
+ if ( $func eq 's' ) {
+ # Decide how wide columns should be.
+ my $num_cols = scalar(@$visible);
+ my $width = $opts{n} ? 0 : max($min_width, int(($this_term_size[0] - $num_cols + 1) / $num_cols));
+ my $g_format = $opts{n} ? ( "%s\t" x $num_cols ) : ( "%-${width}s " x $num_cols );
+
+ # Print headers every now and then. Headers can get really long, so compact them.
+ my @hdr = @$visible;
+ if ( $opts{n} ) {
+ if ( $lines_printed == 0 ) {
+ print join("\t", @hdr), "\n";
+ $lines_printed++;
+ }
+ }
+ elsif ( $lines_printed == 0 || $lines_printed > $this_term_size[1] - 2 ) {
+ @hdr = map { donut(crunch($_, $width), $width) } @hdr;
+ print join(' ', map { sprintf( "%${width}s", donut($_, $width)) } @hdr) . "\n";
+ $lines_printed = 1;
+ }
+
+ # Design a column format for the values.
+ my $format
+ = $opts{n}
+ ? join("\t", map { '%s' } @$visible) . "\n"
+ : join(' ', map { "%${width}s" } @hdr) . "\n";
+
+ foreach my $row ( @var_status ) {
+ printf($format, map { defined $_ ? $_ : '' } @{$row}{ @$visible });
+ $lines_printed++;
+ }
+ }
+ else { # 'g' mode
+ # Design a column format for the values.
+ my $num_cols = scalar(@$visible);
+ my $width = $opts{n} ? 0 : int(($this_term_size[0] - $num_cols + 1) / $num_cols);
+ my $format = $opts{n} ? ( "%s\t" x $num_cols ) : ( "%-${width}s " x $num_cols );
+ $format =~ s/\s$/\n/;
+
+ # Print headers every now and then.
+ if ( $opts{n} ) {
+ if ( $lines_printed == 0 ) {
+ print join("\t", @$visible), "\n";
+ print join("\t", map { shorten($mvs{$_}) } @$visible), "\n";
+ }
+ }
+ elsif ( $lines_printed == 0 || $lines_printed > $this_term_size[1] - 2 ) {
+ printf($format, map { donut(crunch($_, $width), $width) } @$visible);
+ printf($format, map { shorten($mvs{$_} || 0) } @$visible);
+ $lines_printed = 2;
+ }
+
+ # Update the max ever seen, and scale by the max ever seen.
+ my $set = $var_status[0];
+ foreach my $col ( @$visible ) {
+ $set->{$col} = 1 unless defined $set->{$col} && $set->{$col} =~ m/$num_regex/;
+ $set->{$col} = ($set->{$col} || 1) / ($set->{Uptime_hires} || 1);
+ $mvs{$col} = max($mvs{$col} || 1, $set->{$col});
+ $set->{$col} /= $mvs{$col};
+ }
+ printf($format, map { ( $config{graph_char}->{val} x int( $width * $set->{$_} )) || '.' } @$visible );
+ $lines_printed++;
+
+ }
+ }
+ else { # 'v'
+ my $first_table = 0;
+ my @display_lines;
+ foreach my $tbl ( @visible ) {
+ push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+ push @display_lines, get_cxn_errors(@cxns)
+ if ( $config{debug}->{val} || !$first_table++ );
+ }
+ $clear_screen_sub->();
+ draw_screen( \@display_lines );
+ }
+}
+
+# display_explain {{{3
+sub display_explain {
+ my $info = shift;
+ my $cxn = $info->{cxn};
+ my $db = $info->{db};
+
+ my ( $mods, $query ) = rewrite_for_explain($info->{query});
+
+ my @display_lines;
+
+ if ( $query ) {
+
+ my $part = version_ge($dbhs{$cxn}->{dbh}, '5.1.5') ? 'PARTITIONS' : '';
+ $query = "EXPLAIN $part\n" . $query;
+
+ eval {
+ if ( $db ) {
+ do_query($cxn, "use $db");
+ }
+ my $sth = do_query($cxn, $query);
+
+ my $res;
+ while ( $res = $sth->fetchrow_hashref() ) {
+ map { $res->{$_} ||= '' } ( 'partitions', keys %$res);
+ my @this_table = create_caption("Sub-Part $res->{id}",
+ create_table2(
+ $tbl_meta{explain}->{visible},
+ meta_to_hdr('explain'),
+ extract_values($res, $res, $res, 'explain')));
+ @display_lines = stack_next(\@display_lines, \@this_table, { pad => ' ', vsep => 2 });
+ }
+ };
+
+ if ( $EVAL_ERROR ) {
+ push @display_lines,
+ '',
+ "The query could not be explained. Only SELECT queries can be "
+ . "explained; innotop tries to rewrite certain REPLACE and INSERT queries "
+ . "into SELECT, but this doesn't always succeed.";
+ }
+
+ }
+ else {
+ push @display_lines, '', 'The query could not be explained.';
+ }
+
+ if ( $mods ) {
+ push @display_lines, '', '[This query has been re-written to be explainable]';
+ }
+
+ unshift @display_lines, no_ctrl_char($query);
+ draw_screen(\@display_lines, { raw => 1 } );
+}
+
+# rewrite_for_explain {{{3
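+# Tries to turn REPLACE/INSERT ... SELECT and CREATE TABLE ... AS SELECT statements
+# into plain SELECTs, and strips ON DUPLICATE KEY UPDATE clauses, so the query can be
+# EXPLAINed. Returns the number of modifications made and the rewritten query.
+# Illustrative example (made-up query):
+#    rewrite_for_explain('INSERT INTO t2 SELECT * FROM t1')
+# should return ( 1, 'select * FROM t1' ).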
+sub rewrite_for_explain {
+ my $query = shift;
+
+ my $mods = 0;
+ my $orig = $query;
+ $mods += $query =~ s/^\s*(?:replace|insert).*?select/select/is;
+ $mods += $query =~ s/^
+ \s*create\s+(?:temporary\s+)?table
+ \s+(?:\S+\s+)as\s+select/select/xis;
+ $mods += $query =~ s/\s+on\s+duplicate\s+key\s+update.*$//is;
+ return ( $mods, $query );
+}
+
+# show_optimized_query {{{3
+sub show_optimized_query {
+ my $info = shift;
+ my $cxn = $info->{cxn};
+ my $db = $info->{db};
+ my $meta = $dbhs{$cxn};
+
+ my @display_lines;
+
+ my ( $mods, $query ) = rewrite_for_explain($info->{query});
+
+ if ( $mods ) {
+ push @display_lines, '[This query has been re-written to be explainable]';
+ }
+
+ if ( $query ) {
+ push @display_lines, no_ctrl_char($info->{query});
+
+ eval {
+ if ( $db ) {
+ do_query($cxn, "use $db");
+ }
+ do_query( $cxn, 'EXPLAIN EXTENDED ' . $query ) or die "Can't explain query";
+ my $sth = do_query($cxn, 'SHOW WARNINGS');
+ my $res = $sth->fetchall_arrayref({});
+
+ if ( $res ) {
+ foreach my $result ( @$res ) {
+ push @display_lines, 'Note:', no_ctrl_char($result->{message});
+ }
+ }
+ else {
+ push @display_lines, '', 'The query optimization could not be generated.';
+ }
+ };
+
+ if ( $EVAL_ERROR ) {
+ push @display_lines, '', "The optimization could not be generated: $EVAL_ERROR";
+ }
+
+ }
+ else {
+ push @display_lines, '', 'The query optimization could not be generated.';
+ }
+
+ draw_screen(\@display_lines, { raw => 1 } );
+}
+
+# display_help {{{3
+sub display_help {
+ my $mode = $config{mode}->{val};
+
+ # Get globally mapped keys, then overwrite them with mode-specific ones.
+ my %keys = map {
+ $_ => $action_for{$_}->{label}
+ } keys %action_for;
+ foreach my $key ( keys %{$modes{$mode}->{action_for}} ) {
+ $keys{$key} = $modes{$mode}->{action_for}->{$key}->{label};
+ }
+ delete $keys{'?'};
+
+ # Split them into three kinds of keys: MODE keys, action keys, and
+ # magic (special character) keys.
+ my @modes = sort grep { m/[A-Z]/ } keys %keys;
+ my @actions = sort grep { m/[a-z]/ } keys %keys;
+ my @magic = sort grep { m/[^A-Z]/i } keys %keys;
+
+ my @display_lines = ( '', 'Switch to a different mode:' );
+
+ # Mode keys
+ my @all_modes = map { "$_ $modes{$_}->{hdr}" } @modes;
+ my @col1 = splice(@all_modes, 0, ceil(@all_modes/3));
+ my @col2 = splice(@all_modes, 0, ceil(@all_modes/2));
+ my $max1 = max(map {length($_)} @col1);
+ my $max2 = max(map {length($_)} @col2);
+ while ( @col1 ) {
+ push @display_lines, sprintf(" %-${max1}s %-${max2}s %s",
+ (shift @col1 || ''),
+ (shift @col2 || ''),
+ (shift @all_modes || ''));
+ }
+
+ # Action keys
+ my @all_actions = map { "$_ $keys{$_}" } @actions;
+ @col1 = splice(@all_actions, 0, ceil(@all_actions/2));
+ $max1 = max(map {length($_)} @col1);
+ push @display_lines, '', 'Actions:';
+ while ( @col1 ) {
+ push @display_lines, sprintf(" %-${max1}s %s",
+ (shift @col1 || ''),
+ (shift @all_actions || ''));
+ }
+
+ # Magic keys
+ my @all_magic = map { sprintf('%4s', $action_for{$_}->{key} || $_) . " $keys{$_}" } @magic;
+ @col1 = splice(@all_magic, 0, ceil(@all_magic/2));
+ $max1 = max(map {length($_)} @col1);
+ push @display_lines, '', 'Other:';
+ while ( @col1 ) {
+ push @display_lines, sprintf("%-${max1}s%s",
+ (shift @col1 || ''),
+ (shift @all_magic || ''));
+ }
+
+ $clear_screen_sub->();
+ draw_screen(\@display_lines, { show_all => 1 } );
+ pause();
+ $clear_screen_sub->();
+}
+
+# show_full_query {{{3
+sub show_full_query {
+ my $info = shift;
+ my @display_lines = no_ctrl_char($info->{query});
+ draw_screen(\@display_lines, { raw => 1 });
+}
+
+# Formatting functions {{{2
+
+# create_table2 {{{3
+# Makes a two-column table, labels on left, data on right.
+# Takes refs of @cols, %labels, %data, and an optional %user_prefs.
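+# Illustrative example (made-up data): given cols [qw(a b)], labels
+# { a => 'Apples', b => 'Bananas' } and data { a => 1, b => 2 }, the default
+# settings right-align the labels and left-align the values, roughly:
+#     Apples: 1
+#    Bananas: 2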
+sub create_table2 {
+ my ( $cols, $labels, $data, $user_prefs ) = @_;
+ my @rows;
+
+ if ( @$cols && %$data ) {
+
+ # Override defaults
+ my $p = {
+ just => '',
+ sep => ':',
+ just1 => '-',
+ };
+ if ( $user_prefs ) {
+ map { $p->{$_} = $user_prefs->{$_} } keys %$user_prefs;
+ }
+
+ # Fix undef values
+ map { $data->{$_} = '' unless defined $data->{$_} } @$cols;
+
+ # Format the table
+ my $max_l = max(map{ length($labels->{$_}) } @$cols);
+ my $max_v = max(map{ length($data->{$_}) } @$cols);
+ my $format = "%$p->{just}${max_l}s$p->{sep} %$p->{just1}${max_v}s";
+ foreach my $col ( @$cols ) {
+ push @rows, sprintf($format, $labels->{$col}, $data->{$col});
+ }
+ }
+ return @rows;
+}
+
+# stack_next {{{3
+# Stacks one display section next to the other. Accepts left-hand arrayref,
+# right-hand arrayref, and options hashref. Tries to stack as high as
+# possible, so
+# aaaaaa
+# bbb
+# can stack ccc next to the bbb.
+# NOTE: this DOES modify its arguments, even though it returns a new array.
+sub stack_next {
+ my ( $left, $right, $user_prefs ) = @_;
+ my @result;
+
+ my $p = {
+ pad => ' ',
+ vsep => 0,
+ };
+ if ( $user_prefs ) {
+ map { $p->{$_} = $user_prefs->{$_} } keys %$user_prefs;
+ }
+
+ # Find out how wide the LHS can be and still let the RHS fit next to it.
+ my $pad = $p->{pad};
+ my $max_r = max( map { length($_) } @$right) || 0;
+ my $max_l = $this_term_size[0] - $max_r - length($pad);
+
+ # Find the minimum row on the LHS that the RHS will fit next to.
+ my $i = scalar(@$left) - 1;
+ while ( $i >= 0 && length($left->[$i]) <= $max_l ) {
+ $i--;
+ }
+ $i++;
+ my $offset = $i;
+
+ if ( $i < scalar(@$left) ) {
+ # Find the max width of the section of the LHS against which the RHS
+ # will sit.
+ my $max_i_in_common = min($i + scalar(@$right) - 1, scalar(@$left) - 1);
+ my $max_width = max( map { length($_) } @{$left}[$i..$max_i_in_common]);
+
+ # Append the RHS onto the LHS until one runs out.
+ while ( $i < @$left && $i - $offset < @$right ) {
+ my $format = "%-${max_width}s$pad%${max_r}s";
+ $left->[$i] = sprintf($format, $left->[$i], $right->[$i - $offset]);
+ $i++;
+ }
+ while ( $i - $offset < @$right ) {
+ # There is more RHS to push on the end of the array
+ push @$left,
+ sprintf("%${max_width}s$pad%${max_r}s", ' ', $right->[$i - $offset]);
+ $i++;
+ }
+ push @result, @$left;
+ }
+ else {
+ # There is no room to put them side by side. Add them below, with
+ # a blank line above them if specified.
+ push @result, @$left;
+ push @result, (' ' x $this_term_size[0]) if $p->{vsep} && @$left;
+ push @result, @$right;
+ }
+ return @result;
+}
+
+# create_caption {{{3
+sub create_caption {
+ my ( $caption, @rows ) = @_;
+ if ( @rows ) {
+
+ # Calculate the width of what will be displayed, so it can be centered
+ # in that space. When the thing is wider than the display, center the
+ # caption in the display.
+ my $width = min($this_term_size[0], max(map { length(ref($_) ? $_->[0] : $_) } @rows));
+
+ my $cap_len = length($caption);
+
+ # It may be narrow enough to pad the sides with underscores and save a
+ # line on the screen.
+ if ( $cap_len <= $width - 6 ) {
+ my $left = int(($width - 2 - $cap_len) / 2);
+ unshift @rows,
+ ("_" x $left) . " $caption " . ("_" x ($width - $left - $cap_len - 2));
+ }
+
+ # The caption is too wide to add underscores on each side.
+ else {
+
+ # Color is supported, so we can use terminal underlining.
+ if ( $config{color}->{val} ) {
+ my $left = int(($width - $cap_len) / 2);
+ unshift @rows, [
+ (" " x $left) . $caption . (" " x ($width - $left - $cap_len)),
+ 'underline',
+ ];
+ }
+
+ # Color is not supported, so we have to add a line underneath to separate the
+ # caption from whatever it's captioning.
+ else {
+ my $left = int(($width - $cap_len) / 2);
+ unshift @rows, ('-' x $width);
+ unshift @rows, (" " x $left) . $caption . (" " x ($width - $left - $cap_len));
+ }
+
+ # The caption is wider than the thing it labels, so we have to pad the
+ # thing it labels to a consistent width.
+ if ( $cap_len > $width ) {
+ @rows = map {
+ ref($_)
+ ? [ sprintf('%-' . $cap_len . 's', $_->[0]), $_->[1] ]
+ : sprintf('%-' . $cap_len . 's', $_);
+ } @rows;
+ }
+
+ }
+ }
+ return @rows;
+}
+
+# create_table {{{3
+# Input: an arrayref of columns, hashref of col info, and an arrayref of hashes
+# Example: [ 'a', 'b' ]
+# { a => spec, b => spec }
+# [ { a => 1, b => 2}, { a => 3, b => 4 } ]
+# The 'spec' is a hashref of hdr => label, just => ('-' or ''). It also supports minimum and
+# maximum widths via the minw and maxw params.
+# Output: an array of strings, one per row.
+# Example:
+# Column One Column Two
+# ---------- ----------
+# 1 2
+# 3 4
+sub create_table {
+ my ( $cols, $info, $data, $prefs ) = @_;
+ $prefs ||= {};
+ $prefs->{no_hdr} ||= ($opts{n} && $clock != 1);
+
+ # Truncate rows that will surely be off screen even if this is the only table.
+ if ( !$opts{n} && !$prefs->{raw} && !$prefs->{show_all} && $this_term_size[1] < @$data-1 ) {
+ $data = [ @$data[0..$this_term_size[1] - 1] ];
+ }
+
+ my @rows = ();
+
+ if ( @$cols && %$info ) {
+
+ # Fix undef values, collapse whitespace.
+ foreach my $row ( @$data ) {
+ map { $row->{$_} = collapse_ws($row->{$_}) } @$cols;
+ }
+
+ my $col_sep = $opts{n} ? "\t" : ' ';
+
+ # Find each column's max width.
+ my %width_for;
+ if ( !$opts{n} ) {
+ %width_for = map {
+ my $col_name = $_;
+ if ( $info->{$_}->{dec} ) {
+ # Align along the decimal point
+ my $max_rodp = max(0, map { $_->{$col_name} =~ m/([^\s\d-].*)$/ ? length($1) : 0 } @$data);
+ foreach my $row ( @$data ) {
+ my $col = $row->{$col_name};
+ my ( $l, $r ) = $col =~ m/^([\s\d]*)(.*)$/;
+ $row->{$col_name} = sprintf("%s%-${max_rodp}s", $l, $r);
+ }
+ }
+ my $max_width = max( length($info->{$_}->{hdr}), map { length($_->{$col_name}) } @$data);
+ if ( $info->{$col_name}->{maxw} ) {
+ $max_width = min( $max_width, $info->{$col_name}->{maxw} );
+ }
+ if ( $info->{$col_name}->{minw} ) {
+ $max_width = max( $max_width, $info->{$col_name}->{minw} );
+ }
+ $col_name => $max_width;
+ } @$cols;
+ }
+
+ # The table header.
+ if ( !$config{hide_hdr}->{val} && !$prefs->{no_hdr} ) {
+ push @rows, $opts{n}
+ ? join( $col_sep, @$cols )
+ : join( $col_sep, map { sprintf( "%-$width_for{$_}s", trunc($info->{$_}->{hdr}, $width_for{$_}) ) } @$cols );
+ if ( $config{color}->{val} && $config{header_highlight}->{val} ) {
+ push @rows, [ pop @rows, $config{header_highlight}->{val} ];
+ }
+ elsif ( !$opts{n} ) {
+ push @rows, join( $col_sep, map { "-" x $width_for{$_} } @$cols );
+ }
+ }
+
+ # The table data.
+ if ( $opts{n} ) {
+ foreach my $item ( @$data ) {
+ push @rows, join($col_sep, map { $item->{$_} } @$cols );
+ }
+ }
+ else {
+ my $format = join( $col_sep,
+ map { "%$info->{$_}->{just}$width_for{$_}s" } @$cols );
+ foreach my $item ( @$data ) {
+ my $row = sprintf($format, map { trunc($item->{$_}, $width_for{$_}) } @$cols );
+ if ( $config{color}->{val} && $item->{_color} ) {
+ push @rows, [ $row, $item->{_color} ];
+ }
+ else {
+ push @rows, $row;
+ }
+ }
+ }
+ }
+
+ return @rows;
+}
+
+# Aggregates a table. If $group_by is an arrayref of columns, the grouping key
+# is the specified columns; otherwise it's just the empty string (i.e.
+# everything is grouped as one group).
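+# Illustrative example (made-up data): apply_group_by('processlist', ['cxn'], @rows)
+# would collapse all rows sharing a cxn value into one row, combining every other
+# column with its 'agg' function from the table's meta-data (default 'first').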
+sub apply_group_by {
+ my ( $tbl, $group_by, @rows ) = @_;
+ my $meta = $tbl_meta{$tbl};
+ my %is_group = map { $_ => 1 } @$group_by;
+ my @non_grp = grep { !$is_group{$_} } keys %{$meta->{cols}};
+
+ my %temp_table;
+ foreach my $row ( @rows ) {
+ my $group_key
+ = @$group_by
+ ? '{' . join('}{', map { defined $_ ? $_ : '' } @{$row}{@$group_by}) . '}'
+ : '';
+ $temp_table{$group_key} ||= [];
+ push @{$temp_table{$group_key}}, $row;
+ }
+
+ # Crush the rows together...
+ my @new_rows;
+ foreach my $key ( sort keys %temp_table ) {
+ my $group = $temp_table{$key};
+ my %new_row;
+ @new_row{@$group_by} = @{$group->[0]}{@$group_by};
+ foreach my $col ( @non_grp ) {
+ my $agg = $meta->{cols}->{$col}->{agg} || 'first';
+ $new_row{$col} = $agg_funcs{$agg}->( map { $_->{$col} } @$group );
+ }
+ push @new_rows, \%new_row;
+ }
+ return @new_rows;
+}
+
+# set_to_tbl {{{3
+# Unifies all the work of filtering, sorting etc. Alters the input.
+# TODO: pull all the little pieces out into subroutines and stick events in each of them.
+sub set_to_tbl {
+ my ( $rows, $tbl ) = @_;
+ my $meta = $tbl_meta{$tbl} or die "No such table $tbl in tbl_meta";
+
+ if ( !$meta->{pivot} ) {
+
+ # Hook in event listeners
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_filter}} ) {
+ $listener->set_to_tbl_pre_filter($rows, $tbl);
+ }
+
+ # Apply filters. Note that if the table is pivoted, filtering and sorting
+ # are applied later.
+ foreach my $filter ( @{$meta->{filters}} ) {
+ eval {
+ @$rows = grep { $filters{$filter}->{func}->($_) } @$rows;
+ };
+ if ( $EVAL_ERROR && $config{debug}->{val} ) {
+ die $EVAL_ERROR;
+ }
+ }
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_sort}} ) {
+ $listener->set_to_tbl_pre_sort($rows, $tbl);
+ }
+
+ # Sort. Note that if the table is pivoted, sorting might have the wrong
+ # columns and it could crash. This will only be an issue if it's possible
+ # to toggle pivoting on and off, which it's not at the moment.
+ if ( @$rows && $meta->{sort_func} && !$meta->{aggregate} ) {
+ if ( $meta->{sort_dir} > 0 ) {
+ @$rows = $meta->{sort_func}->( @$rows );
+ }
+ else {
+ @$rows = reverse $meta->{sort_func}->( @$rows );
+ }
+ }
+
+ }
+
+ # Stop altering arguments now.
+ my @rows = @$rows;
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_group}} ) {
+ $listener->set_to_tbl_pre_group(\@rows, $tbl);
+ }
+
+ # Apply group-by.
+ if ( $meta->{aggregate} ) {
+ @rows = apply_group_by($tbl, $meta->{group_by}, @rows);
+
+ # Sort. Note that if the table is pivoted, sorting might have the wrong
+ # columns and it could crash. This will only be an issue if it's possible
+ # to toggle pivoting on and off, which it's not at the moment.
+ if ( @rows && $meta->{sort_func} ) {
+ if ( $meta->{sort_dir} > 0 ) {
+ @rows = $meta->{sort_func}->( @rows );
+ }
+ else {
+ @rows = reverse $meta->{sort_func}->( @rows );
+ }
+ }
+
+ }
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_colorize}} ) {
+ $listener->set_to_tbl_pre_colorize(\@rows, $tbl);
+ }
+
+ if ( !$meta->{pivot} ) {
+ # Colorize. Adds a _color column to rows.
+ if ( @rows && $meta->{color_func} ) {
+ eval {
+ foreach my $row ( @rows ) {
+ $row->{_color} = $meta->{color_func}->($row);
+ }
+ };
+ if ( $EVAL_ERROR ) {
+ pause($EVAL_ERROR);
+ }
+ }
+ }
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_transform}} ) {
+ $listener->set_to_tbl_pre_transform(\@rows, $tbl);
+ }
+
+ # Apply_transformations.
+ if ( @rows ) {
+ my $cols = $meta->{cols};
+ foreach my $col ( keys %{$rows->[0]} ) {
+         # Don't auto-vivify $tbl_meta{$tbl}->{cols}->{_color}->{trans}
+ next if $col eq '_color';
+ foreach my $trans ( @{$cols->{$col}->{trans}} ) {
+ map { $_->{$col} = $trans_funcs{$trans}->($_->{$col}) } @rows;
+ }
+ }
+ }
+
+ my ($fmt_cols, $fmt_meta);
+
+ # Pivot.
+ if ( $meta->{pivot} ) {
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_pivot}} ) {
+ $listener->set_to_tbl_pre_pivot(\@rows, $tbl);
+ }
+
+ my @vars = @{$meta->{visible}};
+ my @tmp = map { { name => $_ } } @vars;
+ my @cols = 'name';
+ foreach my $i ( 0..@$rows-1 ) {
+ my $col = "set_$i";
+ push @cols, $col;
+ foreach my $j ( 0..@vars-1 ) {
+ $tmp[$j]->{$col} = $rows[$i]->{$vars[$j]};
+ }
+ }
+ $fmt_meta = { map { $_ => { hdr => $_, just => '-' } } @cols };
+ $fmt_cols = \@cols;
+ @rows = @tmp;
+
+ # Hook in event listeners
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_filter}} ) {
+ $listener->set_to_tbl_pre_filter($rows, $tbl);
+ }
+
+ # Apply filters.
+ foreach my $filter ( @{$meta->{filters}} ) {
+ eval {
+ @rows = grep { $filters{$filter}->{func}->($_) } @rows;
+ };
+ if ( $EVAL_ERROR && $config{debug}->{val} ) {
+ die $EVAL_ERROR;
+ }
+ }
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_sort}} ) {
+ $listener->set_to_tbl_pre_sort($rows, $tbl);
+ }
+
+ # Sort.
+ if ( @rows && $meta->{sort_func} ) {
+ if ( $meta->{sort_dir} > 0 ) {
+ @rows = $meta->{sort_func}->( @rows );
+ }
+ else {
+ @rows = reverse $meta->{sort_func}->( @rows );
+ }
+ }
+
+ }
+ else {
+ # If the table isn't pivoted, just show all columns that are supposed to
+ # be shown; but eliminate aggonly columns if the table isn't aggregated.
+ my $aggregated = $meta->{aggregate};
+ $fmt_cols = [ grep { $aggregated || !$meta->{cols}->{$_}->{aggonly} } @{$meta->{visible}} ];
+ $fmt_meta = { map { $_ => $meta->{cols}->{$_} } @$fmt_cols };
+
+ # If the table is aggregated, re-order the group_by columns to the left of
+ # the display.
+ if ( $aggregated ) {
+ my %is_group = map { $_ => 1 } @{$meta->{group_by}};
+ $fmt_cols = [ @{$meta->{group_by}}, grep { !$is_group{$_} } @$fmt_cols ];
+ }
+ }
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_create}} ) {
+ $listener->set_to_tbl_pre_create(\@rows, $tbl);
+ }
+
+ @rows = create_table( $fmt_cols, $fmt_meta, \@rows);
+ if ( !$meta->{hide_caption} && !$opts{n} && $config{display_table_captions}->{val} ) {
+ @rows = create_caption($meta->{capt}, @rows)
+ }
+
+ foreach my $listener ( @{$event_listener_for{set_to_tbl_post_create}} ) {
+ $listener->set_to_tbl_post_create(\@rows, $tbl);
+ }
+
+ return @rows;
+}
+
+# meta_to_hdr {{{3
+sub meta_to_hdr {
+ my $tbl = shift;
+ my $meta = $tbl_meta{$tbl};
+ my %labels = map { $_ => $meta->{cols}->{$_}->{hdr} } @{$meta->{visible}};
+ return \%labels;
+}
+
+# commify {{{3
+# From perlfaq5: add commas.
+sub commify {
+ my ( $num ) = @_;
+ $num = 0 unless defined $num;
+ $num =~ s/(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/$1,/g;
+ return $num;
+}
+
+# set_precision {{{3
+# Trim to desired precision.
+sub set_precision {
+ my ( $num, $precision ) = @_;
+ $precision = $config{num_digits}->{val} if !defined $precision;
+ sprintf("%.${precision}f", $num);
+}
+
+# percent {{{3
+# Convert to percent
+sub percent {
+ my ( $num ) = @_;
+ $num = 0 unless defined $num;
+ my $digits = $config{num_digits}->{val};
+ return sprintf("%.${digits}f", $num * 100)
+ . ($config{show_percent}->{val} ? '%' : '');
+}
+
+# shorten {{{3
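+# Scales a number down by factors of 1,024 and appends k/M/G/T, trimmed to the
+# configured precision; e.g. shorten(2048) should give '2.00k' if num_digits is 2.
+# Note that $opts{n} below is the global command-line option hash, not the $opts
+# hashref parameter.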
+sub shorten {
+ my ( $num, $opts ) = @_;
+
+ return $num if !defined($num) || $opts{n} || $num !~ m/$num_regex/;
+
+ $opts ||= {};
+ my $pad = defined $opts->{pad} ? $opts->{pad} : '';
+ my $num_digits = defined $opts->{num_digits}
+ ? $opts->{num_digits}
+ : $config{num_digits}->{val};
+ my $force = defined $opts->{force};
+
+ my $n = 0;
+ while ( $num >= 1_024 ) {
+ $num /= 1_024;
+ ++$n;
+ }
+ return sprintf(
+ $num =~ m/\./ || $n || $force
+ ? "%.${num_digits}f%s"
+ : '%d',
+ $num, ($pad,'k','M','G', 'T')[$n]);
+
+}
+
+# Utility functions {{{2
+# unique {{{3
+sub unique {
+ my %seen;
+ return grep { !$seen{$_}++ } @_;
+}
+
+# make_color_func {{{3
+sub make_color_func {
+ my ( $tbl ) = @_;
+ my @criteria;
+ foreach my $spec ( @{$tbl->{colors}} ) {
+ next unless exists $comp_ops{$spec->{op}};
+ my $val = $spec->{op} =~ m/^(?:eq|ne|le|ge|lt|gt)$/ ? "'$spec->{arg}'"
+ : $spec->{op} =~ m/^(?:=~|!~)$/ ? "m/" . quotemeta($spec->{arg}) . "/"
+ : $spec->{arg};
+ push @criteria,
+ "( defined \$set->{$spec->{col}} && \$set->{$spec->{col}} $spec->{op} $val ) { return '$spec->{color}'; }";
+ }
+ return undef unless @criteria;
+ my $sub = eval 'sub { my ( $set ) = @_; if ' . join(" elsif ", @criteria) . '}';
+ die if $EVAL_ERROR;
+ return $sub;
+}
+
+# make_sort_func {{{3
+# Gets a list of sort columns from the table, like "+cxn -time" and returns a
+# subroutine that will sort that way.
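+# For example, with sort_cols '+cxn -time' the generated sub sorts ascending by
+# cxn, then descending by time, using numeric or string comparison according to
+# each column's 'num' flag.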
+sub make_sort_func {
+ my ( $tbl ) = @_;
+ my @criteria;
+
+ # Pivoted tables can be sorted by 'name' and set_x columns; others must be
+ # sorted by existing columns. TODO: this will crash if you toggle between
+ # pivoted and nonpivoted. I have several other 'crash' notes about this if
+ # this ever becomes possible.
+
+ if ( $tbl->{pivot} ) {
+      # A numeric sort type is not really possible on pivoted columns, because a
+      # 'column' contains data from an entire non-pivoted row, so there could be a
+      # mix of numeric and non-numeric data.  Thus everything has to be 'cmp' type.
+ foreach my $col ( split(/\s+/, $tbl->{sort_cols} ) ) {
+ next unless $col;
+ my ( $dir, $name ) = $col =~ m/([+-])?(\w+)$/;
+ next unless $name && $name =~ m/^(?:name|set_\d+)$/;
+ $dir ||= '+';
+ my $op = 'cmp';
+ my $df = "''";
+ push @criteria,
+ $dir eq '+'
+ ? "(\$a->{$name} || $df) $op (\$b->{$name} || $df)"
+ : "(\$b->{$name} || $df) $op (\$a->{$name} || $df)";
+ }
+ }
+ else {
+ foreach my $col ( split(/\s+/, $tbl->{sort_cols} ) ) {
+ next unless $col;
+ my ( $dir, $name ) = $col =~ m/([+-])?(\w+)$/;
+ next unless $name && $tbl->{cols}->{$name};
+ $dir ||= '+';
+ my $op = $tbl->{cols}->{$name}->{num} ? "<=>" : "cmp";
+ my $df = $tbl->{cols}->{$name}->{num} ? "0" : "''";
+ push @criteria,
+ $dir eq '+'
+ ? "(\$a->{$name} || $df) $op (\$b->{$name} || $df)"
+ : "(\$b->{$name} || $df) $op (\$a->{$name} || $df)";
+ }
+ }
+ return sub { return @_ } unless @criteria;
+ my $sub = eval 'sub { sort {' . join("||", @criteria) . '} @_; }';
+ die if $EVAL_ERROR;
+ return $sub;
+}
+
+# trunc {{{3
+# Shortens text to specified length.
+sub trunc {
+ my ( $text, $len ) = @_;
+ if ( length($text) <= $len ) {
+ return $text;
+ }
+ return substr($text, 0, $len);
+}
+
+# donut {{{3
+# Takes out the middle of text to shorten it.
+sub donut {
+ my ( $text, $len ) = @_;
+ return $text if length($text) <= $len;
+ my $max = length($text) - $len;
+ my $min = $max - 1;
+
+ # Try to remove a single "word" from somewhere in the center
+ if ( $text =~ s/_[^_]{$min,$max}_/_/ ) {
+ return $text;
+ }
+
+ # Prefer removing the end of a "word"
+ if ( $text =~ s/([^_]+)[^_]{$max}_/$1_/ ) {
+ return $text;
+ }
+
+ $text = substr($text, 0, int($len/2))
+ . "_"
+ . substr($text, int($len/2) + $max + 1);
+ return $text;
+}
+
+# crunch {{{3
+# Removes vowels and compacts repeated letters to shorten text.
+sub crunch {
+ my ( $text, $len ) = @_;
+ return $text if $len && length($text) <= $len;
+ $text =~ s/^IB_\w\w_//;
+ $text =~ s/(?<![_ ])[aeiou]//g;
+ $text =~ s/(.)\1+/$1/g;
+ return $text;
+}
+
+# collapse_ws {{{3
+# Collapses all whitespace to a single space.
+sub collapse_ws {
+ my ( $text ) = @_;
+ return '' unless defined $text;
+ $text =~ s/\s+/ /g;
+ return $text;
+}
+
+# Strips out non-printable characters within fields, which freak terminals out.
+sub no_ctrl_char {
+ my ( $text ) = @_;
+ return '' unless defined $text;
+ my $charset = $config{charset}->{val};
+ if ( $charset && $charset eq 'unicode' ) {
+ $text =~ s/
+ ("(?:(?!(?<!\\)").)*" # Double-quoted string
+ |'(?:(?!(?<!\\)').)*') # Or single-quoted string
+ /$1 =~ m#\p{IsC}# ? "[BINARY]" : $1/egx;
+ }
+ elsif ( $charset && $charset eq 'none' ) {
+ $text =~ s/
+ ("(?:(?!(?<!\\)").)*"
+ |'(?:(?!(?<!\\)').)*')
+ /[TEXT]/gx;
+ }
+ else { # The default is 'ascii'
+ $text =~ s/
+ ("(?:(?!(?<!\\)").)*"
+ |'(?:(?!(?<!\\)').)*')
+ /$1 =~ m#[^\040-\176]# ? "[BINARY]" : $1/egx;
+ }
+ return $text;
+}
+
+# word_wrap {{{3
+# Wraps text at word boundaries so it fits the screen.
+sub word_wrap {
+ my ( $text, $width) = @_;
+ $width ||= $this_term_size[0];
+ $text =~ s/(.{0,$width})(?:\s+|$)/$1\n/g;
+ $text =~ s/ +$//mg;
+ return $text;
+}
+
+# draw_screen {{{3
+# Prints lines to the screen. The first argument is an arrayref. Each
+# element of the array is either a string or an arrayref. If it's a string it
+# just gets printed. If it's an arrayref, the first element is the string to
+# print, and the second is args to colored().
+sub draw_screen {
+ my ( $display_lines, $prefs ) = @_;
+ if ( !$opts{n} && $config{show_statusbar}->{val} ) {
+ unshift @$display_lines, create_statusbar();
+ }
+
+ foreach my $listener ( @{$event_listener_for{draw_screen}} ) {
+ $listener->draw_screen($display_lines);
+ }
+
+ $clear_screen_sub->()
+ if $prefs->{clear} || !$modes{$config{mode}->{val}}->{no_clear_screen};
+ if ( $opts{n} || $prefs->{raw} ) {
+ my $num_lines = 0;
+ print join("\n",
+ map {
+ $num_lines++;
+ ref $_
+ ? colored($_->[0], $_->[1])
+ : $_;
+ }
+ grep { !$opts{n} || $_ } # Suppress empty lines
+ @$display_lines);
+ if ( $opts{n} && $num_lines ) {
+ print "\n";
+ }
+ }
+ else {
+ my $max_lines = $prefs->{show_all}
+ ? scalar(@$display_lines)- 1
+ : min(scalar(@$display_lines), $this_term_size[1]);
+ print join("\n",
+ map {
+ ref $_
+ ? colored(substr($_->[0], 0, $this_term_size[0]), $_->[1])
+ : substr($_, 0, $this_term_size[0]);
+ } @$display_lines[0..$max_lines - 1]);
+ }
+}
+
+# secs_to_time {{{3
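+# Formats a number of seconds as a human-readable duration; for example, 90061
+# becomes '1+01:01:01', 3661 becomes '01:01:01', and 61 becomes '01:01'.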
+sub secs_to_time {
+ my ( $secs, $fmt ) = @_;
+ $secs ||= 0;
+ return '00:00' unless $secs;
+
+ # Decide what format to use, if not given
+ $fmt ||= $secs >= 86_400 ? 'd'
+ : $secs >= 3_600 ? 'h'
+ : 'm';
+
+ return
+ $fmt eq 'd' ? sprintf(
+ "%d+%02d:%02d:%02d",
+ int($secs / 86_400),
+ int(($secs % 86_400) / 3_600),
+ int(($secs % 3_600) / 60),
+ $secs % 60)
+ : $fmt eq 'h' ? sprintf(
+ "%02d:%02d:%02d",
+ int(($secs % 86_400) / 3_600),
+ int(($secs % 3_600) / 60),
+ $secs % 60)
+ : sprintf(
+ "%02d:%02d",
+ int(($secs % 3_600) / 60),
+ $secs % 60);
+}
+
+# dulint_to_int {{{3
+# Takes a number that InnoDB formats as two ulint integers, like transaction IDs
+# and such, and turns it into a single integer.
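+# For example, '0 45670' becomes 45670, and '1 45670' becomes 45670 + $MAX_ULONG.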
+sub dulint_to_int {
+ my $num = shift;
+ return 0 unless $num;
+ my ( $high, $low ) = $num =~ m/^(\d+) (\d+)$/;
+ return $low unless $high;
+ return $low + ( $high * $MAX_ULONG );
+}
+
+# create_statusbar {{{3
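+# Builds the status line shown at the top of the screen: the mode header on the left
+# and, on the right, either per-server details (uptime, QPS, threads, InnoDB info)
+# for a single connection or a summary of the connection list.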
+sub create_statusbar {
+ my $mode = $config{mode}->{val};
+ my @cxns = sort { $a cmp $b } get_connections();
+
+ my $modeline = ( $config{readonly}->{val} ? '[RO] ' : '' )
+ . $modes{$mode}->{hdr} . " (? for help)";
+ my $mode_width = length($modeline);
+ my $remaining_width = $this_term_size[0] - $mode_width - 1;
+ my $result;
+
+ # The thingie in top-right that says what we're monitoring.
+ my $cxn = '';
+
+ if ( 1 == @cxns && $dbhs{$cxns[0]} && $dbhs{$cxns[0]}->{dbh} ) {
+ $cxn = $dbhs{$cxns[0]}->{dbh}->{mysql_serverinfo} || '';
+ }
+ else {
+ if ( $modes{$mode}->{server_group} ) {
+ $cxn = "Servers: " . $modes{$mode}->{server_group};
+ my $err_count = grep { $dbhs{$_} && $dbhs{$_}->{err_count} } @cxns;
+ if ( $err_count ) {
+ $cxn .= "(" . ( scalar(@cxns) - $err_count ) . "/" . scalar(@cxns) . ")";
+ }
+ }
+ else {
+ $cxn = join(' ', map { ($dbhs{$_}->{err_count} ? '!' : '') . $_ }
+ grep { $dbhs{$_} } @cxns);
+ }
+ }
+
+ if ( 1 == @cxns ) {
+ get_driver_status(@cxns);
+ my $vars = $vars{$cxns[0]}->{$clock};
+ my $inc = inc(0, $cxns[0]);
+
+ # Format server uptime human-readably, calculate QPS...
+ my $uptime = secs_to_time( $vars->{Uptime_hires} );
+ my $qps = ($inc->{Questions}||0) / ($inc->{Uptime_hires}||1);
+ my $ibinfo = '';
+
+ if ( exists $vars->{IB_last_secs} ) {
+ $ibinfo .= "InnoDB $vars->{IB_last_secs}s ";
+ if ( $vars->{IB_got_all} ) {
+ if ( ($mode eq 'T' || $mode eq 'W')
+ && $vars->{IB_tx_is_truncated} ) {
+ $ibinfo .= ':^|';
+ }
+ else {
+ $ibinfo .= ':-)';
+ }
+ }
+ else {
+ $ibinfo .= ':-(';
+ }
+ }
+ $result = sprintf(
+ "%-${mode_width}s %${remaining_width}s",
+ $modeline,
+ join(', ', grep { $_ } (
+ $cxns[0],
+ $uptime,
+ $ibinfo,
+ shorten($qps) . " QPS",
+ ($vars->{Threads} || 0) . " thd",
+ $cxn)));
+ }
+ else {
+ $result = sprintf(
+ "%-${mode_width}s %${remaining_width}s",
+ $modeline,
+ $cxn);
+ }
+
+ return $config{color}->{val} ? [ $result, 'bold reverse' ] : $result;
+}
+
+# Database connections {{{3
+sub add_new_dsn {
+ my ( $name ) = @_;
+
+ if ( defined $name ) {
+ $name =~ s/[\s:;]//g;
+ }
+
+ if ( !$name ) {
+ print word_wrap("Choose a name for the connection. It cannot contain "
+ . "whitespace, colons or semicolons."), "\n\n";
+ do {
+ $name = prompt("Enter a name");
+ $name =~ s/[\s:;]//g;
+ } until ( $name );
+ }
+
+ my $dsn;
+ do {
+ $clear_screen_sub->();
+ print "Typical DSN strings look like\n DBI:mysql:;host=hostname;port=port\n"
+ . "The db and port are optional and can usually be omitted.\n"
+ . "If you specify 'mysql_read_default_group=mysql' many options can be read\n"
+ . "from your mysql options files (~/.my.cnf, /etc/my.cnf).\n\n";
+ $dsn = prompt("Enter a DSN string", undef, "DBI:mysql:;mysql_read_default_group=mysql;host=$name");
+ } until ( $dsn );
+
+ $clear_screen_sub->();
+ my $dl_table = prompt("Optional: enter a table (must not exist) to use when resetting InnoDB deadlock information",
+ undef, 'test.innotop_dl');
+
+ $connections{$name} = {
+ dsn => $dsn,
+ dl_table => $dl_table,
+ };
+}
+
+sub add_new_server_group {
+ my ( $name ) = @_;
+
+ if ( defined $name ) {
+ $name =~ s/[\s:;]//g;
+ }
+
+ if ( !$name ) {
+ print word_wrap("Choose a name for the group. It cannot contain "
+ . "whitespace, colons or semicolons."), "\n\n";
+ do {
+ $name = prompt("Enter a name");
+ $name =~ s/[\s:;]//g;
+ } until ( $name );
+ }
+
+ my @cxns;
+ do {
+ $clear_screen_sub->();
+ @cxns = select_cxn("Choose servers for $name", keys %connections);
+ } until ( @cxns );
+
+ $server_groups{$name} = \@cxns;
+ return $name;
+}
+
+sub get_var_set {
+ my ( $name ) = @_;
+ while ( !$name || !exists($var_sets{$config{$name}->{val}}) ) {
+ $name = choose_var_set($name);
+ }
+ return $var_sets{$config{$name}->{val}}->{text};
+}
+
+sub add_new_var_set {
+ my ( $name ) = @_;
+
+ if ( defined $name ) {
+ $name =~ s/\W//g;
+ }
+
+ if ( !$name ) {
+ do {
+ $name = prompt("Enter a name");
+ $name =~ s/\W//g;
+ } until ( $name );
+ }
+
+ my $variables;
+ do {
+ $clear_screen_sub->();
+ $variables = prompt("Enter variables for $name", undef );
+ } until ( $variables );
+
+ $var_sets{$name} = { text => $variables, user => 1 };
+}
+
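+# Switches the current mode to the next connection, in alphabetical order, and
+# clears any active server group.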
+sub next_server {
+ my $mode = $config{mode}->{val};
+ my @cxns = sort keys %connections;
+ my ($cur) = get_connections($mode);
+ $cur ||= $cxns[0];
+ my $pos = grep { $_ lt $cur } @cxns;
+ my $newpos = ($pos + 1) % @cxns;
+ $modes{$mode}->{server_group} = '';
+ $modes{$mode}->{connections} = [ $cxns[$newpos] ];
+ $clear_screen_sub->();
+}
+
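+# Switches the current mode to the next defined server group, wrapping around;
+# does nothing if no groups are defined.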
+sub next_server_group {
+ my $mode = shift || $config{mode}->{val};
+ my @grps = sort keys %server_groups;
+ my $curr = $modes{$mode}->{server_group};
+
+ return unless @grps;
+
+ if ( $curr ) {
+ # Find the current group's position.
+ my $pos = 0;
+ while ( $curr ne $grps[$pos] ) {
+ $pos++;
+ }
+ $modes{$mode}->{server_group} = $grps[ ($pos + 1) % @grps ];
+ }
+ else {
+ $modes{$mode}->{server_group} = $grps[0];
+ }
+}
+
+# Get a list of connection names used in this mode.
+sub get_connections {
+ if ( $file ) {
+ return qw(file);
+ }
+ my $mode = shift || $config{mode}->{val};
+ my @connections = $modes{$mode}->{server_group}
+ ? @{$server_groups{$modes{$mode}->{server_group}}}
+ : @{$modes{$mode}->{connections}};
+ if ( $modes{$mode}->{one_connection} ) {
+ @connections = @connections ? $connections[0] : ();
+ }
+ return unique(@connections);
+}
+
+# Get a list of tables used in this mode. If innotop is running non-interactively, just use the first.
+sub get_visible_tables {
+ my $mode = shift || $config{mode}->{val};
+ my @tbls = @{$modes{$mode}->{visible_tables}};
+ if ( $opts{n} ) {
+ return $tbls[0];
+ }
+ else {
+ return @tbls;
+ }
+}
+
+# Choose from among available connections or server groups for the current mode.
+# Choosing a group (prefixed with '#') overrides any individual connections.
+sub choose_connections {
+ $clear_screen_sub->();
+ my $mode = $config{mode}->{val};
+ my $meta = { map { $_ => $connections{$_}->{dsn} } keys %connections };
+ foreach my $group ( keys %server_groups ) {
+ $meta->{"#$group"} = join(' ', @{$server_groups{$group}});
+ }
+
+ my $choices = prompt_list("Choose connections or a group for $mode mode",
+ undef, sub { return keys %$meta }, $meta);
+
+ my @choices = unique(grep { $_ } split(/\s+/, $choices));
+ if ( @choices ) {
+ if ( $choices[0] =~ s/^#// && exists $server_groups{$choices[0]} ) {
+ $modes{$mode}->{server_group} = $choices[0];
+ }
+ else {
+ $modes{$mode}->{connections} = [ grep { exists $connections{$_} } @choices ];
+ }
+ }
+}
+
+# Accepts a DB connection name, the name of a prepared query (e.g. status, kill),
+# and a list of params for the prepared query. This avoids having to store prepared
+# statements globally. Returns a $sth that has been executed.
+# ERROR-HANDLING SEMANTICS: if the statement throws an error, propagate it; but if
+# the connection has gone away or can't connect, DO NOT. Just return undef.
+sub do_stmt {
+ my ( $cxn, $stmt_name, @args ) = @_;
+
+ return undef if $file;
+
+ # Test if the cxn should not even be tried
+ return undef if $dbhs{$cxn}
+ && $dbhs{$cxn}->{err_count}
+ && ( !$dbhs{$cxn}->{dbh} || !$dbhs{$cxn}->{dbh}->{Active} || $dbhs{$cxn}->{mode} eq $config{mode}->{val} )
+ && $dbhs{$cxn}->{wake_up} > $clock;
+
+ my $sth;
+ my $retries = 1;
+ my $success = 0;
+ TRY:
+ while ( $retries-- >= 0 && !$success ) {
+
+ eval {
+ my $dbh = connect_to_db($cxn);
+
+ # If the prepared query doesn't exist, make it.
+ if ( !exists $dbhs{$cxn}->{stmts}->{$stmt_name} ) {
+ $dbhs{$cxn}->{stmts}->{$stmt_name} = $stmt_maker_for{$stmt_name}->($dbh);
+ }
+
+ $sth = $dbhs{$cxn}->{stmts}->{$stmt_name};
+ if ( $sth ) {
+ $sth->execute(@args);
+ }
+ $success = 1;
+ };
+ if ( $EVAL_ERROR ) {
+ if ( $EVAL_ERROR =~ m/$nonfatal_errs/ ) {
+ handle_cxn_error($cxn, $EVAL_ERROR);
+ }
+ else {
+ die "$cxn $stmt_name: $EVAL_ERROR";
+ }
+ if ( $retries < 0 ) {
+ $sth = undef;
+ }
+ }
+ }
+
+ if ( $sth && $sth->{NUM_OF_FIELDS} ) {
+ sleep($stmt_sleep_time_for{$stmt_name}) if $stmt_sleep_time_for{$stmt_name};
+ return $sth;
+ }
+}
+
+# Keeps track of error counts, sleep times until retries, and so on.
+# When there's an error, we retry the connection every so often, with the delay
+# increasing in a Fibonacci series to avoid hammering the server.
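+# With the initial values set in connect_to_db, the retry delays grow as
+# 1, 2, 3, 5, 8, ... clock ticks.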
+sub handle_cxn_error {
+ my ( $cxn, $err ) = @_;
+ my $meta = $dbhs{$cxn};
+ $meta->{err_count}++;
+
+ # This is used so errors that have to do with permissions needed by the current
+ # mode will get displayed as long as we're in this mode, but get ignored if the
+ # mode changes.
+ $meta->{mode} = $config{mode}->{val};
+
+ # Strip garbage from the error text if possible.
+ $err =~ s/\s+/ /g;
+ if ( $err =~ m/failed: (.*?) at \S*innotop line/ ) {
+ $err = $1;
+ }
+
+ $meta->{last_err} = $err;
+ my $sleep_time = $meta->{this_sleep} + $meta->{prev_sleep};
+ $meta->{prev_sleep} = $meta->{this_sleep};
+ $meta->{this_sleep} = $sleep_time;
+ $meta->{wake_up} = $clock + $sleep_time;
+ if ( $config{show_cxn_errors}->{val} ) {
+ print STDERR "Error at tick $clock $cxn $err" if $config{debug}->{val};
+ }
+}
+
+# Accepts a DB connection name and a (string) query. Returns a $sth that's been
+# executed.
+sub do_query {
+ my ( $cxn, $query ) = @_;
+
+ return undef if $file;
+
+ # Test if the cxn should not even be tried
+ return undef if $dbhs{$cxn}
+ && $dbhs{$cxn}->{err_count}
+ && ( !$dbhs{$cxn}->{dbh} || !$dbhs{$cxn}->{dbh}->{Active} || $dbhs{$cxn}->{mode} eq $config{mode}->{val} )
+ && $dbhs{$cxn}->{wake_up} > $clock;
+
+ my $sth;
+ my $retries = 1;
+ my $success = 0;
+ TRY:
+ while ( $retries-- >= 0 && !$success ) {
+
+ eval {
+ my $dbh = connect_to_db($cxn);
+
+ $sth = $dbh->prepare($query);
+ $sth->execute();
+ $success = 1;
+ };
+ if ( $EVAL_ERROR ) {
+ if ( $EVAL_ERROR =~ m/$nonfatal_errs/ ) {
+ handle_cxn_error($cxn, $EVAL_ERROR);
+ }
+ else {
+ die $EVAL_ERROR;
+ }
+ if ( $retries < 0 ) {
+ $sth = undef;
+ }
+ }
+ }
+
+ return $sth;
+}
+
+sub get_uptime {
+ my ( $cxn ) = @_;
+ $dbhs{$cxn}->{start_time} ||= time();
+ # Avoid dividing by zero
+ return (time() - $dbhs{$cxn}->{start_time}) || .001;
+}
+
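+# Returns a live database handle for the named connection, creating or reconnecting
+# it as needed; new connections also get the server's start time recorded and their
+# session timeouts set.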
+sub connect_to_db {
+ my ( $cxn ) = @_;
+
+ $dbhs{$cxn} ||= {
+ stmts => {}, # bucket for prepared statements.
+ prev_sleep => 0,
+ this_sleep => 1,
+ wake_up => 0,
+ start_time => 0,
+ dbh => undef,
+ };
+ my $href = $dbhs{$cxn};
+
+ if ( !$href->{dbh} || ref($href->{dbh}) !~ m/DBI/ || !$href->{dbh}->ping ) {
+ my $dbh = get_new_db_connection($cxn);
+ @{$href}{qw(dbh err_count wake_up this_sleep start_time prev_sleep)}
+ = ($dbh, 0, 0, 1, 0, 0);
+
+ # Derive and store the server's start time in hi-res
+ my $uptime = $dbh->selectrow_hashref("show status like 'Uptime'")->{value};
+ $href->{start_time} = time() - $uptime;
+
+ # Set timeouts so an unused connection stays alive.
+ # For example, a connection might be used in Q mode but idle in T mode.
+ if ( version_ge($dbh, '4.0.3')) {
+ my $timeout = $config{cxn_timeout}->{val};
+ $dbh->do("set session wait_timeout=$timeout, interactive_timeout=$timeout");
+ }
+ }
+ return $href->{dbh};
+}
+
+# Compares versions like 5.0.27 and 4.1.15-standard-log
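+# For example, a 5.0.27 server checked against '5.0.3' compares '005000027' ge
+# '005000003', which is true.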
+sub version_ge {
+ my ( $dbh, $target ) = @_;
+ my $version = sprintf('%03d%03d%03d', $dbh->{mysql_serverinfo} =~ m/(\d+)/g);
+ return $version ge sprintf('%03d%03d%03d', $target =~ m/(\d+)/g);
+}
+
+# Extracts status values that can be gleaned from the DBD driver without doing a whole query.
+sub get_driver_status {
+ my @cxns = @_;
+ if ( !$info_gotten{driver_status}++ ) {
+ foreach my $cxn ( @cxns ) {
+ next unless $dbhs{$cxn} && $dbhs{$cxn}->{dbh} && $dbhs{$cxn}->{dbh}->{Active};
+ $vars{$cxn}->{$clock} ||= {};
+ my $vars = $vars{$cxn}->{$clock};
+ my %res = map { $_ =~ s/ +/_/g; $_ } $dbhs{$cxn}->{dbh}->{mysql_stat} =~ m/(\w[^:]+): ([\d\.]+)/g;
+ map { $vars->{$_} ||= $res{$_} } keys %res;
+ $vars->{Uptime_hires} ||= get_uptime($cxn);
+ $vars->{cxn} = $cxn;
+ }
+ }
+}
+
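+# Creates a brand-new DBI connection for the named connection, prompting for a
+# username and password the first time if the DSN needs them.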
+sub get_new_db_connection {
+ my ( $connection, $destroy ) = @_;
+ if ( $file ) {
+ die "You can't connect to a MySQL server while monitoring a file. This is probably a bug.";
+ }
+
+ my $dsn = $connections{$connection}
+ or die "No connection named '$connection' is defined in your configuration";
+
+ if ( !defined $dsn->{have_user} ) {
+ my $answer = prompt("Do you want to specify a username for $connection?", undef, 'n');
+ $dsn->{have_user} = $answer && $answer =~ m/1|y/i;
+ }
+
+ if ( !defined $dsn->{have_pass} ) {
+ my $answer = prompt("Do you want to specify a password for $connection?", undef, 'n');
+ $dsn->{have_pass} = $answer && $answer =~ m/1|y/i;
+ }
+
+ if ( !$dsn->{user} && $dsn->{have_user} ) {
+ my $user = $ENV{USERNAME} || $ENV{USER} || getlogin() || getpwuid($REAL_USER_ID) || undef;
+ $dsn->{user} = prompt("Enter username for $connection", undef, $user);
+ }
+
+ if ( !defined $dsn->{user} ) {
+ $dsn->{user} = '';
+ }
+
+ if ( !$dsn->{pass} && !$dsn->{savepass} && $dsn->{have_pass} ) {
+ $dsn->{pass} = prompt_noecho("Enter password for '$dsn->{user}' on $connection");
+ print "\n";
+ if ( !defined($dsn->{savepass}) ) {
+ my $answer = prompt("Save password in plain text in the config file?", undef, 'y');
+ $dsn->{savepass} = $answer && $answer =~ m/1|y/i;
+ }
+ }
+
+ my $dbh = DBI->connect(
+ $dsn->{dsn}, $dsn->{user}, $dsn->{pass},
+ { RaiseError => 1, PrintError => 0, AutoCommit => 1 });
+ $dbh->{InactiveDestroy} = 1 unless $destroy; # Can't be set in $db_options
+ $dbh->{FetchHashKeyName} = 'NAME_lc'; # Lowercases all column names for fetchrow_hashref
+ return $dbh;
+}
+
+sub get_cxn_errors {
+ my @cxns = @_;
+ return () unless $config{show_cxn_errors_in_tbl}->{val};
+ return
+ map { [ $_ . ': ' . $dbhs{$_}->{last_err}, 'red' ] }
+ grep { $dbhs{$_} && $dbhs{$_}->{err_count} && $dbhs{$_}->{mode} eq $config{mode}->{val} }
+ @cxns;
+}
+
+# Setup and tear-down functions {{{2
+
+# Takes a string and turns it into a hashref of column definitions, plus a list of
+# visible columns, which you can apply to %tbl_meta tables. The string can be in the
+# form 'foo, bar, foo/bar, foo as bar' much like a SQL SELECT statement.
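+# For example, 'cnt, total/cnt as avg' defines two columns: 'cnt' and 'avg', where
+# 'avg' is computed from the expression total/cnt.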
+sub compile_select_stmt {
+ my ($str) = @_;
+ my @exps = $str =~ m/\s*([^,]+(?i:\s+as\s+[^,\s]+)?)\s*(?=,|$)/g;
+ my %cols;
+ my @visible;
+ foreach my $exp ( @exps ) {
+ my ( $text, $colname );
+ if ( $exp =~ m/as\s+(\w+)\s*/ ) {
+ $colname = $1;
+ $exp =~ s/as\s+(\w+)\s*//;
+ $text = $exp;
+ }
+ else {
+ $text = $colname = $exp;
+ }
+ my ($func, $err) = compile_expr($text);
+ $cols{$colname} = {
+ src => $text,
+ hdr => $colname,
+ num => 0,
+ func => $func,
+ };
+ push @visible, $colname;
+ }
+ return (\%cols, \@visible);
+}
+
+# compile_filter {{{3
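+# Compiles the text into a subroutine that receives a row hashref as $set and returns
+# true if the row passes the filter. On error, returns a subroutine that just returns
+# the error message, along with the error text itself.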
+sub compile_filter {
+ my ( $text ) = @_;
+ my ( $sub, $err );
+ eval "\$sub = sub { my \$set = shift; $text }";
+ if ( $EVAL_ERROR ) {
+ $EVAL_ERROR =~ s/at \(eval.*$//;
+ $sub = sub { return $EVAL_ERROR };
+ $err = $EVAL_ERROR;
+ }
+ return ( $sub, $err );
+}
+
+# compile_expr {{{3
+sub compile_expr {
+ my ( $expr ) = @_;
+ # Leave built-in functions alone so they get called as Perl functions, unless
+ # they are the only word in $expr, in which case treat them as hash keys.
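+   # For example, 'Questions/Uptime_hires' becomes
+   # '$set->{Questions}/$set->{Uptime_hires}' (assuming neither is a built-in function).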
+ if ( $expr =~ m/\W/ ) {
+ $expr =~ s/(?<!\{|\$)\b([A-Za-z]\w{2,})\b/is_func($1) ? $1 : "\$set->{$1}"/eg;
+ }
+ else {
+ $expr = "\$set->{$expr}";
+ }
+ my ( $sub, $err );
+ my $quoted = quotemeta($expr);
+ eval qq{
+ \$sub = sub {
+ my (\$set, \$cur, \$pre) = \@_;
+ my \$val = eval { $expr };
+ if ( \$EVAL_ERROR && \$config{debug}->{val} ) {
+ \$EVAL_ERROR =~ s/ at \\(eval.*//s;
+ die "\$EVAL_ERROR in expression $quoted";
+ }
+ return \$val;
+ }
+ };
+ if ( $EVAL_ERROR ) {
+ if ( $config{debug}->{val} ) {
+ die $EVAL_ERROR;
+ }
+ $EVAL_ERROR =~ s/ at \(eval.*$//;
+ $sub = sub { return $EVAL_ERROR };
+ $err = $EVAL_ERROR;
+ }
+ return ( $sub, $err );
+}
+
+# finish {{{3
+# This is a subroutine because it's bound to a key that quits the program.
+sub finish {
+ save_config();
+ ReadMode('normal') unless $opts{n};
+ print "\n";
+ exit(0);
+}
+
+# core_dump {{{3
+sub core_dump {
+ my $msg = shift;
+ if ($config{debugfile}->{val} && $config{debug}->{val}) {
+ eval {
+ open my $file, '>>', $config{debugfile}->{val};
+ if ( %vars ) {
+ print $file "Current variables:\n" . Dumper(\%vars);
+ }
+ close $file;
+ };
+ }
+ print $msg;
+}
+
+# load_config {{{3
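+# Reads the config file (by default ~/.innotop/innotop.ini), checks its version for
+# compatibility, and dispatches each [section] to that section's reader.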
+sub load_config {
+
+ my $filename = $opts{c} || "$homepath/.innotop/innotop.ini";
+ my $dirname = dirname($filename);
+ if ( -f $dirname && !$opts{c} ) {
+ # innotop got upgraded and this is the old config file.
+ my $answer = pause("Innotop's default config location has moved to $filename. Move old config file $dirname there now? y/n");
+ if ( lc $answer eq 'y' ) {
+ rename($dirname, "$homepath/innotop.ini")
+ or die "Can't rename '$dirname': $OS_ERROR";
+ mkdir($dirname) or die "Can't create directory '$dirname': $OS_ERROR";
+ mkdir("$dirname/plugins") or die "Can't create directory '$dirname/plugins': $OS_ERROR";
+ rename("$homepath/innotop.ini", $filename)
+ or die "Can't rename '$homepath/innotop.ini' to '$filename': $OS_ERROR";
+ }
+ else {
+ print "\nInnotop will now exit so you can fix the config file.\n";
+ exit(0);
+ }
+ }
+
+ if ( ! -d $dirname ) {
+ mkdir $dirname
+ or die "Can't create directory '$dirname': $OS_ERROR";
+ }
+ if ( ! -d "$dirname/plugins" ) {
+ mkdir "$dirname/plugins"
+ or die "Can't create directory '$dirname/plugins': $OS_ERROR";
+ }
+
+ if ( -f $filename ) {
+ open my $file, "<", $filename or die("Can't open '$filename': $OS_ERROR");
+
+ # Check config file version. Just ignore if either innotop or the file has
+ # garbage in the version number.
+ if ( defined(my $line = <$file>) && $VERSION =~ m/\d/ ) {
+ chomp $line;
+ if ( my ($maj, $min, $rev) = $line =~ m/^version=(\d+)\.(\d+)(?:\.(\d+))?$/ ) {
+ $rev ||= 0;
+ my $cfg_ver = sprintf('%03d-%03d-%03d', $maj, $min, $rev);
+ ( $maj, $min, $rev ) = $VERSION =~ m/^(\d+)\.(\d+)(?:\.(\d+))?$/;
+ $rev ||= 0;
+ my $innotop_ver = sprintf('%03d-%03d-%03d', $maj, $min, $rev);
+
+ if ( $cfg_ver gt $innotop_ver ) {
+ pause("The config file is for a newer version of innotop and may not be read correctly.");
+ }
+ else {
+ my @ver_history = @config_versions;
+ while ( my ($start, $end) = splice(@ver_history, 0, 2) ) {
+                  # If the config file's version falls within this range and innotop's
+                  # version is at or past the range's end, innotop has a newer config
+                  # file format than the file.
+ if ( $cfg_ver ge $start && $cfg_ver lt $end && $innotop_ver ge $end ) {
+ my $msg = "innotop's config file format has changed. Overwrite $filename? y or n";
+ if ( pause($msg) eq 'n' ) {
+ $config{readonly}->{val} = 1;
+ print "\ninnotop will not save any configuration changes you make.";
+ pause();
+ print "\n";
+ }
+ close $file;
+ return;
+ }
+ }
+ }
+ }
+ }
+
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next unless $line =~ m/^\[([a-z_]+)\]$/;
+ if ( exists $config_file_sections{$1} ) {
+ $config_file_sections{$1}->{reader}->($file);
+ }
+ else {
+ warn "Unknown config file section '$1'";
+ }
+ }
+ close $file or die("Can't close $filename: $OS_ERROR");
+ }
+
+}
+
+# Do some post-processing on %tbl_meta: compile src properties into func etc.
+sub post_process_tbl_meta {
+ foreach my $table ( values %tbl_meta ) {
+ foreach my $col_name ( keys %{$table->{cols}} ) {
+ my $col_def = $table->{cols}->{$col_name};
+ my ( $sub, $err ) = compile_expr($col_def->{src});
+ $col_def->{func} = $sub;
+ }
+ }
+}
+
+# load_config_plugins {{{3
+sub load_config_plugins {
+ my ( $file ) = @_;
+
+ # First, find a list of all plugins that exist on disk, and get information about them.
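+   # A plugin file is recognized by its 'package Some::Package;' line and, optionally,
+   # a '# description: ...' comment.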
+ my $dir = $config{plugin_dir}->{val};
+ foreach my $p_file ( <$dir/*.pm> ) {
+ my ($package, $desc);
+ eval {
+ open my $p_in, "<", $p_file or die $OS_ERROR;
+ while ( my $line = <$p_in> ) {
+ chomp $line;
+ if ( $line =~ m/^package\s+(.*?);/ ) {
+ $package = $1;
+ }
+ elsif ( $line =~ m/^# description: (.*)/ ) {
+ $desc = $1;
+ }
+ last if $package && $desc;
+ }
+ close $p_in;
+ };
+ if ( $package ) {
+ $plugins{$package} = {
+ file => $p_file,
+ desc => $desc,
+ class => $package,
+ active => 0,
+ };
+ if ( $config{debug}->{val} && $EVAL_ERROR ) {
+ die $EVAL_ERROR;
+ }
+ }
+ }
+
+ # Now read which ones the user has activated. Each line simply represents an active plugin.
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+ next unless $line && $plugins{$line};
+
+ my $obj;
+ eval {
+ require $plugins{$line}->{file};
+ $obj = $line->new(%pluggable_vars);
+ foreach my $event ( $obj->register_for_events() ) {
+ my $queue = $event_listener_for{$event};
+ if ( $queue ) {
+ push @$queue, $obj;
+ }
+ }
+ };
+ if ( $config{debug}->{val} && $EVAL_ERROR ) {
+ die $EVAL_ERROR;
+ }
+ if ( $obj ) {
+ $plugins{$line}->{active} = 1;
+ $plugins{$line}->{object} = $obj;
+ }
+ }
+}
+
+# save_config_plugins {{{3
+sub save_config_plugins {
+ my $file = shift;
+ foreach my $class ( sort keys %plugins ) {
+ next unless $plugins{$class}->{active};
+ print $file "$class\n";
+ }
+}
+
+# load_config_active_server_groups {{{3
+sub load_config_active_server_groups {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $mode, $group ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $mode && $group
+ && exists $modes{$mode} && exists $server_groups{$group};
+ $modes{$mode}->{server_group} = $group;
+ }
+}
+
+# save_config_active_server_groups {{{3
+sub save_config_active_server_groups {
+ my $file = shift;
+ foreach my $mode ( sort keys %modes ) {
+ print $file "$mode=$modes{$mode}->{server_group}\n";
+ }
+}
+
+# load_config_server_groups {{{3
+sub load_config_server_groups {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $name, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $name && $rest;
+ my @vars = unique(grep { $_ && exists $connections{$_} } split(/\s+/, $rest));
+ next unless @vars;
+ $server_groups{$name} = \@vars;
+ }
+}
+
+# save_config_server_groups {{{3
+sub save_config_server_groups {
+ my $file = shift;
+ foreach my $set ( sort keys %server_groups ) {
+ print $file "$set=", join(' ', @{$server_groups{$set}}), "\n";
+ }
+}
+
+# load_config_varsets {{{3
+sub load_config_varsets {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $name, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $name && $rest;
+ $var_sets{$name} = {
+ text => $rest,
+ user => 1,
+ };
+ }
+}
+
+# save_config_varsets {{{3
+sub save_config_varsets {
+ my $file = shift;
+ foreach my $varset ( sort keys %var_sets ) {
+ next unless $var_sets{$varset}->{user};
+ print $file "$varset=$var_sets{$varset}->{text}\n";
+ }
+}
+
+# load_config_group_by {{{3
+sub load_config_group_by {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $tbl , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $tbl && exists $tbl_meta{$tbl};
+ my @parts = unique(grep { exists($tbl_meta{$tbl}->{cols}->{$_}) } split(/\s+/, $rest));
+ $tbl_meta{$tbl}->{group_by} = [ @parts ];
+ $tbl_meta{$tbl}->{cust}->{group_by} = 1;
+ }
+}
+
+# save_config_group_by {{{3
+sub save_config_group_by {
+ my $file = shift;
+ foreach my $tbl ( sort keys %tbl_meta ) {
+ next if $tbl_meta{$tbl}->{temp};
+ next unless $tbl_meta{$tbl}->{cust}->{group_by};
+ my $aref = $tbl_meta{$tbl}->{group_by};
+ print $file "$tbl=", join(' ', @$aref), "\n";
+ }
+}
+
+# load_config_filters {{{3
+sub load_config_filters {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key, $rest ) = $line =~ m/^(.+?)=(.*)$/;
+ next unless $key && $rest;
+
+ my %parts = $rest =~ m/(\w+)='((?:(?!(?<!\\)').)*)'/g; # Properties are single-quoted
+ next unless $parts{text} && $parts{tbls};
+
+ foreach my $prop ( keys %parts ) {
+ # Un-escape escaping
+ $parts{$prop} =~ s/\\\\/\\/g;
+ $parts{$prop} =~ s/\\'/'/g;
+ }
+
+ my ( $sub, $err ) = compile_filter($parts{text});
+ my @tbls = unique(split(/\s+/, $parts{tbls}));
+ @tbls = grep { exists $tbl_meta{$_} } @tbls;
+ $filters{$key} = {
+ func => $sub,
+ text => $parts{text},
+ user => 1,
+ name => $key,
+ note => 'User-defined filter',
+ tbls => \@tbls,
+ }
+ }
+}
+
+# save_config_filters {{{3
+sub save_config_filters {
+ my $file = shift;
+ foreach my $key ( sort keys %filters ) {
+ next if !$filters{$key}->{user} || $filters{$key}->{quick};
+ my $text = $filters{$key}->{text};
+ $text =~ s/([\\'])/\\$1/g;
+ my $tbls = join(" ", @{$filters{$key}->{tbls}});
+ print $file "$key=text='$text' tbls='$tbls'\n";
+ }
+}
+
+# load_config_visible_tables {{{3
+sub load_config_visible_tables {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $mode, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $mode && exists $modes{$mode};
+ $modes{$mode}->{visible_tables} =
+ [ unique(grep { $_ && exists $tbl_meta{$_} } split(/\s+/, $rest)) ];
+ $modes{$mode}->{cust}->{visible_tables} = 1;
+ }
+}
+
+# save_config_visible_tables {{{3
+sub save_config_visible_tables {
+ my $file = shift;
+ foreach my $mode ( sort keys %modes ) {
+ next unless $modes{$mode}->{cust}->{visible_tables};
+ my $tables = $modes{$mode}->{visible_tables};
+ print $file "$mode=", join(' ', @$tables), "\n";
+ }
+}
+
+# load_config_sort_cols {{{3
+sub load_config_sort_cols {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $key && exists $tbl_meta{$key};
+ $tbl_meta{$key}->{sort_cols} = $rest;
+ $tbl_meta{$key}->{cust}->{sort_cols} = 1;
+ $tbl_meta{$key}->{sort_func} = make_sort_func($tbl_meta{$key});
+ }
+}
+
+# save_config_sort_cols {{{3
+sub save_config_sort_cols {
+ my $file = shift;
+ foreach my $tbl ( sort keys %tbl_meta ) {
+ next unless $tbl_meta{$tbl}->{cust}->{sort_cols};
+ my $col = $tbl_meta{$tbl}->{sort_cols};
+ print $file "$tbl=$col\n";
+ }
+}
+
+# load_config_active_filters {{{3
+sub load_config_active_filters {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $tbl , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $tbl && exists $tbl_meta{$tbl};
+ my @parts = unique(grep { exists($filters{$_}) } split(/\s+/, $rest));
+ @parts = grep { grep { $tbl eq $_ } @{$filters{$_}->{tbls}} } @parts;
+ $tbl_meta{$tbl}->{filters} = [ @parts ];
+ $tbl_meta{$tbl}->{cust}->{filters} = 1;
+ }
+}
+
+# save_config_active_filters {{{3
+sub save_config_active_filters {
+ my $file = shift;
+ foreach my $tbl ( sort keys %tbl_meta ) {
+ next if $tbl_meta{$tbl}->{temp};
+ next unless $tbl_meta{$tbl}->{cust}->{filters};
+ my $aref = $tbl_meta{$tbl}->{filters};
+ print $file "$tbl=", join(' ', @$aref), "\n";
+ }
+}
+
+# load_config_active_columns {{{3
+sub load_config_active_columns {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $key && exists $tbl_meta{$key};
+ my @parts = grep { exists($tbl_meta{$key}->{cols}->{$_}) } unique split(/ /, $rest);
+ $tbl_meta{$key}->{visible} = [ @parts ];
+ $tbl_meta{$key}->{cust}->{visible} = 1;
+ }
+}
+
+# save_config_active_columns {{{3
+sub save_config_active_columns {
+ my $file = shift;
+ foreach my $tbl ( sort keys %tbl_meta ) {
+ next unless $tbl_meta{$tbl}->{cust}->{visible};
+ my $aref = $tbl_meta{$tbl}->{visible};
+ print $file "$tbl=", join(' ', @$aref), "\n";
+ }
+}
+
+# save_config_tbl_meta {{{3
+sub save_config_tbl_meta {
+ my $file = shift;
+ foreach my $tbl ( sort keys %tbl_meta ) {
+ foreach my $col ( keys %{$tbl_meta{$tbl}->{cols}} ) {
+ my $meta = $tbl_meta{$tbl}->{cols}->{$col};
+ next unless $meta->{user};
+ print $file "$col=", join(
+ " ",
+ map {
+ # Some properties (trans) are arrays, others scalars
+ my $val = ref($meta->{$_}) ? join(',', @{$meta->{$_}}) : $meta->{$_};
+ $val =~ s/([\\'])/\\$1/g; # Escape backslashes and single quotes
+ "$_='$val'"; # Enclose in single quotes
+ }
+ grep { $_ ne 'func' }
+ keys %$meta
+ ), "\n";
+ }
+ }
+}
+
+# save_config_config {{{3
+sub save_config_config {
+ my $file = shift;
+ foreach my $key ( sort keys %config ) {
+ eval {
+ if ( $key ne 'password' || $config{savepass}->{val} ) {
+ print $file "# $config{$key}->{note}\n"
+ or die "Cannot print to file: $OS_ERROR";
+ my $val = $config{$key}->{val};
+ $val = '' unless defined($val);
+ if ( ref( $val ) eq 'ARRAY' ) {
+ print $file "$key="
+ . join( " ", @$val ) . "\n"
+ or die "Cannot print to file: $OS_ERROR";
+ }
+ elsif ( ref( $val ) eq 'HASH' ) {
+ print $file "$key="
+ . join( " ",
+ map { "$_:$val->{$_}" } keys %$val
+ ) . "\n";
+ }
+ else {
+ print $file "$key=$val\n";
+ }
+ }
+ };
+ if ( $EVAL_ERROR ) { print "$EVAL_ERROR in $key"; };
+ }
+
+}
+
+# load_config_config {{{3
+sub load_config_config {
+ my ( $file ) = @_;
+
+ # Look in the command-line parameters for things stored in the same slot.
+ my %cmdline =
+ map { $_->{c} => $opts{$_->{k}} }
+ grep { exists $_->{c} && exists $opts{$_->{k}} }
+ @opt_spec;
+
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $name, $val ) = $line =~ m/^(.+?)=(.*)$/;
+ next unless defined $name && defined $val;
+
+ # Values might already have been set at the command line.
+ $val = defined($cmdline{$name}) ? $cmdline{$name} : $val;
+
+ # Validate the incoming values...
+ if ( $name && exists( $config{$name} ) ) {
+ if ( !$config{$name}->{pat} || $val =~ m/$config{$name}->{pat}/ ) {
+ $config{$name}->{val} = $val;
+ $config{$name}->{read} = 1;
+ }
+ }
+ }
+}
+
+# load_config_tbl_meta {{{3
+sub load_config_tbl_meta {
+ my ( $file ) = @_;
+
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ # Each tbl_meta section has all the properties defined in %col_props.
+ my ( $col , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $col;
+ my %parts = $rest =~ m/(\w+)='((?:(?!(?<!\\)').)*)'/g; # Properties are single-quoted
+
+ # Each section read from the config file has one extra property: which table it
+ # goes in.
+ my $tbl = $parts{tbl} or die "There's no table for tbl_meta $col";
+ my $meta = $tbl_meta{$tbl} or die "There's no table in tbl_meta named $tbl";
+
+      # Anything loaded from the config file is, by definition, user-defined.
+ $parts{user} = 1;
+
+ # The column may already exist in the table, in which case this is just a
+ # customization.
+ $meta->{cols}->{$col} ||= {};
+
+ foreach my $prop ( keys %col_props ) {
+ if ( !defined($parts{$prop}) ) {
+ die "Undefined property $prop for column $col in table $tbl";
+ }
+
+ # Un-escape escaping
+ $parts{$prop} =~ s/\\\\/\\/g;
+ $parts{$prop} =~ s/\\'/'/g;
+
+ if ( ref $col_props{$prop} ) {
+ if ( $prop eq 'trans' ) {
+ $meta->{cols}->{$col}->{trans}
+ = [ unique(grep { exists $trans_funcs{$_} } split(',', $parts{$prop})) ];
+ }
+ else {
+ $meta->{cols}->{$col}->{$prop} = [ split(',', $parts{$prop}) ];
+ }
+ }
+ else {
+ $meta->{cols}->{$col}->{$prop} = $parts{$prop};
+ }
+ }
+
+ }
+}
+
+# save_config {{{3
+sub save_config {
+ return if $config{readonly}->{val};
+ # Save to a temp file first, so a crash doesn't destroy the main config file
+ my $newname = $opts{c} || "$homepath/.innotop/innotop.ini";
+ my $filename = $newname . '_tmp';
+ open my $file, "+>", $filename
+ or die("Can't write to $filename: $OS_ERROR");
+ print $file "version=$VERSION\n";
+
+ foreach my $section ( @ordered_config_file_sections ) {
+ die "No such config file section $section" unless $config_file_sections{$section};
+ print $file "\n[$section]\n\n";
+ $config_file_sections{$section}->{writer}->($file);
+ print $file "\n[/$section]\n";
+ }
+
+ # Now clobber the main config file with the temp.
+ close $file or die("Can't close $filename: $OS_ERROR");
+ rename($filename, $newname) or die("Can't rename $filename to $newname: $OS_ERROR");
+}
+
+# load_config_connections {{{3
+sub load_config_connections {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $key;
+ my %parts = $rest =~ m/(\S+?)=(\S*)/g;
+ my %conn = map { $_ => $parts{$_} || '' } @conn_parts;
+ $connections{$key} = \%conn;
+ }
+}
+
+# save_config_connections {{{3
+sub save_config_connections {
+ my $file = shift;
+ foreach my $conn ( sort keys %connections ) {
+ my $href = $connections{$conn};
+ my @keys = $href->{savepass} ? @conn_parts : grep { $_ ne 'pass' } @conn_parts;
+ print $file "$conn=", join(' ', map { "$_=$href->{$_}" } grep { defined $href->{$_} } @keys), "\n";
+ }
+}
+
+sub load_config_colors {
+ my ( $file ) = @_;
+ my %rule_set_for;
+
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $tbl, $rule ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $tbl && $rule;
+ next unless exists $tbl_meta{$tbl};
+ my %parts = $rule =~ m/(\w+)='((?:(?!(?<!\\)').)*)'/g; # Properties are single-quoted
+ next unless $parts{col} && exists $tbl_meta{$tbl}->{cols}->{$parts{col}};
+ next unless $parts{op} && exists $comp_ops{$parts{op}};
+ next unless defined $parts{arg};
+ next unless defined $parts{color};
+ my @colors = unique(grep { exists $ansicolors{$_} } split(/\W+/, $parts{color}));
+ next unless @colors;
+
+ # Finally! Enough validation...
+ $rule_set_for{$tbl} ||= [];
+ push @{$rule_set_for{$tbl}}, \%parts;
+ }
+
+ foreach my $tbl ( keys %rule_set_for ) {
+ $tbl_meta{$tbl}->{colors} = $rule_set_for{$tbl};
+ $tbl_meta{$tbl}->{color_func} = make_color_func($tbl_meta{$tbl});
+ $tbl_meta{$tbl}->{cust}->{colors} = 1;
+ }
+}
+
+# save_config_colors {{{3
+sub save_config_colors {
+ my $file = shift;
+ foreach my $tbl ( sort keys %tbl_meta ) {
+ my $meta = $tbl_meta{$tbl};
+ next unless $meta->{cust}->{colors};
+ foreach my $rule ( @{$meta->{colors}} ) {
+ print $file "$tbl=", join(
+ ' ',
+ map {
+ my $val = $rule->{$_};
+ $val =~ s/([\\'])/\\$1/g; # Escape backslashes and single quotes
+ "$_='$val'"; # Enclose in single quotes
+ }
+ qw(col op arg color)
+ ), "\n";
+ }
+ }
+}
+
+# load_config_active_connections {{{3
+sub load_config_active_connections {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key , $rest ) = $line =~ m/^(.*?)=(.*)$/;
+ next unless $key && exists $modes{$key};
+ my @parts = grep { exists $connections{$_} } split(/ /, $rest);
+ $modes{$key}->{connections} = [ @parts ] if exists $modes{$key};
+ }
+}
+
+# save_config_active_connections {{{3
+sub save_config_active_connections {
+ my $file = shift;
+ foreach my $mode ( sort keys %modes ) {
+ my @connections = get_connections($mode);
+ print $file "$mode=", join(' ', @connections), "\n";
+ }
+}
+
+# load_config_stmt_sleep_times {{{3
+sub load_config_stmt_sleep_times {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key , $val ) = split('=', $line);
+ next unless $key && defined $val && $val =~ m/$num_regex/;
+ $stmt_sleep_time_for{$key} = $val;
+ }
+}
+
+# save_config_stmt_sleep_times {{{3
+sub save_config_stmt_sleep_times {
+ my $file = shift;
+ foreach my $key ( sort keys %stmt_sleep_time_for ) {
+ print $file "$key=$stmt_sleep_time_for{$key}\n";
+ }
+}
+
+# load_config_mvs {{{3
+sub load_config_mvs {
+ my ( $file ) = @_;
+ while ( my $line = <$file> ) {
+ chomp $line;
+ next if $line =~ m/^#/;
+ last if $line =~ m/^\[/;
+
+ my ( $key , $val ) = split('=', $line);
+ next unless $key && defined $val && $val =~ m/$num_regex/;
+ $mvs{$key} = $val;
+ }
+}
+
+# save_config_mvs {{{3
+sub save_config_mvs {
+ my $file = shift;
+ foreach my $key ( sort keys %mvs ) {
+ print $file "$key=$mvs{$key}\n";
+ }
+}
+
+# edit_configuration {{{3
+sub edit_configuration {
+ my $key = '';
+ while ( $key ne 'q' ) {
+ $clear_screen_sub->();
+ my @display_lines = '';
+
+ if ( $key && $cfg_editor_action{$key} ) {
+ $cfg_editor_action{$key}->{func}->();
+ }
+
+ # Show help
+ push @display_lines, create_caption('What configuration do you want to edit?',
+ create_table2(
+ [ sort keys %cfg_editor_action ],
+ { map { $_ => $_ } keys %cfg_editor_action },
+ { map { $_ => $cfg_editor_action{$_}->{note} } keys %cfg_editor_action },
+ { sep => ' ' }));
+
+ draw_screen(\@display_lines);
+ $key = pause('');
+ }
+}
+
+# edit_configuration_variables {{{3
+sub edit_configuration_variables {
+ $clear_screen_sub->();
+ my $mode = $config{mode}->{val};
+
+ my %config_choices
+ = map { $_ => $config{$_}->{note} || '' }
+ # Only config values that are marked as applying to this mode.
+ grep {
+ my $key = $_;
+ $config{$key}->{conf} &&
+ ( $config{$key}->{conf} eq 'ALL'
+ || grep { $mode eq $_ } @{$config{$key}->{conf}} )
+ } keys %config;
+
+ my $key = prompt_list(
+ "Enter the name of the variable you wish to configure",
+ '',
+ sub{ return keys %config_choices },
+ \%config_choices);
+
+ if ( exists($config_choices{$key}) ) {
+ get_config_interactive($key);
+ }
+}
+
+# edit_color_rules {{{3
+sub edit_color_rules {
+ my ( $tbl ) = @_;
+ $clear_screen_sub->();
+ $tbl ||= choose_visible_table();
+ if ( $tbl && exists($tbl_meta{$tbl}) ) {
+ my $meta = $tbl_meta{$tbl};
+ my @cols = ('', qw(col op arg color));
+ my $info = { map { $_ => { hdr => $_, just => '-', } } @cols };
+ $info->{label}->{maxw} = 30;
+ my $key;
+ my $selected_rule;
+
+ # This loop builds a tabular view of the rules.
+ do {
+
+ # Show help
+ if ( $key && $key eq '?' ) {
+ my @display_lines = '';
+ push @display_lines, create_caption('Editor key mappings',
+ create_table2(
+ [ sort keys %color_editor_action ],
+ { map { $_ => $_ } keys %color_editor_action },
+ { map { $_ => $color_editor_action{$_}->{note} } keys %color_editor_action },
+ { sep => ' ' }));
+ draw_screen(\@display_lines);
+ pause();
+ $key = '';
+ }
+ else {
+
+ # Do the action specified
+ $selected_rule ||= 0;
+ if ( $key && $color_editor_action{$key} ) {
+ $selected_rule = $color_editor_action{$key}->{func}->($tbl, $selected_rule);
+ $selected_rule ||= 0;
+ }
+
+            # Build the table of rules. If the terminal has color, the selected rule
+            # will be highlighted; otherwise a > at the left indicates the selection.
+ my $data = $meta->{colors} || [];
+ foreach my $i ( 0..@$data - 1 ) {
+ $data->[$i]->{''} = $i == $selected_rule ? '>' : '';
+ }
+ my @display_lines = create_table(\@cols, $info, $data);
+
+ # Highlight selected entry
+ for my $i ( 0 .. $#display_lines ) {
+ if ( $display_lines[$i] =~ m/^>/ ) {
+ $display_lines[$i] = [ $display_lines[$i], 'reverse' ];
+ }
+ }
+
+ # Draw the screen and wait for a command.
+ unshift @display_lines, '',
+ "Editing color rules for $meta->{capt}. Press ? for help, q to "
+ . "quit.", '';
+ draw_screen(\@display_lines);
+ print "\n\n", word_wrap('Rules are applied in order from top to '
+ . 'bottom. The first matching rule wins and prevents the '
+ . 'rest of the rules from being applied.');
+ $key = pause('');
+ }
+ } while ( $key ne 'q' );
+ $meta->{color_func} = make_color_func($meta);
+ }
+}
+
+# add_quick_filter {{{3
+sub add_quick_filter {
+ my $tbl = choose_visible_table();
+ if ( $tbl && exists($tbl_meta{$tbl}) ) {
+ print "\n";
+ my $response = prompt_list(
+ "Enter column name and filter text",
+ '',
+ sub { return keys %{$tbl_meta{$tbl}->{cols}} },
+ ()
+ );
+ my ( $col, $text ) = split(/\s+/, $response, 2);
+
+ # You can't filter on a nonexistent column. But if you filter on a pivoted
+ # table, the columns are different, so on a pivoted table, allow filtering
+ # on the 'name' column.
+ # NOTE: if a table is pivoted and un-pivoted, this will likely cause crashes.
+ # Currently not an issue since there's no way to toggle pivot/nopivot.
+ return unless $col && $text &&
+ (exists($tbl_meta{$tbl}->{cols}->{$col})
+ || ($tbl_meta{$tbl}->{pivot} && $col eq 'name'));
+
+ my ( $sub, $err ) = compile_filter( "defined \$set->{$col} && \$set->{$col} =~ m/$text/" );
+ return if !$sub || $err;
+ my $name = "quick_$tbl.$col";
+ $filters{$name} = {
+ func => $sub,
+ text => $text,
+ user => 1,
+ quick => 1,
+ name => $name,
+ note => 'Quick-filter',
+ tbls => [$tbl],
+ };
+ push @{$tbl_meta{$tbl}->{filters}}, $name;
+ }
+}
+
+# clear_quick_filters {{{3
+sub clear_quick_filters {
+ my $tbl = choose_visible_table(
+ # Only tables that have quick-filters
+ sub {
+ my ( $tbl ) = @_;
+ return scalar grep { $filters{$_}->{quick} } @{ $tbl_meta{$tbl}->{filters} };
+ }
+ );
+ if ( $tbl && exists($tbl_meta{$tbl}) ) {
+ my @current = @{$tbl_meta{$tbl}->{filters}};
+ @current = grep { !$filters{$_}->{quick} } @current;
+ $tbl_meta{$tbl}->{filters} = \@current;
+ }
+}
+
+sub edit_plugins {
+ $clear_screen_sub->();
+
+ my @cols = ('', qw(class desc active));
+ my $info = { map { $_ => { hdr => $_, just => '-', } } @cols };
+ my @rows = map { $plugins{$_} } sort keys %plugins;
+ my $key;
+ my $selected;
+
+ # This loop builds a tabular view of the plugins.
+ do {
+
+ # Show help
+ if ( $key && $key eq '?' ) {
+ my @display_lines = '';
+ push @display_lines, create_caption('Editor key mappings',
+ create_table2(
+ [ sort keys %plugin_editor_action ],
+ { map { $_ => $_ } keys %plugin_editor_action },
+ { map { $_ => $plugin_editor_action{$_}->{note} } keys %plugin_editor_action },
+ { sep => ' ' }));
+ draw_screen(\@display_lines);
+ pause();
+ $key = '';
+ }
+
+ # Do the action specified
+ else {
+ $selected ||= 0;
+ if ( $key && $plugin_editor_action{$key} ) {
+ $selected = $plugin_editor_action{$key}->{func}->(\@rows, $selected);
+ $selected ||= 0;
+ }
+
+ # Build the table of plugins.
+ foreach my $row ( 0.. $#rows ) {
+ $rows[$row]->{''} = $row eq $selected ? '>' : ' ';
+ }
+ my @display_lines = create_table(\@cols, $info, \@rows);
+
+ # Highlight selected entry
+ for my $i ( 0 .. $#display_lines ) {
+ if ( $display_lines[$i] =~ m/^>/ ) {
+ $display_lines[$i] = [ $display_lines[$i], 'reverse' ];
+ }
+ }
+
+ # Draw the screen and wait for a command.
+ unshift @display_lines, '',
+ "Plugin Management. Press ? for help, q to quit.", '';
+ draw_screen(\@display_lines);
+ $key = pause('');
+ }
+ } while ( $key ne 'q' );
+}
+
+# edit_table {{{3
+sub edit_table {
+ $clear_screen_sub->();
+ my ( $tbl ) = @_;
+ $tbl ||= choose_visible_table();
+ if ( $tbl && exists($tbl_meta{$tbl}) ) {
+ my $meta = $tbl_meta{$tbl};
+ my @cols = ('', qw(name hdr label src));
+ my $info = { map { $_ => { hdr => $_, just => '-', } } @cols };
+ $info->{label}->{maxw} = 30;
+ my $key;
+ my $selected_column;
+
+ # This loop builds a tabular view of the tbl_meta's structure, showing each column
+ # in the entry as a row.
+ do {
+
+ # Show help
+ if ( $key && $key eq '?' ) {
+ my @display_lines = '';
+ push @display_lines, create_caption('Editor key mappings',
+ create_table2(
+ [ sort keys %tbl_editor_action ],
+ { map { $_ => $_ } keys %tbl_editor_action },
+ { map { $_ => $tbl_editor_action{$_}->{note} } keys %tbl_editor_action },
+ { sep => ' ' }));
+ draw_screen(\@display_lines);
+ pause();
+ $key = '';
+ }
+ else {
+
+ # Do the action specified
+ $selected_column ||= $meta->{visible}->[0];
+ if ( $key && $tbl_editor_action{$key} ) {
+ $selected_column = $tbl_editor_action{$key}->{func}->($tbl, $selected_column);
+ $selected_column ||= $meta->{visible}->[0];
+ }
+
+            # Build the pivoted view of the table's meta-data. If the terminal has color,
+            # the selected row will be highlighted; otherwise a > at the left indicates it.
+ my $data = [];
+ foreach my $row ( @{$meta->{visible}} ) {
+ my %hash;
+ @hash{ @cols } = @{$meta->{cols}->{$row}}{@cols};
+ $hash{src} = '' if ref $hash{src};
+ $hash{name} = $row;
+ $hash{''} = $row eq $selected_column ? '>' : ' ';
+ push @$data, \%hash;
+ }
+ my @display_lines = create_table(\@cols, $info, $data);
+
+ # Highlight selected entry
+ for my $i ( 0 .. $#display_lines ) {
+ if ( $display_lines[$i] =~ m/^>/ ) {
+ $display_lines[$i] = [ $display_lines[$i], 'reverse' ];
+ }
+ }
+
+ # Draw the screen and wait for a command.
+ unshift @display_lines, '',
+ "Editing table definition for $meta->{capt}. Press ? for help, q to quit.", '';
+ draw_screen(\@display_lines, { clear => 1 });
+ $key = pause('');
+ }
+ } while ( $key ne 'q' );
+ }
+}
+
+# choose_mode_tables {{{3
+# Choose which table(s), and in what order, to display in a given mode.
+sub choose_mode_tables {
+ my $mode = $config{mode}->{val};
+ my @tbls = @{$modes{$mode}->{visible_tables}};
+ my $new = prompt_list(
+ "Choose tables to display",
+ join(' ', @tbls),
+ sub { return @{$modes{$mode}->{tables}} },
+ { map { $_ => $tbl_meta{$_}->{capt} } @{$modes{$mode}->{tables}} }
+ );
+ $modes{$mode}->{visible_tables} =
+ [ unique(grep { $_ && exists $tbl_meta{$_} } split(/\s+/, $new)) ];
+ $modes{$mode}->{cust}->{visible_tables} = 1;
+}
+
+# choose_visible_table {{{3
+sub choose_visible_table {
+ my ( $grep_cond ) = @_;
+ my $mode = $config{mode}->{val};
+ my @tbls
+ = grep { $grep_cond ? $grep_cond->($_) : 1 }
+ @{$modes{$mode}->{visible_tables}};
+ my $tbl = $tbls[0];
+ if ( @tbls > 1 ) {
+ $tbl = prompt_list(
+ "Choose a table",
+ '',
+ sub { return @tbls },
+ { map { $_ => $tbl_meta{$_}->{capt} } @tbls }
+ );
+ }
+ return $tbl;
+}
+
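+# Flips the 'aggregate' flag on the chosen table, turning grouping/aggregation on or off.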
+sub toggle_aggregate {
+ my ( $tbl ) = @_;
+ $tbl ||= choose_visible_table();
+ return unless $tbl && exists $tbl_meta{$tbl};
+ my $meta = $tbl_meta{$tbl};
+ $meta->{aggregate} ^= 1;
+}
+
+sub choose_filters {
+ my ( $tbl ) = @_;
+ $tbl ||= choose_visible_table();
+ return unless $tbl && exists $tbl_meta{$tbl};
+ my $meta = $tbl_meta{$tbl};
+ $clear_screen_sub->();
+
+ print "Choose filters for $meta->{capt}:\n";
+
+ my $ini = join(' ', @{$meta->{filters}});
+ my $val = prompt_list(
+ 'Choose filters',
+ $ini,
+ sub { return keys %filters },
+ {
+ map { $_ => $filters{$_}->{note} }
+ grep { grep { $tbl eq $_ } @{$filters{$_}->{tbls}} }
+ keys %filters
+ }
+ );
+
+ my @choices = unique(split(/\s+/, $val));
+ foreach my $new ( grep { !exists($filters{$_}) } @choices ) {
+ my $answer = prompt("There is no filter called '$new'. Create it?", undef, 'y');
+ if ( $answer eq 'y' ) {
+ create_new_filter($new, $tbl);
+ }
+ }
+ @choices = grep { exists $filters{$_} } @choices;
+ @choices = grep { grep { $tbl eq $_ } @{$filters{$_}->{tbls}} } @choices;
+ $meta->{filters} = [ @choices ];
+ $meta->{cust}->{filters} = 1;
+}
+
+sub choose_group_cols {
+ my ( $tbl ) = @_;
+ $tbl ||= choose_visible_table();
+ return unless $tbl && exists $tbl_meta{$tbl};
+ $clear_screen_sub->();
+ my $meta = $tbl_meta{$tbl};
+ my $curr = join(', ', @{$meta->{group_by}});
+ my $val = prompt_list(
+ 'Group-by columns',
+ $curr,
+ sub { return keys %{$meta->{cols}} },
+ { map { $_ => $meta->{cols}->{$_}->{label} } keys %{$meta->{cols}} });
+ if ( $curr ne $val ) {
+ $meta->{group_by} = [ grep { exists $meta->{cols}->{$_} } $val =~ m/(\w+)/g ];
+ $meta->{cust}->{group_by} = 1;
+ }
+}
+
+sub choose_sort_cols {
+ my ( $tbl ) = @_;
+ $tbl ||= choose_visible_table();
+ return unless $tbl && exists $tbl_meta{$tbl};
+ $clear_screen_sub->();
+ my $meta = $tbl_meta{$tbl};
+
+ my ( $cols, $hints );
+ if ( $meta->{pivot} ) {
+ $cols = sub { qw(name set_0) };
+ $hints = { name => 'name', set_0 => 'set_0' };
+ }
+ else {
+ $cols = sub { return keys %{$meta->{cols}} };
+ $hints = { map { $_ => $meta->{cols}->{$_}->{label} } keys %{$meta->{cols}} };
+ }
+
+ my $val = prompt_list(
+ 'Sort columns (reverse sort with -col)',
+ $meta->{sort_cols},
+ $cols,
+ $hints );
+ if ( $meta->{sort_cols} ne $val ) {
+ $meta->{sort_cols} = $val;
+ $meta->{cust}->{sort_cols} = 1;
+ $tbl_meta{$tbl}->{sort_func} = make_sort_func($tbl_meta{$tbl});
+ }
+}
+
+# create_new_filter {{{3
+sub create_new_filter {
+ my ( $filter, $tbl ) = @_;
+ $clear_screen_sub->();
+
+ if ( !$filter || $filter =~ m/\W/ ) {
+ print word_wrap("Choose a name for the filter. This name is not displayed, and is only used "
+ . "for internal reference. It can only contain lowercase letters, numbers, and underscores.");
+ print "\n\n";
+ do {
+ $filter = prompt("Enter filter name");
+ } while ( !$filter || $filter =~ m/\W/ );
+ }
+
+ my $completion = sub { keys %{$tbl_meta{$tbl}->{cols}} };
+ my ( $err, $sub, $body );
+ do {
+ $clear_screen_sub->();
+ print word_wrap("A filter is a Perl subroutine that accepts a hashref of columns "
+ . "called \$set, and returns a true value if the filter accepts the row. Example:\n"
+ . " \$set->{active_secs} > 5\n"
+ . "will only allow rows if their active_secs column is greater than 5.");
+ print "\n\n";
+ if ( $err ) {
+ print "There's an error in your filter expression: $err\n\n";
+ }
+ $body = prompt("Enter subroutine body", undef, undef, $completion);
+ ( $sub, $err ) = compile_filter($body);
+ } while ( $err );
+
+ $filters{$filter} = {
+ func => $sub,
+ text => $body,
+ user => 1,
+ name => $filter,
+ note => 'User-defined filter',
+ tbls => [$tbl],
+ };
+}
+
+# get_config_interactive {{{3
+sub get_config_interactive {
+ my $key = shift;
+ $clear_screen_sub->();
+
+ # Print help first.
+ print "Enter a new value for '$key' ($config{$key}->{note}).\n";
+
+ my $current = ref($config{$key}->{val}) ? join(" ", @{$config{$key}->{val}}) : $config{$key}->{val};
+
+ my $new_value = prompt('Enter a value', $config{$key}->{pat}, $current);
+ $config{$key}->{val} = $new_value;
+}
+
+sub edit_current_var_set {
+ my $mode = $config{mode}->{val};
+ my $name = $config{"${mode}_set"}->{val};
+ my $variables = $var_sets{$name}->{text};
+
+ my $new = $variables;
+ do {
+ $clear_screen_sub->();
+ $new = prompt("Enter variables for $name", undef, $variables);
+ } until ( $new );
+
+ if ( $new ne $variables ) {
+ @{$var_sets{$name}}{qw(text user)} = ( $new, 1);
+ }
+}
+
+
+sub choose_var_set {
+ my ( $key ) = @_;
+ $clear_screen_sub->();
+
+ my $new_value = prompt_list(
+ 'Choose a set of values to display, or enter the name of a new one',
+ $config{$key}->{val},
+ sub { return keys %var_sets },
+ { map { $_ => $var_sets{$_}->{text} } keys %var_sets });
+
+ if ( !exists $var_sets{$new_value} ) {
+ add_new_var_set($new_value);
+ }
+
+ $config{$key}->{val} = $new_value if exists $var_sets{$new_value};
+}
+
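+# Steps forward or backward ($dir is +1 or -1) through the defined variable sets
+# for the given config variable.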
+sub switch_var_set {
+ my ( $cfg_var, $dir ) = @_;
+ my @var_sets = sort keys %var_sets;
+ my $cur = $config{$cfg_var}->{val};
+ my $pos = grep { $_ lt $cur } @var_sets;
+ my $newpos = ($pos + $dir) % @var_sets;
+ $config{$cfg_var}->{val} = $var_sets[$newpos];
+ $clear_screen_sub->();
+}
+
+# Online configuration and prompting functions {{{2
+
+# edit_stmt_sleep_times {{{3
+sub edit_stmt_sleep_times {
+ $clear_screen_sub->();
+ my $stmt = prompt_list('Specify a statement', '', sub { return sort keys %stmt_maker_for });
+ return unless $stmt && exists $stmt_maker_for{$stmt};
+ $clear_screen_sub->();
+ my $curr_val = $stmt_sleep_time_for{$stmt} || 0;
+ my $new_val = prompt('Specify a sleep delay after calling this SQL', $num_regex, $curr_val);
+ if ( $new_val ) {
+ $stmt_sleep_time_for{$stmt} = $new_val;
+ }
+ else {
+ delete $stmt_sleep_time_for{$stmt};
+ }
+}
+
+# edit_server_groups {{{3
+# Choose which server connections are in a server group. First choose a group,
+# then choose which connections are in it.
+sub edit_server_groups {
+ $clear_screen_sub->();
+ my $mode = $config{mode}->{val};
+ my $group = $modes{$mode}->{server_group};
+ my %curr = %server_groups;
+ my $new = choose_or_create_server_group($group, 'to edit');
+ $clear_screen_sub->();
+ if ( exists $curr{$new} ) {
+ # Don't do this step if the user just created a new server group,
+ # because part of that process was to choose connections.
+ my $cxns = join(' ', @{$server_groups{$new}});
+ my @conns = choose_or_create_connection($cxns, 'for this group');
+ $server_groups{$new} = \@conns;
+ }
+}
+
+# choose_server_groups {{{3
+sub choose_server_groups {
+ $clear_screen_sub->();
+ my $mode = $config{mode}->{val};
+ my $group = $modes{$mode}->{server_group};
+ my $new = choose_or_create_server_group($group, 'for this mode');
+ $modes{$mode}->{server_group} = $new if exists $server_groups{$new};
+}
+
+sub choose_or_create_server_group {
+ my ( $group, $prompt ) = @_;
+ my $new = '';
+
+ my @available = sort keys %server_groups;
+
+ if ( @available ) {
+ print "You can enter the name of a new group to create it.\n";
+
+ $new = prompt_list(
+ "Choose a server group $prompt",
+ $group,
+ sub { return @available },
+ { map { $_ => join(' ', @{$server_groups{$_}}) } @available });
+
+ $new =~ s/\s.*//;
+
+ if ( !exists $server_groups{$new} ) {
+ my $answer = prompt("There is no server group called '$new'. Create it?", undef, "y");
+ if ( $answer eq 'y' ) {
+ add_new_server_group($new);
+ }
+ }
+ }
+ else {
+ $new = add_new_server_group();
+ }
+ return $new;
+}
+
+sub choose_or_create_connection {
+ my ( $cxns, $prompt ) = @_;
+ print "You can enter the name of a new connection to create it.\n";
+
+ my @available = sort keys %connections;
+ my $new_cxns = prompt_list(
+ "Choose connections $prompt",
+ $cxns,
+ sub { return @available },
+ { map { $_ => $connections{$_}->{dsn} } @available });
+
+ my @new = unique(grep { !exists $connections{$_} } split(/\s+/, $new_cxns));
+ foreach my $new ( @new ) {
+ my $answer = prompt("There is no connection called '$new'. Create it?", undef, "y");
+ if ( $answer eq 'y' ) {
+ add_new_dsn($new);
+ }
+ }
+
+ return unique(grep { exists $connections{$_} } split(/\s+/, $new_cxns));
+}
+
+# choose_servers {{{3
+sub choose_servers {
+ $clear_screen_sub->();
+ my $mode = $config{mode}->{val};
+ my $cxns = join(' ', get_connections());
+ my @chosen = choose_or_create_connection($cxns, 'for this mode');
+ $modes{$mode}->{connections} = \@chosen;
+ $modes{$mode}->{server_group} = ''; # Clear this because it overrides {connections}
+}
+
+# display_license {{{3
+sub display_license {
+ $clear_screen_sub->();
+
+ print $innotop_license;
+
+ pause();
+}
+
+# Data-retrieval functions {{{2
+# get_status_info {{{3
+# Get SHOW STATUS and SHOW VARIABLES together.
+sub get_status_info {
+ my @cxns = @_;
+ if ( !$info_gotten{status}++ ) {
+ foreach my $cxn ( @cxns ) {
+ $vars{$cxn}->{$clock} ||= {};
+ my $vars = $vars{$cxn}->{$clock};
+
+ my $sth = do_stmt($cxn, 'SHOW_STATUS') or next;
+ my $res = $sth->fetchall_arrayref();
+ map { $vars->{$_->[0]} = $_->[1] || 0 } @$res;
+
+ # Calculate hi-res uptime and add cxn to the hash. This duplicates get_driver_status,
+ # but consistency between the two is more important than avoiding the duplication.
+ $vars->{Uptime_hires} ||= get_uptime($cxn);
+ $vars->{cxn} = $cxn;
+
+ # Add SHOW VARIABLES to the hash
+ $sth = do_stmt($cxn, 'SHOW_VARIABLES') or next;
+ $res = $sth->fetchall_arrayref();
+ map { $vars->{$_->[0]} = $_->[1] || 0 } @$res;
+ }
+ }
+}
+
+# Chooses a thread for explaining, killing, etc...
+# First arg is a func that can be called in grep.
+sub choose_thread {
+ my ( $grep_cond, $prompt ) = @_;
+
+ # Find innotop's own thread on each connection so it can be excluded below.
+ my %thread_for = map {
+ # Eliminate innotop's own threads.
+ $_ => $dbhs{$_}->{dbh} ? $dbhs{$_}->{dbh}->{mysql_thread_id} : 0
+ } keys %connections;
+
+ my @candidates = grep {
+ $_->{id} != $thread_for{$_->{cxn}} && $grep_cond->($_)
+ } @current_queries;
+ return unless @candidates;
+
+ # Find out which server.
+ my @cxns = unique map { $_->{cxn} } @candidates;
+ my ( $cxn ) = select_cxn('On which server', @cxns);
+ return unless $cxn && exists($connections{$cxn});
+
+ # Re-filter the list of candidates to only those on this server
+ @candidates = grep { $_->{cxn} eq $cxn } @candidates;
+
+ # Find out which thread to do.
+ my $info;
+ if ( @candidates > 1 ) {
+
+ # Sort longest-active first, then longest-idle.
+ my $sort_func = sub {
+ my ( $a, $b ) = @_;
+ return $a->{query} && !$b->{query} ? 1
+ : $b->{query} && !$a->{query} ? -1
+ : ($a->{time} || 0) <=> ($b->{time} || 0);
+ };
+ my @threads = map { $_->{id} } reverse sort { $sort_func->($a, $b) } @candidates;
+
+ print "\n";
+ my $thread = prompt_list($prompt,
+ $threads[0],
+ sub { return @threads });
+ return unless $thread && $thread =~ m/$int_regex/;
+
+ # Find the info hash of that query on that server.
+ ( $info ) = grep { $thread == $_->{id} } @candidates;
+ }
+ else {
+ $info = $candidates[0];
+ }
+ return $info;
+}
+
+# analyze_query {{{3
+# Allows the user to show fulltext, explain, show optimized...
+sub analyze_query {
+ my ( $action ) = @_;
+
+ my $info = choose_thread(
+ sub { $_[0]->{query} },
+ 'Select a thread to analyze',
+ );
+ return unless $info;
+
+ my %actions = (
+ e => \&display_explain,
+ f => \&show_full_query,
+ o => \&show_optimized_query,
+ );
+ do {
+ $actions{$action}->($info);
+ print "\n";
+ $action = pause('Press e to explain, f for full query, o for optimized query');
+ } while ( exists($actions{$action}) );
+}
+
+# inc {{{3
+# Returns the difference between two sets of variables/status/innodb stuff.
+sub inc {
+ my ( $offset, $cxn ) = @_;
+ my $vars = $vars{$cxn};
+ if ( $offset < 0 ) {
+ return $vars->{$clock};
+ }
+ elsif ( exists $vars->{$clock - $offset} && !exists $vars->{$clock - $offset - 1} ) {
+ return $vars->{$clock - $offset};
+ }
+ my $cur = $vars->{$clock - $offset};
+ my $pre = $vars->{$clock - $offset - 1};
+ return {
+ # Numeric variables get subtracted, non-numeric get passed straight through.
+ map {
+ $_ =>
+ ( (defined $cur->{$_} && $cur->{$_} =~ m/$num_regex/)
+ ? $cur->{$_} - ($pre->{$_} || 0)
+ : $cur->{$_} )
+ } keys %{$cur}
+ };
+}
+
+# extract_values {{{3
+# Arguments are a set of values (which may be incremental, derived from
+# current and previous), current, and previous values.
+# TODO: there are a few places that don't remember prev set so can't pass it.
+sub extract_values {
+ my ( $set, $cur, $pre, $tbl ) = @_;
+
+ # Hook in event listeners
+ foreach my $listener ( @{$event_listener_for{extract_values}} ) {
+ $listener->extract_values($set, $cur, $pre, $tbl);
+ }
+
+ my $result = {};
+ my $meta = $tbl_meta{$tbl};
+ my $cols = $meta->{cols};
+ foreach my $key ( keys %$cols ) {
+ my $info = $cols->{$key}
+ or die "Column '$key' doesn't exist in $tbl";
+ die "No func defined for '$key' in $tbl"
+ unless $info->{func};
+ eval {
+ $result->{$key} = $info->{func}->($set, $cur, $pre)
+ };
+ if ( $EVAL_ERROR ) {
+ if ( $config{debug}->{val} ) {
+ die $EVAL_ERROR;
+ }
+ $result->{$key} = $info->{num} ? 0 : '';
+ }
+ }
+ return $result;
+}
+
+# get_full_processlist {{{3
+sub get_full_processlist {
+ my @cxns = @_;
+ my @result;
+ foreach my $cxn ( @cxns ) {
+ my $stmt = do_stmt($cxn, 'PROCESSLIST') or next;
+ my $arr = $stmt->fetchall_arrayref({});
+ push @result, map { $_->{cxn} = $cxn; $_ } @$arr;
+ }
+ return @result;
+}
+
+# get_open_tables {{{3
+sub get_open_tables {
+ my @cxns = @_;
+ my @result;
+ foreach my $cxn ( @cxns ) {
+ my $stmt = do_stmt($cxn, 'OPEN_TABLES') or next;
+ my $arr = $stmt->fetchall_arrayref({});
+ push @result, map { $_->{cxn} = $cxn; $_ } @$arr;
+ }
+ return @result;
+}
+
+# get_innodb_status {{{3
+sub get_innodb_status {
+ my ( $cxns, $addl_sections ) = @_;
+ if ( !$config{skip_innodb}->{val} && !$info_gotten{innodb_status}++ ) {
+
+ # Determine which sections need to be parsed
+ my %sections_required =
+ map { $tbl_meta{$_}->{innodb} => 1 }
+ grep { $_ && $tbl_meta{$_}->{innodb} }
+ get_visible_tables();
+
+ # Add in any other sections the caller requested.
+ foreach my $sec ( @$addl_sections ) {
+ $sections_required{$sec} = 1;
+ }
+
+ foreach my $cxn ( @$cxns ) {
+ my $innodb_status_text;
+
+ if ( $file ) { # Try to fetch status text from the file.
+ my @stat = stat($file);
+
+ # Initialize the file.
+ if ( !$file_mtime ) {
+ # Seek to 128k from the end of the file (the InnoDB status text is
+ # limited to 128k even with Google's patches) and try to grab the
+ # last status from the file.
+ sysseek($file, (-128 * 1_024), 2);
+ }
+
+ # Read from the file.
+ my $buffer;
+ if ( !$file_mtime || $file_mtime != $stat[9] ) {
+ $file_data = '';
+ while ( sysread($file, $buffer, 4096) ) {
+ $file_data .= $buffer;
+ }
+ $file_mtime = $stat[9];
+ }
+
+ # Delete everything but the last InnoDB status text from the file.
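+ # (The runs of dots in the pattern match the date and time in the monitor header line.)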
+ $file_data =~ s/\A.*(?=^=====================================\n...... ........ INNODB MONITOR OUTPUT)//ms;
+ $innodb_status_text = $file_data;
+ }
+
+ else {
+ my $stmt = do_stmt($cxn, 'INNODB_STATUS') or next;
+ $innodb_status_text = $stmt->fetchrow_hashref()->{status};
+ }
+
+ next unless $innodb_status_text
+ && substr($innodb_status_text, 0, 100) =~ m/INNODB MONITOR OUTPUT/;
+
+ # Parse and merge into %vars storage
+ my %innodb_status = (
+ $innodb_parser->get_status_hash(
+ $innodb_status_text,
+ $config{debug}->{val},
+ \%sections_required,
+ 0, # don't parse full lock information
+ )
+ );
+ if ( !$innodb_status{IB_got_all} && $config{auto_wipe_dl}->{val} ) {
+ clear_deadlock($cxn);
+ }
+
+ # Merge using a hash slice, which is the fastest way
+ $vars{$cxn}->{$clock} ||= {};
+ my $hash = $vars{$cxn}->{$clock};
+ @{$hash}{ keys %innodb_status } = values %innodb_status;
+ $hash->{cxn} = $cxn;
+ $hash->{Uptime_hires} ||= get_uptime($cxn);
+ }
+ }
+}
+
+# clear_deadlock {{{3
+sub clear_deadlock {
+ my ( $cxn ) = @_;
+ return if $clearing_deadlocks++;
+ my $tbl = $connections{$cxn}->{dl_table};
+ return unless $tbl;
+
+ eval {
+ # Set up the table for creating a deadlock.
+ my $engine = version_ge($dbhs{$cxn}->{dbh}, '4.1.2') ? 'engine' : 'type';
+ return unless do_query($cxn, "drop table if exists $tbl");
+ return unless do_query($cxn, "create table $tbl(a int) $engine=innodb");
+ return unless do_query($cxn, "delete from $tbl");
+ return unless do_query($cxn, "insert into $tbl(a) values(0), (1)");
+ return unless do_query($cxn, "commit"); # Or the children will block against the parent
+
+ # Fork off two children to deadlock against each other.
+ my %children;
+ foreach my $child ( 0..1 ) {
+ my $pid = fork();
+ if ( defined($pid) && $pid == 0 ) { # I am a child
+ deadlock_thread( $child, $tbl, $cxn );
+ }
+ elsif ( !defined($pid) ) {
+ die("Unable to fork for clearing deadlocks!\n");
+ }
+ # I already exited if I'm a child, so I'm the parent.
+ $children{$child} = $pid;
+ }
+
+ # Wait for the children to exit.
+ foreach my $child ( keys %children ) {
+ my $pid = waitpid($children{$child}, 0);
+ }
+
+ # Clean up.
+ do_query($cxn, "drop table $tbl");
+ };
+ if ( $EVAL_ERROR ) {
+ print $EVAL_ERROR;
+ pause();
+ }
+
+ $clearing_deadlocks = 0;
+}
+
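+# get_master_logs {{{3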
+sub get_master_logs {
+ my @cxns = @_;
+ my @result;
+ if ( !$info_gotten{master_logs}++ ) {
+ foreach my $cxn ( @cxns ) {
+ my $stmt = do_stmt($cxn, 'SHOW_MASTER_LOGS') or next;
+ push @result, @{$stmt->fetchall_arrayref({})};
+ }
+ }
+ return @result;
+}
+
+# get_master_slave_status {{{3
+sub get_master_slave_status {
+ my @cxns = @_;
+ if ( !$info_gotten{replication_status}++ ) {
+ foreach my $cxn ( @cxns ) {
+ $vars{$cxn}->{$clock} ||= {};
+ my $vars = $vars{$cxn}->{$clock};
+ $vars->{cxn} = $cxn;
+
+ my $stmt = do_stmt($cxn, 'SHOW_MASTER_STATUS') or next;
+ my $res = $stmt->fetchall_arrayref({})->[0];
+ @{$vars}{ keys %$res } = values %$res;
+ $stmt = do_stmt($cxn, 'SHOW_SLAVE_STATUS') or next;
+ $res = $stmt->fetchall_arrayref({})->[0];
+ @{$vars}{ keys %$res } = values %$res;
+ $vars->{Uptime_hires} ||= get_uptime($cxn);
+ }
+ }
+}
+
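+# is_func {{{3
+# Returns true if $word looks like a function or a valid Perl expression,
+# rather than a bareword such as a plain column name.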
+sub is_func {
+ my ( $word ) = @_;
+ return defined(&$word)
+ || eval "my \$x= sub { $word }; 1"
+ || $EVAL_ERROR !~ m/^Bareword/;
+}
+
+# Documentation {{{1
+# ############################################################################
+# I put this last as per the Dog book.
+# ############################################################################
+=pod
+
+=head1 NAME
+
+innotop - MySQL and InnoDB transaction/status monitor.
+
+=head1 SYNOPSIS
+
+To monitor servers normally:
+
+ innotop
+
+To monitor InnoDB status information from a file:
+
+ innotop /var/log/mysql/mysqld.err
+
+To run innotop non-interactively in a pipe-and-filter configuration:
+
+ innotop --count 5 -d 1 -n
+
+=head1 DESCRIPTION
+
+innotop monitors MySQL servers. Each of its modes shows you a different aspect
+of what's happening in the server. For example, there's a mode for monitoring
+replication, one for queries, and one for transactions. innotop refreshes its
+data periodically, so you see an updating view.
+
+innotop has lots of features for power users, but you can start and run it with
+virtually no configuration. If you're just getting started, see
+L<"QUICK-START">. Press '?' at any time while running innotop for
+context-sensitive help.
+
+=head1 QUICK-START
+
+To start innotop, open a terminal or command prompt. If you have installed
+innotop on your system, you should be able to just type "innotop" and press
+Enter; otherwise, you will need to change to innotop's directory and type "perl
+innotop".
+
+The first thing innotop needs to know is how to connect to a MySQL server. You
+can just enter the hostname of the server, for example "localhost" or
+"127.0.0.1" if the server is on the same machine as innotop. After this innotop
+will prompt you for a DSN (data source name). You should be able to just accept
+the defaults by pressing Enter.
+
+When innotop asks you about a table to use when resetting InnoDB deadlock
+information, just accept the default for now. This is an advanced feature you
+can configure later (see L<"D: InnoDB Deadlocks"> for more).
+
+If you have a .my.cnf file with your MySQL connection defaults, innotop can read
+it, and you won't need to specify a username and password if it's in that file.
+Otherwise, you should answer 'y' to the next couple of prompts.
+
+After this, you should be connected, and innotop should show you something like
+the following:
+
+ InnoDB Txns (? for help) localhost, 01:11:19, InnoDB 10s :-), 50 QPS,
+
+ CXN History Versions Undo Dirty Buf Used Bufs Txns MaxTxn
+ localhost 7 2035 0 0 0.00% 92.19% 1 07:34
+
+ CXN ID User Host Txn Status Time Undo Query Tex
+ localhost 98379 user1 webserver ACTIVE 07:34 0 SELECT `c
+ localhost 98450 user1 webserver ACTIVE 01:06 0 INSERT IN
+ localhost 97750 user1 webserver not starte 00:00 0
+ localhost 98375 user1 appserver not starte 00:00 0
+
+(This sample is truncated at the right so it will fit on a terminal when running
+'man innotop')
+
+This sample comes from a quiet server with few transactions active. If your
+server is busy, you'll see more output. Notice the first line on the screen,
+which tells you what mode you're in and what server you're connected to. You
+can change to other modes with keystrokes; press 'Q' to switch to a list of
+currently running queries.
+
+Press the '?' key to see what keys are active in the current mode. You can
+press any of these keys and innotop will either take the requested action or
+prompt you for more input. If your system has Term::ReadLine support, you can
+use TAB and other keys to auto-complete and edit input.
+
+To quit innotop, press the 'q' key.
+
+=head1 OPTIONS
+
+innotop is mostly configured via its configuration file, but some of the
+configuration options can come from the command line. You can also specify a
+file to monitor for InnoDB status output; see L<"MONITORING A FILE"> for more
+details.
+
+You can negate some options by prefixing the option name with --no. For
+example, --noinc (or --no-inc) negates L<"--inc">.
+
+=over
+
+=item --help
+
+Print a summary of command-line usage and exit.
+
+=item --color
+
+Enable or disable terminal coloring. Corresponds to the L<"color"> config file
+setting.
+
+=item --config
+
+Specifies a configuration file to read. This option is non-sticky; that is, it
+does not persist to the configuration file itself.
+
+=item --nonint
+
+Enable non-interactive operation. See L<"NON-INTERACTIVE OPERATION"> for more.
+
+=item --count
+
+Refresh only the specified number of times (ticks) before exiting. Each refresh
+is a pause for L<"interval"> seconds, followed by requesting data from MySQL
+connections and printing it to the terminal.
+
+=item --delay
+
+Specifies the amount of time to pause between ticks (refreshes). Corresponds to
+the configuration option L<"interval">.
+
+=item --mode
+
+Specifies the mode in which innotop should start. Corresponds to the
+configuration option L<"mode">.
+
+=item --inc
+
+Specifies whether innotop should display absolute numbers or relative numbers
+(offsets from their previous values). Corresponds to the configuration option
+L<"status_inc">.
+
+=item --version
+
+Output version information and exit.
+
+=back
+
+=head1 HOTKEYS
+
+innotop is interactive, and you control it with key-presses.
+
+=over
+
+=item *
+
+Uppercase keys switch between modes.
+
+=item *
+
+Lowercase keys initiate some action within the current mode.
+
+=item *
+
+Other keys do something special like change configuration or show the
+innotop license.
+
+=back
+
+Press '?' at any time to see the currently active keys and what they do.
+
+=head1 MODES
+
+Each of innotop's modes retrieves and displays a particular type of data from
+the servers you're monitoring. You switch between modes with uppercase keys.
+The following is a brief description of each mode, in alphabetical order. To
+switch to the mode, press the key listed in front of its heading in the
+following list:
+
+=over
+
+=item B: InnoDB Buffers
+
+This mode displays information about the InnoDB buffer pool, page statistics,
+insert buffer, and adaptive hash index. The data comes from SHOW INNODB STATUS.
+
+This mode contains the L<"buffer_pool">, L<"page_statistics">,
+L<"insert_buffers">, and L<"adaptive_hash_index"> tables by default.
+
+=item C: Command Summary
+
+This mode is similar to mytop's Command Summary mode. It shows the
+L<"cmd_summary"> table, which looks something like the following:
+
+ Command Summary (? for help) localhost, 25+07:16:43, 2.45 QPS, 3 thd, 5.0.40
+ _____________________ Command Summary _____________________
+ Name                      Value     Pct  Last Incr      Pct
+ Select_scan             3244858  69.89%          2  100.00%
+ Select_range            1354177  29.17%          0    0.00%
+ Select_full_join          39479   0.85%          0    0.00%
+ Select_full_range_join     4097   0.09%          0    0.00%
+ Select_range_check            0   0.00%          0    0.00%
+
+The command summary table is built by extracting variables from
+L<"STATUS_VARIABLES">. The variables must be numeric and must match the prefix
+given by the L<"cmd_filter"> configuration variable. The variables are then
+sorted by value descending and compared to the last variable, as shown above.
+The percentage columns are percentage of the total of all variables in the
+table, so you can see the relative weight of the variables.
+
+The example shows what you see if the prefix is "Select_". The default
+prefix is "Com_". You can choose a prefix with the 's' key.
+
+It's rather like running SHOW STATUS LIKE "prefix%" with memory and
+nice formatting.
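+
+If you are curious how the percentages are derived, the following is a
+simplified sketch of the calculation in Perl/DBI terms. It is not innotop's
+actual code, and the DSN, username, and password are placeholders you would
+need to change:
+
+ #!/usr/bin/perl
+ use strict;
+ use warnings;
+ use DBI;
+
+ my $prefix = 'Com_';   # plays the role of the cmd_filter variable
+ my $dbh = DBI->connect('DBI:mysql:;host=localhost', 'user', 'pass',
+                        { RaiseError => 1 });
+
+ my %val;
+ # Use plain SHOW STATUS on servers that do not support the GLOBAL keyword.
+ foreach my $row ( @{ $dbh->selectall_arrayref('SHOW GLOBAL STATUS') } ) {
+    my ( $name, $value ) = @$row;
+    $val{$name} = $value if $name =~ m/^\Q$prefix\E/ && $value =~ m/^\d+$/;
+ }
+
+ my $total = 0;
+ $total += $_ for values %val;
+ foreach my $name ( sort { $val{$b} <=> $val{$a} } keys %val ) {
+    printf "%-30s %12d %7.2f%%\n",
+       $name, $val{$name}, $total ? 100 * $val{$name} / $total : 0;
+ }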
+
+Values are aggregated across all servers. The Pct columns are not correctly
+aggregated across multiple servers. This is a known limitation of the grouping
+algorithm that may be fixed in the future.
+
+=item D: InnoDB Deadlocks
+
+This mode shows the transactions involved in the last InnoDB deadlock. A second
+table shows the locks each transaction held and waited for. A deadlock is
+caused by a cycle in the waits-for graph, so there should be two locks held and
+one waited for unless the deadlock information is truncated.
+
+InnoDB puts deadlock information before some other information in the SHOW
+INNODB STATUS output. If there are a lot of locks, the deadlock information can
+grow very large, and there is a limit on the size of the SHOW INNODB
+STATUS output. A large deadlock can fill the entire output, or even be
+truncated, and prevent you from seeing other information at all. If you are
+running innotop in another mode, for example T mode, and suddenly you don't see
+anything, you might want to check and see if a deadlock has wiped out the data
+you need.
+
+If it has, you can create a small deadlock to replace the large one. Use the
+'w' key to 'wipe' the large deadlock with a small one. This will not work
+unless you have defined a deadlock table for the connection (see L<"SERVER
+CONNECTIONS">).
+
+You can also configure innotop to automatically detect when a large deadlock
+needs to be replaced with a small one (see L<"auto_wipe_dl">).
+
+This mode displays the L<"deadlock_transactions"> and L<"deadlock_locks"> tables
+by default.
+
+=item F: InnoDB Foreign Key Errors
+
+This mode shows the last InnoDB foreign key error information, such as the
+table where it happened, when and who and what query caused it, and so on.
+
+InnoDB has a huge variety of foreign key error messages, and many of them are
+just hard to parse. innotop doesn't always do the best job here, but there's
+so much code devoted to parsing this messy, unparseable output that innotop is
+likely never to be perfect in this regard. If innotop doesn't show you what
+you need to see, just look at the status text directly.
+
+This mode displays the L<"fk_error"> table by default.
+
+=item I: InnoDB I/O Info
+
+This mode shows InnoDB's I/O statistics, including the I/O threads, pending I/O,
+file I/O miscellaneous, and log statistics. It displays the L<"io_threads">,
+L<"pending_io">, L<"file_io_misc">, and L<"log_statistics"> tables by default.
+
+=item L: Locks
+
+This mode shows information about current locks. At the moment only InnoDB
+locks are supported, and by default you'll only see locks for which transactions
+are waiting. This information comes from the TRANSACTIONS section of the InnoDB
+status text. If you have a very busy server, you may have frequent lock waits;
+it helps to be able to see which tables and indexes are the "hot spot" for
+locks. If your server is running pretty well, this mode should show nothing.
+
+You can configure MySQL and innotop to monitor not only locks for which a
+transaction is waiting, but those currently held, too. You can do this with the
+InnoDB Lock Monitor (L<http://dev.mysql.com/doc/en/innodb-monitor.html>). It's
+not documented in the MySQL manual, but creating the lock monitor with the
+following statement also affects the output of SHOW INNODB STATUS, which innotop
+uses:
+
+ CREATE TABLE innodb_lock_monitor(a int) ENGINE=INNODB;
+
+This causes InnoDB to print its output to the MySQL error log every 16 seconds or so,
+as stated in the manual, but it also makes the normal SHOW INNODB STATUS output
+include lock information, which innotop can parse and display (that's the
+undocumented feature).
+
+This means you can do what may have seemed impossible: to a limited extent
+(InnoDB truncates some information in the output), you can see which transaction
+holds the locks something else is waiting for. You can also enable and disable
+the InnoDB Lock Monitor with the key mappings in this mode.
+
+This mode displays the L<"innodb_locks"> table by default. Here's a sample of
+the screen when one connection is waiting for locks another connection holds:
+
+ _________________________________ InnoDB Locks __________________________
+ CXN        ID  Type    Waiting  Wait   Active  Mode  DB    Table  Index
+ localhost  12  RECORD        1  00:10  00:10   X     test  t1     PRIMARY
+ localhost  12  TABLE         0  00:10  00:10   IX    test  t1
+ localhost  12  RECORD        1  00:10  00:10   X     test  t1     PRIMARY
+ localhost  11  TABLE         0  00:00  00:25   IX    test  t1
+ localhost  11  RECORD        0  00:00  00:25   X     test  t1     PRIMARY
+
+You can see the first connection, ID 12, is waiting for a lock on the PRIMARY
+key on test.t1, and has been waiting for 10 seconds. The second connection
+isn't waiting, because the Waiting column is 0, but it holds locks on the same
+index. That tells you connection 11 is blocking connection 12.
+
+=item M: Master/Slave Replication Status
+
+This mode shows the output of SHOW SLAVE STATUS and SHOW MASTER STATUS in three
+tables. The first two divide the slave's status into SQL and I/O thread status,
+and the last shows master status. Filters are applied to eliminate non-slave
+servers from the slave tables, and non-master servers from the master table.
+
+This mode displays the L<"slave_sql_status">, L<"slave_io_status">, and
+L<"master_status"> tables by default.
+
+=item O: Open Tables
+
+This section comes from MySQL's SHOW OPEN TABLES command. By default it is
+filtered to show tables which are in use by one or more queries, so you can
+get a quick look at which tables are 'hot'. You can use this to guess which
+tables might be locked implicitly.
+
+This mode displays the L<"open_tables"> mode by default.
+
+=item Q: Query List
+
+This mode displays the output from SHOW FULL PROCESSLIST, much like B<mytop>'s
+query list mode. This mode does B<not> show InnoDB-related information. This
+is probably one of the most useful modes for general usage.
+
+There is an informative header that shows general status information about
+your server. You can toggle it on and off with the 'h' key. By default,
+innotop hides inactive processes and its own process. You can toggle these on
+and off with the 'i' and 'a' keys.
+
+You can EXPLAIN a query from this mode with the 'e' key. This displays the
+query's full text, the results of EXPLAIN, and in newer MySQL versions, even
+the optimized query resulting from EXPLAIN EXTENDED. innotop also tries to
+rewrite certain queries to make them EXPLAIN-able. For example, INSERT/SELECT
+statements are rewritable.
+
+This mode displays the L<"q_header"> and L<"processlist"> tables by default.
+
+=item R: InnoDB Row Operations and Semaphores
+
+This mode shows InnoDB row operations, row operation miscellaneous, semaphores,
+and information from the wait array. It displays the L<"row_operations">,
+L<"row_operation_misc">, L<"semaphores">, and L<"wait_array"> tables by default.
+
+=item S: Variables & Status
+
+This mode calculates statistics, such as queries per second, and prints them out
+in several different styles. You can show absolute values, or incremental values
+between ticks.
+
+You can switch between the views by pressing a key. The 's' key prints a
+single line each time the screen updates, in the style of B<vmstat>. The 'g'
+key changes the view to a graph of the same numbers, sort of like B<tload>.
+The 'v' key changes the view to a pivoted table of variable names on the left,
+with successive updates scrolling across the screen from left to right. You can
+choose how many updates to put on the screen with the L<"num_status_sets">
+configuration variable.
+
+Headers may be abbreviated to fit on the screen in interactive operation. You
+choose which variables to display with the 'c' key, which selects from
+predefined sets, or lets you create your own sets. You can edit the current set
+with the 'e' key.
+
+This mode doesn't really display any tables like other modes. Instead, it uses
+a table definition to extract and format the data, but it then transforms the
+result in special ways before outputting it. It uses the L<"var_status"> table
+definition for this.
+
+=item T: InnoDB Transactions
+
+This mode shows transactions from the InnoDB monitor's output, in B<top>-like
+format. This mode is the reason I wrote innotop.
+
+You can kill queries or processes with the 'k' and 'x' keys, and EXPLAIN a query
+with the 'e' or 'f' keys. InnoDB doesn't print the full query in transactions,
+so explaining may not work right if the query is truncated.
+
+The informational header can be toggled on and off with the 'h' key. By
+default, innotop hides inactive transactions and its own transaction. You can
+toggle this on and off with the 'i' and 'a' keys.
+
+This mode displays the L<"t_header"> and L<"innodb_transactions"> tables by
+default.
+
+=back
+
+=head1 INNOTOP STATUS
+
+The first line innotop displays is a "status bar" of sorts. What it contains
+depends on the mode you're in, and what servers you're monitoring. The first
+few words are always the innotop mode, such as "InnoDB Txns" for T mode,
+followed by a reminder to press '?' for help at any time.
+
+=head2 ONE SERVER
+
+The simplest case is when you're monitoring a single server. In this case, the
+name of the connection is next on the status line. This is the name you gave
+when you created the connection -- most likely the MySQL server's hostname.
+This is followed by the server's uptime.
+
+If you're in an InnoDB mode, such as T or B, the next word is "InnoDB" followed
+by some information about the SHOW INNODB STATUS output used to render the
+screen. The first word is the number of seconds since the last SHOW INNODB
+STATUS, which InnoDB uses to calculate some per-second statistics. The next is
+a smiley face indicating whether the InnoDB output is truncated. If the smiley
+face is a :-), all is well; there is no truncation. A :^| means the transaction
+list is so long, InnoDB has only printed out some of the transactions. Finally,
+a frown :-( means the output is incomplete, which is probably due to a deadlock
+printing too much lock information (see L<"D: InnoDB Deadlocks">).
+
+The next two words indicate the server's queries per second (QPS) and how many
+threads (connections) exist. Finally, the server's version number is the last
+thing on the line.
+
+=head2 MULTIPLE SERVERS
+
+If you are monitoring multiple servers (see L<"SERVER CONNECTIONS">), the status
+line does not show any details about individual servers. Instead, it shows the
+names of the connections that are active. Again, these are connection names you
+specified, which are likely to be the server's hostname. A connection that has
+an error is prefixed with an exclamation point.
+
+If you are monitoring a group of servers (see L<"SERVER GROUPS">), the status
+line shows the name of the group. If any connection in the group has an
+error, the group's name is followed by the fraction of the connections that
+don't have errors.
+
+See L<"ERROR HANDLING"> for more details about innotop's error handling.
+
+=head2 MONITORING A FILE
+
+If you give a filename on the command line, innotop will not connect to ANY
+servers at all. It will watch the specified file for InnoDB status output and
+use that as its data source. It will always show a single connection called
+'file'. Since it can't connect to a server, it can't determine how long the
+server it's monitoring has been up, so it calculates the server's uptime as the
+time since innotop itself started running.
+
+=head1 SERVER ADMINISTRATION
+
+While innotop is primarily a monitor that lets you watch and analyze your
+servers, it can also send commands to servers. The most frequently useful
+commands are killing queries and stopping or starting slaves.
+
+You can kill a connection, or in newer versions of MySQL kill a query but not a
+connection, from L<"Q: Query List"> and L<"T: InnoDB Transactions"> modes.
+Press 'k' to issue a KILL command, or 'x' to issue a KILL QUERY command.
+innotop will prompt you for the server and/or connection ID to kill (innotop
+does not prompt you if there is only one possible choice for any input).
+innotop pre-selects the longest-running query, or the oldest connection.
+Confirm the command with 'y'.
+
+In L<"M: Master/Slave Replication Status"> mode, you can start and stop slaves
+with the 'a' and 'o' keys, respectively. You can send these commands to many
+slaves at once. innotop fills in a default command of START SLAVE or STOP SLAVE
+for you, but you can actually edit the command and send anything you wish, such
+as SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1 to make the slave skip one binlog event
+when it starts.
+
+You can also ask innotop to calculate the earliest binlog in use by any slave
+and issue a PURGE MASTER LOGS on the master. Use the 'b' key for this. innotop
+will prompt you for a master to run the command on, then prompt you for the
+connection names of that master's slaves (there is no way for innotop to
+determine this reliably itself). innotop will find the minimum binlog in use by
+these slave connections and suggest it as the argument to PURGE MASTER LOGS.
+
+=head1 SERVER CONNECTIONS
+
+When you create a server connection, innotop asks you for a series of inputs, as
+follows:
+
+=over
+
+=item DSN
+
+A DSN is a Data Source Name, which is the initial argument passed to the DBI
+module for connecting to a server. It is usually of the form
+
+ DBI:mysql:;mysql_read_default_group=mysql;host=HOSTNAME
+
+Since this DSN is passed to the DBD::mysql driver, you should read the driver's
+documentation at L<"http://search.cpan.org/dist/DBD-mysql/lib/DBD/mysql.pm"> for
+the exact details on all the options you can pass the driver in the DSN. You
+can read more about DBI at L<http://dbi.perl.org/docs/>, and especially at
+L<http://search.cpan.org/~timb/DBI/DBI.pm>.
+
+The mysql_read_default_group=mysql option lets the DBD driver read your MySQL
+options files, such as ~/.my.cnf on UNIX-ish systems. You can use this to avoid
+specifying a username or password for the connection.
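+
+In Perl/DBI terms, connecting with such a DSN looks roughly like the following
+sketch (HOSTNAME is a placeholder; the username and password are left undefined
+here so they can come from your MySQL options file):
+
+ use DBI;
+ my $dbh = DBI->connect(
+    'DBI:mysql:;mysql_read_default_group=mysql;host=HOSTNAME',
+    undef, undef,               # credentials can come from ~/.my.cnf
+    { RaiseError => 1 },
+ );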
+
+=item InnoDB Deadlock Table
+
+This optional item tells innotop a table name it can use to deliberately create
+a small deadlock (see L<"D: InnoDB Deadlocks">). If you specify this option,
+you just need to be sure the table doesn't exist, and that innotop can create
+and drop the table with the InnoDB storage engine. You can safely omit or just
+accept the default if you don't intend to use this.
+
+=item Username
+
+innotop will ask you if you want to specify a username. If you say 'y', it will
+then prompt you for a user name. If your MySQL option file already specifies
+your username, you don't have to enter one here.
+
+The username defaults to your login name on the system you're running innotop on.
+
+=item Password
+
+innotop will ask you if you want to specify a password. Like the username, the
+password is optional, but there's an additional prompt that asks if you want to
+save the password in the innotop configuration file. If you don't save it in
+the configuration file, innotop will prompt you for a password each time it
+starts. Passwords in the innotop configuration file are saved in plain text,
+not encrypted in any way.
+
+=back
+
+Once you finish answering these questions, you should be connected to a server.
+But innotop isn't limited to monitoring a single server; you can define many
+server connections and switch between them by pressing the '@' key. See
+L<"SWITCHING BETWEEN CONNECTIONS">.
+
+To create a new connection, press the '@' key and type the name of the new
+connection, then follow the steps given above.
+
+=head1 SERVER GROUPS
+
+If you have multiple MySQL instances, you can put them into named groups, such
+as 'all', 'masters', and 'slaves', which innotop can monitor all together.
+
+You can choose which group to monitor with the '#' key, and you can press the
+TAB key to switch to the next group. If you're not currently monitoring a
+group, pressing TAB selects the first group.
+
+To create a group, press the '#' key and type the name of your new group, then
+type the names of the connections you want the group to contain.
+
+=head1 SWITCHING BETWEEN CONNECTIONS
+
+innotop lets you quickly switch which servers you're monitoring. The most basic
+way is by pressing the '@' key and typing the name(s) of the connection(s) you
+want to use. This setting is per-mode, so you can monitor different connections
+in each mode, and innotop remembers which connections you choose.
+
+You can quickly switch to the 'next' connection in alphabetical order with the
+'n' key. If you're monitoring a server group (see L<"SERVER GROUPS">) this will
+switch to the first connection.
+
+You can also type many connection names, and innotop will fetch and display data
+from them all. Just separate the connection names with spaces, for example
+"server1 server2." Again, if you type the name of a connection that doesn't
+exist, innotop will prompt you for connection information and create the
+connection.
+
+Another way to monitor multiple connections at once is with server groups. You
+can use the TAB key to switch to the 'next' group in alphabetical order, or if
+you're not monitoring any groups, TAB will switch to the first group.
+
+innotop does not fetch data in parallel from connections, so if you are
+monitoring a large group or many connections, you may notice increased delay
+between ticks.
+
+When you monitor more than one connection, innotop's status bar changes. See
+L<"INNOTOP STATUS">.
+
+=head1 ERROR HANDLING
+
+Error handling is not that important when monitoring a single connection, but is
+crucial when you have many active connections. A crashed server or lost
+connection should not crash innotop. As a result, innotop will continue to run
+even when there is an error; it just won't display any information from the
+connection that had an error. Because of this, innotop's behavior might confuse
+you. It's a feature, not a bug!
+
+innotop does not continue to query connections that have errors, because they
+may slow innotop and make it hard to use, especially if the error is a problem
+connecting and causes a long time-out. Instead, innotop retries the connection
+occasionally to see if the error still exists. If it does, innotop waits longer
+before the next retry; the wait time, measured in ticks, grows according to the
+Fibonacci series, so retries become less frequent as time passes.
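+
+The retry schedule grows roughly like the following sketch (illustrative only,
+not innotop's actual code; it just prints the series of wait intervals):
+
+ my ( $prev, $wait ) = ( 0, 1 );
+ for my $retry ( 1 .. 8 ) {
+    print "retry $retry after $wait ticks\n";
+    ( $prev, $wait ) = ( $wait, $prev + $wait );
+ }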
+
+Since errors might only happen in certain modes because of the SQL commands
+issued in those modes, innotop keeps track of which mode caused the error. If
+you switch to a different mode, innotop will retry the connection instead of
+waiting.
+
+By default innotop will display the problem in red text at the bottom of the
+first table on the screen. You can disable this behavior with the
+L<"show_cxn_errors_in_tbl"> configuration option, which is enabled by default.
+If the L<"debug"> option is enabled, innotop will display the error at the
+bottom of every table, not just the first. And if L<"show_cxn_errors"> is
+enabled, innotop will print the error text to STDOUT as well. Error messages
+might only display in the mode that caused the error, depending on the mode and
+whether innotop is avoiding querying that connection.
+
+=head1 NON-INTERACTIVE OPERATION
+
+You can run innotop in non-interactive mode, in which case it is entirely
+controlled from the configuration file and command-line options. To start
+innotop in non-interactive mode, give the L"<--nonint"> command-line option.
+This changes innotop's behavior in the following ways:
+
+=over
+
+=item *
+
+Certain Perl modules are not loaded. Term::ReadLine is not loaded, since
+innotop doesn't prompt interactively. The Term::ANSIColor and
+Win32::Console::ANSI modules are not loaded either. Term::ReadKey is still
+used, since innotop may have to prompt for connection passwords when starting
+up.
+
+=item *
+
+innotop does not clear the screen after each tick.
+
+=item *
+
+innotop does not persist any changes to the configuration file.
+
+=item *
+
+If L<"--count"> is given and innotop is in incremental mode (see L<"status_inc">
+and L<"--inc">), innotop actually refreshes one more time than specified so it
+can print incremental statistics. This suppresses output during the first
+tick, so innotop may appear to hang.
+
+=item *
+
+innotop only displays the first table in each mode. This is so the output can
+be easily processed with other command-line utilities such as awk and sed (see
+the example after this list). To
+change which tables display in each mode, see L<"TABLES">. Since L<"Q: Query
+List"> mode is so important, innotop automatically disables the L<"q_header">
+table. This ensures you'll see the L<"processlist"> table, even if you have
+innotop configured to show the q_header table during interactive operation.
+Similarly, in L<"T: InnoDB Transactions"> mode, the L<"t_header"> table is
+suppressed so you see only the L<"innodb_transactions"> table.
+
+=item *
+
+All output is tab-separated instead of being column-aligned with whitespace, and
+innotop prints the full contents of each table instead of only printing one
+screenful at a time.
+
+=item *
+
+innotop only prints column headers once instead of every tick (see
+L<"hide_hdr">). innotop does not print table captions (see
+L<"display_table_captions">). innotop ensures there are no empty lines in the
+output.
+
+=item *
+
+innotop does not honor the L<"shorten"> transformation, which normally shortens
+some numbers to human-readable formats.
+
+=item *
+
+innotop does not print a status line (see L<"INNOTOP STATUS">).
+
+=back
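+
+For example, because the output is tab-separated and the column headers print
+only once, it is easy to post-process. The following pipeline (illustrative
+only) prints just the first column of each line:
+
+ innotop --count 10 -d 1 -n | perl -F'\t' -lane 'print $F[0]'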
+
+=head1 CONFIGURING
+
+Nearly everything about innotop is configurable. Most things are possible to
+change with built-in commands, but you can also edit the configuration file.
+
+While running innotop, press the '$' key to bring up the configuration editing
+dialog. Press another key to select the type of data you want to edit:
+
+=over
+
+=item S: Statement Sleep Times
+
+Edits SQL statement sleep delays, which make innotop pause for the specified
+amount of time after executing a statement. See L<"SQL STATEMENTS"> for a
+definition of each statement and what it does. By default innotop does not
+delay after any statements.
+
+This feature is included so you can customize the side-effects caused by
+monitoring your server. You may not see any effects, but some innotop users
+have noticed that certain MySQL versions under very high load with InnoDB
+enabled take longer than usual to execute SHOW GLOBAL STATUS. If innotop calls
+SHOW FULL PROCESSLIST immediately afterward, the processlist contains more
+queries than the machine actually averages at any given moment. Configuring
+innotop to pause briefly after calling SHOW GLOBAL STATUS alleviates this
+effect.
+
+Sleep times are stored in the L<"stmt_sleep_times"> section of the configuration
+file. Fractional-second sleeps are supported, subject to your hardware's
+limitations.
+
+=item c: Edit Columns
+
+Starts the table editor on one of the displayed tables. See L<"TABLE EDITOR">.
+An alternative way to start the table editor without entering the configuration
+dialog is with the '^' key.
+
+=item g: General Configuration
+
+Starts the configuration editor to edit global and mode-specific configuration
+variables (see L<"MODES">). innotop prompts you to choose a variable from among
+the global and mode-specific ones depending on the current mode.
+
+=item k: Row-Coloring Rules
+
+Starts the row-coloring rules editor on one of the displayed table(s). See
+L<"COLORS"> for details.
+
+=item p: Manage Plugins
+
+Starts the plugin configuration editor. See L<"PLUGINS"> for details.
+
+=item s: Server Groups
+
+Lets you create and edit server groups. See L<"SERVER GROUPS">.
+
+=item t: Choose Displayed Tables
+
+Lets you choose which tables to display in this mode. See L<"MODES"> and
+L<"TABLES">.
+
+=back
+
+=head1 CONFIGURATION FILE
+
+innotop's default configuration file location is $HOME/.innotop, but it can be
+overridden with the L<"--config"> command-line option. You can edit the file by
+hand safely; however, innotop reads the configuration file when it starts and
+writes it out again when it exits, so any changes you make to the file by hand
+while innotop is running will be lost.
+
+innotop doesn't store its entire configuration in the configuration file. It
+has a huge set of default configuration that it holds only in memory, and the
+configuration file only overrides these defaults. When you customize a default
+setting, innotop notices, and then stores the customizations into the file.
+This keeps the file size down, makes it easier to edit, and makes upgrades
+easier.
+
+A configuration file can be made read-only. See L<"readonly">.
+
+The configuration file is arranged into sections like an INI file. Each
+section begins with [section-name] and ends with [/section-name]. Each
+section's entries have a different syntax depending on the data they need to
+store. You can put comments in the file; any line that begins with a #
+character is a comment. innotop discards comments when it reads the file, so it
+won't write them back out when it exits. Comments in read-only configuration
+files are still useful, though.
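+
+For illustration, a section looks something like the following skeleton (the
+key names are real, but the values and the comment are only examples):
+
+ [general]
+ # a comment innotop will ignore
+ mode=T
+ interval=10
+ [/general]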
+
+The first line in the file is innotop's version number. This lets innotop
+notice when the file format is not backwards-compatible, and upgrade smoothly
+without destroying your customized configuration.
+
+The following list describes each section of the configuration file and the data
+it contains:
+
+=over
+
+=item general
+
+The 'general' section contains global configuration variables and variables that
+may be mode-specific, but don't belong in any other section. The syntax is a
+simple key=value list. innotop writes a comment above each value to help you
+edit the file by hand.
+
+=over
+
+=item S_func
+
+Controls S mode presentation (see L<"S: Variables & Status">). If g, values are
+graphed; if s, values are like vmstat; if p, values are in a pivoted table.
+
+=item S_set
+
+Specifies which set of variables to display in L<"S: Variables & Status"> mode.
+See L<"VARIABLE SETS">.
+
+=item auto_wipe_dl
+
+Instructs innotop to automatically wipe large deadlocks when it notices them.
+When this happens you may notice a slight delay. At the next tick, you will
+usually see the information that was being truncated by the large deadlock.
+
+=item charset
+
+Specifies what kind of characters to allow through the L<"no_ctrl_char">
+transformation. This keeps non-printable characters from confusing a
+terminal when you monitor queries that contain binary data, such as images.
+
+The default is 'ascii', which considers anything outside normal ASCII to be a
+control character. The other allowable values are 'unicode' and 'none'. 'none'
+considers every character a control character, which can be useful for
+collapsing ALL text fields in queries.
+
+=item cmd_filter
+
+This is the prefix that filters variables in L<"C: Command Summary"> mode.
+
+=item color
+
+Whether terminal coloring is permitted.
+
+=item cxn_timeout
+
+On MySQL versions 4.0.3 and newer, this variable is used to set the connection's
+timeout, so MySQL doesn't close the connection if it is not used for a while.
+This might happen because a connection isn't monitored in a particular mode, for
+example.
+
+=item debug
+
+This option enables more verbose errors and makes innotop more strict in some
+places. It can help in debugging filters and other user-defined code. It also
+makes innotop write a lot of information to L<"debugfile"> when there is a
+crash.
+
+=item debugfile
+
+A file to which innotop will write information when there is a crash. See
+L<"FILES">.
+
+=item display_table_captions
+
+innotop displays a table caption above most tables. This variable suppresses or
+shows captions on all tables globally. Some tables are configured with the
+hide_caption property, which overrides this.
+
+=item global
+
+Whether to show GLOBAL variables and status. innotop only tries to do this on
+servers which support the GLOBAL option to SHOW VARIABLES and SHOW STATUS. In
+some MySQL versions, you need certain privileges to do this; if you don't have
+them, innotop will not be able to fetch any variable and status data. This
+configuration variable lets you run innotop and fetch what data you can even
+without the elevated privileges.
+
+I can no longer find or reproduce the situation where GLOBAL wasn't allowed, but
+I know there was one.
+
+=item graph_char
+
+Defines the character to use when drawing graphs in L<"S: Variables & Status">
+mode.
+
+=item header_highlight
+
+Defines how to highlight column headers. This only works if Term::ANSIColor is
+available. Valid values are 'bold' and 'underline'.
+
+=item hide_hdr
+
+Hides column headers globally.
+
+=item interval
+
+The interval at which innotop will refresh its data (ticks). The interval is
+implemented as a sleep time between ticks, so the true interval will vary
+depending on how long it takes innotop to fetch and render data.
+
+This variable accepts fractions of a second.
+
+=item mode
+
+The mode in which innotop should start. Allowable arguments are the same as the
+key presses that select a mode interactively. See L<"MODES">.
+
+=item num_digits
+
+How many digits to show in fractional numbers and percents. This variable's
+range is between 0 and 9 and can be set directly from L<"S: Variables & Status">
+mode with the '+' and '-' keys. It is used in the L<"set_precision">,
+L<"shorten">, and L<"percent"> transformations.
+
+=item num_status_sets
+
+Controls how many sets of status variables to display in pivoted L<"S: Variables
+& Status"> mode. It also controls the number of old sets of variables innotop
+keeps in its memory, so the larger this variable is, the more memory innotop
+uses.
+
+=item plugin_dir
+
+Specifies where plugins can be found. By default, innotop stores plugins in the
+'plugins' subdirectory of your innotop configuration directory.
+
+=item readonly
+
+Whether the configuration file is readonly. This cannot be set interactively,
+because it would prevent itself from being written to the configuration file.
+
+=item show_cxn_errors
+
+Makes innotop print connection errors to STDOUT. See L<"ERROR HANDLING">.
+
+=item show_cxn_errors_in_tbl
+
+Makes innotop display connection errors as rows in the first table on screen.
+See L<"ERROR HANDLING">.
+
+=item show_percent
+
+Adds a '%' character after the value returned by the L<"percent">
+transformation.
+
+=item show_statusbar
+
+Controls whether to show the status bar in the display. See L<"INNOTOP
+STATUS">.
+
+=item skip_innodb
+
+Disables fetching SHOW INNODB STATUS, in case your server(s) do not have InnoDB
+enabled and you don't want innotop to try to fetch it. This can also be useful
+when you don't have the SUPER privilege, required to run SHOW INNODB STATUS.
+
+=item status_inc
+
+Whether to show absolute or incremental values for status variables.
+Incremental values are calculated as an offset from the last value innotop saw
+for that variable. This is a global setting, but will probably become
+mode-specific at some point. Right now it is honored a bit inconsistently; some
+modes don't pay attention to it.
+
+=back
+
+=item plugins
+
+This section holds a list of package names of active plugins. If the plugin
+exists, innotop will activate it. See L<"PLUGINS"> for more information.
+
+=item filters
+
+This section holds user-defined filters (see L<"FILTERS">). Each line is in the
+format filter_name=text='filter text' tbls='table list'.
+
+The filter text is the text of the subroutine's code. The table list is a list
+of tables to which the filter can apply. By default, user-defined filters apply
+to the table for which they were created, but you can manually override that by
+editing the definition in the configuration file.
+
+=item active_filters
+
+This section stores which filters are active on each table. Each line is in the
+format table_name=filter_list.
+
+=item tbl_meta
+
+This section stores user-defined or user-customized columns (see L<"COLUMNS">).
+Each line is in the format col_name=properties, where the properties are a
+name=quoted-value list.
+
+=item connections
+
+This section holds the server connections you have defined. Each line is in the
+format name=properties, where the properties are a name=value list. The
+properties are self-explanatory, and the only one that is treated specially is
+'pass' which is only present if 'savepass' is set. See L<"SERVER CONNECTIONS">.
+
+=item active_connections
+
+This section holds a list of which connections are active in each mode. Each
+line is in the format mode_name=connection_list.
+
+=item server_groups
+
+This section holds server groups. Each line is in the format
+name=connection_list. See L<"SERVER GROUPS">.
+
+=item active_server_groups
+
+This section holds a list of which server group is active in each mode. Each
+line is in the format mode_name=server_group.
+
+=item max_values_seen
+
+This section holds the maximum values seen for variables. This is used to scale
+the graphs in L<"S: Variables & Status"> mode. Each line is in the format
+name=value.
+
+=item active_columns
+
+This section holds table column lists. Each line is in the format
+tbl_name=column_list. See L<"COLUMNS">.
+
+=item sort_cols
+
+This section holds the sort definition. Each line is in the format
+tbl_name=column_list. If a column is prefixed with '-', that column sorts
+descending. See L<"SORTING">.
+
+=item visible_tables
+
+This section defines which tables are visible in each mode. Each line is in the
+format mode_name=table_list. See L<"TABLES">.
+
+=item varsets
+
+This section defines variable sets for use in L<"S: Status & Variables"> mode.
+Each line is in the format name=variable_list. See L<"VARIABLE SETS">.
+
+=item colors
+
+This section defines colorization rules. Each line is in the format
+tbl_name=property_list. See L<"COLORS">.
+
+=item stmt_sleep_times
+
+This section contains statement sleep times. Each line is in the format
+statement_name=sleep_time. See L<"S: Statement Sleep Times">.
+
+=item group_by
+
+This section contains column lists for table group_by expressions. Each line is
+in the format tbl_name=column_list. See L<"GROUPING">.
+
+=back
+
+=head1 CUSTOMIZING
+
+You can customize innotop a great deal. For example, you can:
+
+=over
+
+=item *
+
+Choose which tables to display, and in what order.
+
+=item *
+
+Choose which columns are in those tables, and create new columns.
+
+=item *
+
+Filter which rows display with built-in filters, user-defined filters, and
+quick-filters.
+
+=item *
+
+Sort the rows to put important data first or group together related rows.
+
+=item *
+
+Highlight rows with color.
+
+=item *
+
+Customize the alignment, width, and formatting of columns, and apply
+transformations to columns to extract parts of their values or format the values
+as you wish (for example, shortening large numbers to familiar units).
+
+=item *
+
+Design your own expressions to extract and combine data as you need. This gives
+you unlimited flexibility.
+
+=back
+
+All these and more are explained in the following sections.
+
+=head2 TABLES
+
+A table is what you'd expect: a collection of columns. It also has some other
+properties, such as a caption. Filters, sorting rules, and colorization rules
+belong to tables and are covered in later sections.
+
+Internally, table meta-data is defined in a data structure called %tbl_meta.
+This hash holds all built-in table definitions, which contain a lot of default
+instructions to innotop. The meta-data includes the caption, a list of columns
+the user has customized, a list of columns, a list of visible columns, a list of
+filters, color rules, a sort-column list, sort direction, and some information
+about the table's data sources. Most of this is customizable via the table
+editor (see L<"TABLE EDITOR">).
+
+You can choose which tables to show by pressing the '$' key. See L<"MODES"> and
+L<"TABLES">.
+
+The table life-cycle is as follows:
+
+=over
+
+=item *
+
+Each table begins with a data source, which is an array of hashes. See below
+for details on data sources.
+
+=item *
+
+Each element of the data source becomes a row in the final table.
+
+=item *
+
+For each element in the data source, innotop extracts values from the source and
+creates a row. This row is another hash, which later steps will refer to as
+$set. The values innotop extracts are determined by the table's columns. Each
+column has an extraction subroutine, compiled from an expression (see
+L<"EXPRESSIONS">). The resulting row is a hash whose keys are named the same as
+the column name.
+
+=item *
+
+innotop filters the rows, removing those that don't need to be displayed. See
+L<"FILTERS">.
+
+=item *
+
+innotop sorts the rows. See L<"SORTING">.
+
+=item *
+
+innotop groups the rows together, if specified. See L<"GROUPING">.
+
+=item *
+
+innotop colorizes the rows. See L<"COLORS">.
+
+=item *
+
+innotop transforms the column values in each row. See L<"TRANSFORMATIONS">.
+
+=item *
+
+innotop optionally pivots the rows (see L<"PIVOTING">), then filters and sorts
+them.
+
+=item *
+
+innotop formats and justifies the rows as a table. During this step, innotop
+applies further formatting to the column values, including alignment, maximum
+and minimum widths. innotop also does final error checking to ensure there are
+no crashes due to undefined values. innotop then adds a caption if specified,
+and the table is ready to print.
+
+=back
+
+The lifecycle is slightly different if the table is pivoted, as noted above. To
+clarify, if the table is pivoted, the process is extract, group, transform,
+pivot, filter, sort, create. If it's not pivoted, the process is extract,
+filter, sort, group, color, transform, create. This slightly convoluted process
+doesn't map all that well to SQL, but pivoting complicates things pretty
+thoroughly. Roughly speaking, filtering and sorting happen as late as needed
+to produce the final result you expect, but as early as possible for
+efficiency.
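+
+To make the non-pivoted path concrete, here is a tiny self-contained sketch of
+the extract, filter, sort, transform, and format steps, with made-up data (it
+is illustrative only, not innotop's code):
+
+ use strict;
+ use warnings;
+
+ my @source = (                                  # data source: array of hashes
+    { user => 'app', time => 12 },
+    { user => 'web', time => 3  },
+ );
+ my @rows = map { { %$_ } } @source;             # extract values into rows
+ @rows = grep { $_->{time} > 5 } @rows;          # filter, like a WHERE clause
+ @rows = sort { $b->{time} <=> $a->{time} } @rows;   # sort
+ $_->{time} = "$_->{time}s" for @rows;           # transform a column value
+ printf "%-6s %5s\n", 'user', 'time';            # format and print
+ printf "%-6s %5s\n", $_->{user}, $_->{time} for @rows;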
+
+Each built-in table is described below:
+
+=over
+
+=item adaptive_hash_index
+
+Displays data about InnoDB's adaptive hash index. Data source:
+L<"STATUS_VARIABLES">.
+
+=item buffer_pool
+
+Displays data about InnoDB's buffer pool. Data source: L<"STATUS_VARIABLES">.
+
+=item cmd_summary
+
+Displays weighted status variables. Data source: L<"STATUS_VARIABLES">.
+
+=item deadlock_locks
+
+Shows which locks were held and waited for by the last detected deadlock. Data
+source: L<"DEADLOCK_LOCKS">.
+
+=item deadlock_transactions
+
+Shows transactions involved in the last detected deadlock. Data source:
+L<"DEADLOCK_TRANSACTIONS">.
+
+=item explain
+
+Shows the output of EXPLAIN. Data source: L<"EXPLAIN">.
+
+=item file_io_misc
+
+Displays data about InnoDB's file and I/O operations. Data source:
+L<"STATUS_VARIABLES">.
+
+=item fk_error
+
+Displays various data about InnoDB's last foreign key error. Data source:
+L<"STATUS_VARIABLES">.
+
+=item innodb_locks
+
+Displays InnoDB locks. Data source: L<"INNODB_LOCKS">.
+
+=item innodb_transactions
+
+Displays data about InnoDB's current transactions. Data source:
+L<"INNODB_TRANSACTIONS">.
+
+=item insert_buffers
+
+Displays data about InnoDB's insert buffer. Data source: L<"STATUS_VARIABLES">.
+
+=item io_threads
+
+Displays data about InnoDB's I/O threads. Data source: L<"IO_THREADS">.
+
+=item log_statistics
+
+Displays data about InnoDB's logging system. Data source: L<"STATUS_VARIABLES">.
+
+=item master_status
+
+Displays replication master status. Data source: L<"STATUS_VARIABLES">.
+
+=item open_tables
+
+Displays open tables. Data source: L<"OPEN_TABLES">.
+
+=item page_statistics
+
+Displays InnoDB page statistics. Data source: L<"STATUS_VARIABLES">.
+
+=item pending_io
+
+Displays InnoDB pending I/O operations. Data source: L<"STATUS_VARIABLES">.
+
+=item processlist
+
+Displays current MySQL processes (threads/connections). Data source:
+L<"PROCESSLIST">.
+
+=item q_header
+
+Displays various status values. Data source: L<"STATUS_VARIABLES">.
+
+=item row_operation_misc
+
+Displays data about InnoDB's row operations. Data source:
+L<"STATUS_VARIABLES">.
+
+=item row_operations
+
+Displays data about InnoDB's row operations. Data source:
+L<"STATUS_VARIABLES">.
+
+=item semaphores
+
+Displays data about InnoDB's semaphores and mutexes. Data source:
+L<"STATUS_VARIABLES">.
+
+=item slave_io_status
+
+Displays data about the slave I/O thread. Data source:
+L<"STATUS_VARIABLES">.
+
+=item slave_sql_status
+
+Displays data about the slave SQL thread. Data source: L<"STATUS_VARIABLES">.
+
+=item t_header
+
+Displays various InnoDB status values. Data source: L<"STATUS_VARIABLES">.
+
+=item var_status
+
+Displays user-configurable data. Data source: L<"STATUS_VARIABLES">.
+
+=item wait_array
+
+Displays data about InnoDB's OS wait array. Data source: L<"OS_WAIT_ARRAY">.
+
+=back
+
+=head2 COLUMNS
+
+Columns belong to tables. You can choose a table's columns by pressing the '^'
+key, which starts the L<"TABLE EDITOR"> and lets you choose and edit columns.
+Pressing 'e' from within the table editor lets you edit the column's
+properties, which are listed below (a sketch of a complete column definition
+follows the list):
+
+=over
+
+=item *
+
+hdr: a column header. This appears in the first row of the table.
+
+=item *
+
+just: justification. '-' means left-justified and '' means right-justified,
+just as with printf formatting codes (not a coincidence).
+
+=item *
+
+dec: whether to further align the column on the decimal point.
+
+=item *
+
+num: whether the column is numeric. This affects how values are sorted
+(lexically or numerically).
+
+=item *
+
+label: a small note about the column, which appears in dialogs that help the
+user choose columns.
+
+=item *
+
+src: an expression that innotop uses to extract the column's data from its
+source (see L<"DATA SOURCES">). See L<"EXPRESSIONS"> for more on expressions.
+
+=item *
+
+minw: specifies a minimum display width. This helps stabilize the display,
+which makes it easier to read if the data is changing frequently.
+
+=item *
+
+maxw: specifies a maximum display width; otherwise similar to minw.
+
+=item *
+
+trans: a list of column transformations. See L<"TRANSFORMATIONS">.
+
+=item *
+
+agg: an aggregate function. See L<"GROUPING">. The default is L<"first">.
+
+=item *
+
+aggonly: controls whether the column is shown only when grouping is enabled on
+the table (see L<"GROUPING">). By default this is disabled, so columns are
+always shown whether grouping is enabled or not. If a column's aggonly is set
+true, the column appears only when you toggle grouping on the table. Several
+columns are set this way, such as the count column on L<"processlist"> and
+L<"innodb_transactions">, so you don't see a count when grouping isn't
+enabled, but you do when it is.
+
+=back
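+
+Internally, a column is just a hash of these properties. Purely as a sketch
+(the values here are hypothetical, but the keys are the properties listed
+above), a 'time' column might be defined roughly like this:
+
+ my $col = {
+    hdr   => 'Time',
+    just  => '-',
+    dec   => 0,
+    num   => 1,
+    label => 'Time the query has run',
+    src   => 'time',
+    trans => [ qw(secs_to_time) ],
+    agg   => 'sum',
+ };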
+
+=head2 FILTERS
+
+Filters remove rows from the display. They behave much like a WHERE clause in
+SQL. innotop has several built-in filters, which remove irrelevant information
+like inactive queries, but you can define your own as well. innotop also lets
+you create quick-filters, which do not get saved to the configuration file, and
+are just an easy way to quickly view only some rows.
+
+You can enable or disable a filter on any table. Press the '%' key (mnemonic: %
+looks kind of like a line being filtered between two circles) and choose which
+table you want to filter, if asked. You'll then see a list of possible filters
+and a list of filters currently enabled for that table. Type the names of
+filters you want to apply and press Enter.
+
+=head3 USER-DEFINED FILTERS
+
+If you type a name that doesn't exist, innotop will prompt you to create the
+filter. Filters are easy to create if you know Perl, and not hard if you don't.
+What you're doing is creating a subroutine that returns true if the row should
+be displayed. The row is a hash reference passed to your subroutine as $set.
+
+For example, imagine you want to filter the processlist table so you only see
+queries that have been running more than five minutes. Type a new name for your
+filter, and when prompted for the subroutine body, press TAB to initiate your
+terminal's auto-completion. You'll see the names of the columns in the
+L<"processlist"> table (innotop generally tries to help you with auto-completion
+lists). You want to filter on the 'time' column. Type the text "$set->{time} >
+300" to return true when the query is more than five minutes old. That's all
+you need to do.
+
+In other words, the code you're typing is surrounded by an implicit context,
+which looks like this:
+
+ sub filter {
+ my ( $set ) = @_;
+ # YOUR CODE HERE
+ }
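+
+So, for the five-minute example above, the complete compiled filter would look
+roughly like this (a sketch):
+
+ sub filter {
+    my ( $set ) = @_;
+    return $set->{time} > 300;
+ }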
+
+If your filter doesn't work, or if something else suddenly behaves differently,
+you might have made an error in your filter, and innotop is silently catching
+the error. Try enabling L<"debug"> to make innotop throw an error instead.
+
+=head3 QUICK-FILTERS
+
+innotop's quick-filters are a shortcut to create a temporary filter that doesn't
+persist when you restart innotop. To create a quick-filter, press the '/' key.
+innotop will prompt you for the column name and filter text. Again, you can use
+auto-completion on column names. The filter text can be just the text you want
+to "search for." For example, to filter the L<"processlist"> table on queries
+that refer to the products table, type '/' and then 'info product'.
+
+The filter text can actually be any Perl regular expression, but of course a
+literal string like 'product' works fine as a regular expression.
+
+Behind the scenes innotop compiles the quick-filter into a specially tagged
+filter that is otherwise like any other filter. It just isn't saved to the
+configuration file.
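+
+For example, the 'info product' quick-filter above behaves roughly like the
+following ordinary filter (a sketch; the compiled version is tagged internally
+so innotop knows not to save it):
+
+ sub filter {
+    my ( $set ) = @_;
+    return ( $set->{info} || '' ) =~ m/product/;
+ }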
+
+To clear quick-filters, press the '\' key and innotop will clear them all at
+once.
+
+=head2 SORTING
+
+innotop has sensible built-in defaults to sort the most important rows to the
+top of the table. Like anything else in innotop, you can customize how any
+table is sorted.
+
+To start the sort dialog, start the L<"TABLE EDITOR"> with the '^' key, choose a
+table if necessary, and press the 's' key. You'll see a list of columns you can
+use in the sort expression and the current sort expression, if any. Enter a
+list of columns by which you want to sort and press Enter. If you want to
+reverse sort, prefix the column name with a minus sign. For example, if you
+want to sort by column a ascending, then column b descending, type 'a -b'. You
+can also explicitly add a + in front of columns you want to sort ascending, but
+it's not required.
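+
+Conceptually, the sort expression 'a -b' behaves like the following Perl
+comparison (an illustrative sketch assuming both columns are numeric; innotop
+compiles the real comparison for you from the column definitions):
+
+ # @rows holds the table's row hashrefs
+ my @sorted = sort {
+       $a->{a} <=> $b->{a}    # column a, ascending
+    || $b->{b} <=> $a->{b}    # column b, descending
+ } @rows;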
+
+Some modes have keys mapped to open this dialog directly, and to quickly reverse
+sort direction. Press '?' as usual to see which keys are mapped in any mode.
+
+=head2 GROUPING
+
+innotop can group, or aggregate, rows together (I use the terms
+interchangeably). This is quite similar to an SQL GROUP BY clause. You can
+group on certain columns, or if you don't specify any, the entire set
+of rows is treated as one group. This is quite like SQL so far, but unlike SQL,
+you can also select un-grouped columns. innotop actually aggregates every
+column. If you don't explicitly specify a grouping function, the default is
+'first'. This is basically a convenience so you don't have to specify an
+aggregate function for every column you want in the result.
+
+You can quickly toggle grouping on a table with the '=' key, which toggles its
+aggregate property. This property doesn't persist to the config file.
+
+The columns by which the table is grouped are specified in its group_by
+property. When you turn grouping on, innotop places the group_by columns at the
+far left of the table, even if they're not supposed to be visible. The rest of
+the visible columns appear in order after them.
+
+Two tables have default group_by lists and a count column built in:
+L<"processlist"> and L<"innodb_transactions">. The grouping is by connection
+and status, so you can quickly see how many queries or transactions are in a
+given status on each server you're monitoring. The time columns are aggregated
+as a sum; other columns are left at the default 'first' aggregation.
+
+By default, the table shown in L<"S: Variables & Status"> mode also uses
+grouping so you can monitor variables and status across many servers. The
+default aggregation function in this mode is 'avg'.
+
+Valid grouping functions are defined in the %agg_funcs hash. They include
+
+=over
+
+=item first
+
+Returns the first element in the group.
+
+=item count
+
+Returns the number of elements in the group, including undefined elements, much
+like SQL's COUNT(*).
+
+=item avg
+
+Returns the average of defined elements in the group.
+
+=item sum
+
+Returns the sum of elements in the group.
+
+=back
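+
+Each entry in %agg_funcs is a subroutine that receives the group's values and
+returns the aggregated value. As an illustrative sketch (the 'max' function
+shown here is hypothetical, not one of the built-ins, and the exact calling
+convention is an assumption), adding your own might look like this:
+
+ $agg_funcs{max} = sub {
+    my @vals = grep { defined } @_;
+    my $max;
+    foreach my $val ( @vals ) {
+       $max = $val if !defined $max || $val > $max;
+    }
+    return $max;
+ };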
+
+Here's an example of grouping at work. Suppose you have a very busy server with
+hundreds of open connections, and you want to see how many connections are in
+what status. Using the built-in grouping rules, you can press 'Q' to enter
+L<"Q: Query List"> mode. Press '=' to toggle grouping (if necessary, select the
+L<"processlist"> table when prompted).
+
+Your display might now look like the following:
+
+ Query List (? for help) localhost, 32:33, 0.11 QPS, 1 thd, 5.0.38-log
+
+ CXN Cmd Cnt ID User Host Time Query
+ localhost Query 49 12933 webusr localhost 19:38 SELECT * FROM
+ localhost Sending Da 23 2383 webusr localhost 12:43 SELECT col1,
+ localhost Sleep 120 140 webusr localhost 5:18:12
+ localhost Statistics 12 19213 webusr localhost 01:19 SELECT * FROM
+
+That's actually quite a worrisome picture. You've got a lot of idle connections
+(Sleep), and some connections executing queries (Query and Sending Data).
+That's okay, but you also have a lot in Statistics status, collectively spending
+over a minute. That means the query optimizer is having a really hard time
+optimizing your statements. Something is wrong; it should normally take
+milliseconds to optimize queries. You might not have seen this pattern if you
+didn't look at your connections in aggregate. (This is a made-up example, but
+it can happen in real life).
+
+=head2 PIVOTING
+
+innotop can pivot a table for more compact display, similar to a Pivot Table in
+a spreadsheet (also known as a crosstab). Pivoting a table makes columns into
+rows. Assume you start with this table:
+
+ foo bar
+ === ===
+ 1 3
+ 2 4
+
+After pivoting, the table will look like this:
+
+ name set0 set1
+ ==== ==== ====
+ foo 1 2
+ bar 3 4
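+
+In Perl terms, the pivot shown above amounts to something like the following
+illustrative sketch (not innotop's actual implementation):
+
+ my @rows = ( { foo => 1, bar => 3 }, { foo => 2, bar => 4 } );
+ my @pivoted;
+ foreach my $col ( qw(foo bar) ) {
+    my %new_row = ( name => $col );
+    $new_row{"set$_"} = $rows[$_]->{$col} for 0 .. $#rows;
+    push @pivoted, \%new_row;
+ }
+ # @pivoted now holds { name => 'foo', set0 => 1, set1 => 2 } and
+ # { name => 'bar', set0 => 3, set1 => 4 }, matching the table above.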
+
+To get reasonable results, you might need to group as well as pivot.
+innotop currently does this for L<"S: Variables & Status"> mode.
+
+=head2 COLORS
+
+By default, innotop highlights rows with color so you can see at a glance which
+rows are more important. You can customize the colorization rules and add your
+own to any table. Open the table editor with the '^' key, choose a table if
+needed, and press 'o' to open the color editor dialog.
+
+The color editor dialog displays the rules applied to the table, in the order
+they are evaluated. Each row is evaluated against each rule to see if the rule
+matches the row; if it does, the row gets the specified color, and no further
+rules are evaluated. The rules look like the following:
+
+ state eq Locked black on_red
+ cmd eq Sleep white
+ user eq system user white
+ cmd eq Connect white
+ cmd eq Binlog Dump white
+ time > 600 red
+ time > 120 yellow
+ time > 60 green
+ time > 30 cyan
+
+This is the default rule set for the L<"processlist"> table. In order of
+priority, these rules make locked queries black on a red background, "gray out"
+connections from replication and sleeping queries, and make queries turn from
+cyan to red as they run longer.
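+
+Conceptually, each rule is a small check against the row, evaluated in order.
+The rule set above behaves roughly like this sketch (not innotop's actual
+code; some rules are omitted for brevity):
+
+ sub color_for_row {
+    my ( $row ) = @_;
+    return 'black on_red' if ( $row->{state} || '' ) eq 'Locked';
+    return 'white'        if ( $row->{cmd}   || '' ) eq 'Sleep';
+    return 'white'        if ( $row->{user}  || '' ) eq 'system user';
+    return 'red'          if ( $row->{time}  || 0 ) > 600;
+    return 'yellow'       if ( $row->{time}  || 0 ) > 120;
+    return;               # no rule matched; use the default color
+ }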
+
+(For some reason, the ANSI color code "white" is actually a light gray. Your
+terminal's display may vary; experiment to find colors you like).
+
+You can use keystrokes to move the rules up and down, which re-orders their
+priority. You can also delete rules and add new ones. If you add a new rule,
+innotop prompts you for the column, an operator for the comparison, a value
+against which to compare the column, and a color to assign if the rule matches.
+There is auto-completion and prompting at each step.
+
+The value in the third step needs to be correctly quoted. innotop does not try
+to quote the value because it doesn't know whether it should treat the value as
+a string or a number. If you want to compare the column against a string, as
+for example in the first rule above, you should enter 'Locked' surrounded by
+quotes. If you get an error message about a bareword, you probably should have
+quoted something.
+
+=head2 EXPRESSIONS
+
+Expressions are at the core of how innotop works, and are what enables you to
+extend innotop as you wish. Recall the table lifecycle explained in
+L<"TABLES">. Expressions are used in the earliest step, where it extracts
+values from a data source to form rows.
+
+It does this by calling a subroutine for each column, passing it the source data
+set, a set of current values, and a set of previous values. These are all
+needed so the subroutine can calculate things like the difference between this
+tick and the previous tick.
+
+The subroutines that extract the data from the set are compiled from
+expressions. This gives significantly more power than just naming the values to
+fill the columns, because it allows the column's value to be calculated from
+whatever data is necessary, but avoids the need to write complicated and lengthy
+Perl code.
+
+innotop begins with a string of text that can look as simple as a value's name
+or as complicated as a full-fledged Perl expression. It looks at each
+'bareword' token in the string and decides whether it's supposed to be a key
+into the $set hash. A bareword is an unquoted value that isn't already
+surrounded by code-ish things like dollar signs or curly brackets. If innotop
+decides that the bareword isn't a function or other valid Perl code, it converts
+it into a hash access. After the whole string is processed, innotop compiles a
+subroutine, like this:
+
+ sub compute_column_value {
+ my ( $set, $cur, $pre ) = @_;
+ my $val = # EXPANDED STRING GOES HERE
+ return $val;
+ }
+
+Here's a concrete example, taken from the header table L<"q_header"> in L<"Q:
+Query List"> mode. This expression calculates the qps, or Queries Per Second,
+column's values, from the values returned by SHOW STATUS:
+
+ Questions/Uptime_hires
+
+innotop decides both words are barewords, and transforms this expression into
+the following Perl code:
+
+ $set->{Questions}/$set->{Uptime_hires}
+
+When surrounded by the rest of the subroutine's code, this is executable Perl
+that calculates a high-resolution queries-per-second value.
+
+The arguments to the subroutine are named $set, $cur, and $pre. In most cases,
+$set and $cur will be the same values. However, if L<"status_inc"> is set, $cur
+will not be the same as $set, because $set will already contain values that are
+the incremental difference between $cur and $pre.
+
+Every column in innotop is computed by subroutines compiled in the same fashion.
+There is no difference between innotop's built-in columns and user-defined
+columns. This keeps things consistent and predictable.
+
+=head2 TRANSFORMATIONS
+
+Transformations change how a value is rendered. For example, they can take a
+number of seconds and display it in H:M:S format. The following transformations
+are defined:
+
+=over
+
+=item commify
+
+Adds commas to large numbers as a thousands separator, every three digits.
+
+=item dulint_to_int
+
+Accepts two unsigned integers and converts them into a single longlong. This is
+useful for certain operations with InnoDB, which uses two integers as
+transaction identifiers, for example.
+
+=item no_ctrl_char
+
+Removes quoted control characters from the value. This is affected by the
+L<"charset"> configuration variable.
+
+This transformation only operates within quoted strings, for example, values to
+a SET clause in an UPDATE statement. It will not alter the UPDATE statement,
+but will collapse the quoted string to [BINARY] or [TEXT], depending on the
+charset.
+
+=item percent
+
+Converts a number to a percentage by multiplying it by one hundred, formatting
+it with L<"num_digits"> digits after the decimal point, and optionally adding
+a percent sign (see L<"show_percent">).
+
+=item secs_to_time
+
+Formats a number of seconds as time in days+hours:minutes:seconds format.
+
+=item set_precision
+
+Formats numbers with L<"num_digits"> digits after the decimal point.
+
+=item shorten
+
+Formats a number in units of 1024 (k/M/G/T), with L<"num_digits"> digits after
+the decimal point.
+
+=back
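+
+Each transformation is simply a function that accepts a value and returns the
+rendered form. As a sketch, a hypothetical 'dash_if_empty' transformation (the
+name and exact calling convention are illustrative assumptions, not part of
+the built-in set above) might look like this:
+
+ $trans_funcs{dash_if_empty} = sub {
+    my ( $val ) = @_;
+    return ( defined $val && length $val ) ? $val : '-';
+ };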
+
+=head2 TABLE EDITOR
+
+The innotop table editor lets you customize tables with keystrokes. You start
+the table editor with the '^' key. If there's more than one table on the
+screen, it will prompt you to choose one of them. Once you do, innotop will
+show you something like this:
+
+ Editing table definition for Buffer Pool. Press ? for help, q to quit.
+
+ name hdr label src
+ cxn CXN Connection from which cxn
+ buf_pool_size Size Buffer pool size IB_bp_buf_poo
+ buf_free Free Bufs Buffers free in the b IB_bp_buf_fre
+ pages_total Pages Pages total IB_bp_pages_t
+ pages_modified Dirty Pages Pages modified (dirty IB_bp_pages_m
+ buf_pool_hit_rate Hit Rate Buffer pool hit rate IB_bp_buf_poo
+ total_mem_alloc Memory Total memory allocate IB_bp_total_m
+ add_pool_alloc Add'l Pool Additonal pool alloca IB_bp_add_poo
+
+The first line shows which table you're editing, and reminds you again to press
+'?' for a list of key mappings. The rest is a tabular representation of the
+table's columns, because that's likely what you're trying to edit. However, you
+can edit more than just the table's columns; this screen can start the filter
+editor, color rule editor, and more.
+
+Each row in the display shows a single column in the table you're editing, along
+with a couple of its properties such as its header and source expression (see
+L<"EXPRESSIONS">).
+
+The key mappings are Vim-style, as in many other places. Pressing 'j' and 'k'
+moves the highlight up or down. You can then (d)elete or (e)dit the highlighted
+column. You can also (a)dd a column to the table. This actually just activates
+one of the columns already defined for the table; it prompts you to choose from
+among the columns available but not currently displayed. Finally, you can
+re-order the columns with the '+' and '-' keys.
+
+You can do more than just edit the columns with the table editor; you can also
+edit other properties, such as the table's sort expression and group-by
+expression. Press '?' to see the full list, of course.
+
+If you want to really customize and create your own column, as opposed to just
+activating a built-in one that's not currently displayed, press the (n)ew key,
+and innotop will prompt you for the information it needs:
+
+=over
+
+=item *
+
+The column name: this needs to be a word without any funny characters, e.g. just
+letters, numbers and underscores.
+
+=item *
+
+The column header: this is the label that appears at the top of the column, in
+the table header. This can have spaces and funny characters, but be careful not
+to make it too wide and waste space on-screen.
+
+=item *
+
+The column's data source: this is an expression that determines what data from
+the source (see L<"TABLES">) innotop will put into the column. This can just be
+the name of an item in the source, or it can be a more complex expression, as
+described in L<"EXPRESSIONS">.
+
+=back
+
+Once you've entered the required data, your table has a new column. There is no
+difference between this column and the built-in ones; it can have all the same
+properties and behaviors. innotop will write the column's definition to the
+configuration file, so it will persist across sessions.
+
+Here's an example: suppose you want to track how many times your slaves have
+retried transactions. According to the MySQL manual, the
+Slave_retried_transactions status variable gives you that data: "The total
+number of times since startup that the replication slave SQL thread has retried
+transactions. This variable was added in version 5.0.4." This is appropriate to
+add to the L<"slave_sql_status"> table.
+
+To add the column, switch to the replication-monitoring mode with the 'M' key,
+and press the '^' key to start the table editor. When prompted, choose
+slave_sql_status as the table, then press 'n' to create the column. Type
+'retries' as the column name, 'Retries' as the column header, and
+'Slave_retried_transactions' as the source. Now the column is created, and you
+see the table editor screen again. Press 'q' to exit the table editor, and
+you'll see your column at the end of the table.
+
+=head1 VARIABLE SETS
+
+Variable sets are used in L<"S: Variables & Status"> mode to define more easily
+what variables you want to monitor. Behind the scenes they are compiled to a
+list of expressions, and then into a column list so they can be treated just
+like columns in any other table, in terms of data extraction and
+transformations. However, you're protected from the tedious details by a syntax
+that ought to feel very natural to you: a SQL SELECT list.
+
+The data source for variable sets, and indeed the entire S mode, is the
+combination of SHOW STATUS, SHOW VARIABLES, and SHOW INNODB STATUS. Imagine
+that you had a huge table with one column per variable returned from those
+statements. That's the data source for variable sets. You can now query this
+data source just like you'd expect. For example:
+
+ Questions, Uptime, Questions/Uptime as QPS
+
+Behind the scenes innotop will split that variable set into three expressions,
+compile them and turn them into a table definition, then extract as usual. This
+becomes a "variable set," or a "list of variables you want to monitor."
+
+innotop lets you name and save your variable sets, and writes them to the
+configuration file. You can choose which variable set you want to see with the
+'c' key, or activate the next and previous sets with the '>' and '<' keys.
+There are many built-in variable sets as well, which should give you a good
+start for creating your own. Press 'e' to edit the current variable set, or
+just to see how it's defined. To create a new one, just press 'c' and type its
+name.
+
+You may want to use some of the functions listed in L<"TRANSFORMATIONS"> to help
+format the results. In particular, L<"set_precision"> is often useful to limit
+the number of digits you see. Extending the above example, here's how:
+
+ Questions, Uptime, set_precision(Questions/Uptime) as QPS
+
+Actually, this still needs a little more work. If your L<"interval"> is less
+than one second, you might be dividing by zero because Uptime is incremental in
+this mode by default. Instead, use Uptime_hires:
+
+ Questions, Uptime, set_precision(Questions/Uptime_hires) as QPS
+
+This example is simple, but it shows how easy it is to choose which variables
+you want to monitor.
+
+=head1 PLUGINS
+
+innotop has a simple but powerful plugin mechanism by which you can extend
+or modify its existing functionality, and add new functionality. innotop's
+plugin functionality is event-based: plugins register themselves to be called
+when events happen. They then have a chance to influence the event.
+
+An innotop plugin is a Perl module placed in innotop's L<"plugin_dir">
+directory. On UNIX systems, you can place a symbolic link to the module instead
+of putting the actual file there. innotop automatically discovers the file. If
+there is a corresponding entry in the L<"plugins"> configuration file section,
+innotop loads and activates the plugin.
+
+The module must conform to innotop's plugin interface. Additionally, the source
+code of the module must be written in such a way that innotop can inspect the
+file and determine the package name and description.
+
+=head2 Package Source Convention
+
+innotop inspects the plugin module's source to determine the Perl package name.
+It looks for a line of the form "package Foo;" and if found, considers the
+plugin's package name to be Foo. Of course the package name can be any valid
+Perl package name, with double colons (::) and so on.
+
+It also looks for a description in the source code, to make the plugin editor
+more human-friendly. The description is a comment line of the form "#
+description: Foo", where "Foo" is the text innotop will consider to be the
+plugin's description.
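+
+For example, the plugin shown later in L<"Simple Plugin Example"> begins with
+exactly these two pieces of information:
+
+ package Innotop::Plugin::Example;
+ # description: Adds an 'example' column to every table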
+
+=head2 Plugin Interface
+
+The innotop plugin interface is quite simple: innotop expects the plugin to be
+an object-oriented module it can call certain methods on. The methods are
+
+=over
+
+=item new(%variables)
+
+This is the plugin's constructor. It is passed a hash of innotop's variables,
+which it can manipulate (see L<"Plugin Variables">). It must return a reference
+to the newly created plugin object.
+
+At construction time, innotop has only loaded the general configuration and
+created the default built-in variables with their default contents (which is
+quite a lot). Therefore, the state of the program is exactly as in the innotop
+source code, plus the configuration variables from the L<"general"> section in
+the config file.
+
+If your plugin manipulates the variables, it is changing global data, which is
+shared by innotop and all plugins. Plugins are loaded in the order they're
+listed in the config file. Your plugin may load before or after another plugin,
+so there is a potential for conflict or interaction between plugins if they
+modify data other plugins use or modify.
+
+=item register_for_events()
+
+This method must return a list of events in which the plugin is interested, if
+any. See L<"Plugin Events"> for the defined events. If the plugin returns an
+event that's not defined, the event is ignored.
+
+=item event handlers
+
+The plugin must implement a method named the same as each event for which it has
+registered. In other words, if the plugin returns qw(foo bar) from
+register_for_events(), it must have foo() and bar() methods. These methods are
+callbacks for the events. See L<"Plugin Events"> for more details about each
+event.
+
+=back
+
+=head2 Plugin Variables
+
+The plugin's constructor is passed a hash of innotop's variables, which it can
+manipulate. It is probably a good idea if the plugin object saves a copy of it
+for later use. The variables are defined in the innotop variable
+%pluggable_vars, and are as follows:
+
+=over
+
+=item action_for
+
+A hashref of key mappings. These are innotop's global hot-keys.
+
+=item agg_funcs
+
+A hashref of functions that can be used for grouping. See L<"GROUPING">.
+
+=item config
+
+The global configuration hash.
+
+=item connections
+
+A hashref of connection specifications. These are just specifications of how to
+connect to a server.
+
+=item dbhs
+
+A hashref of innotop's database connections. These are actual DBI connection
+objects.
+
+=item filters
+
+A hashref of filters applied to table rows. See L<"FILTERS"> for more.
+
+=item modes
+
+A hashref of modes. See L<"MODES"> for more.
+
+=item server_groups
+
+A hashref of server groups. See L<"SERVER GROUPS">.
+
+=item tbl_meta
+
+A hashref of innotop's table meta-data, with one entry per table (see
+L<"TABLES"> for more information).
+
+=item trans_funcs
+
+A hashref of transformation functions. See L<"TRANSFORMATIONS">.
+
+=item var_sets
+
+A hashref of variable sets. See L<"VARIABLE SETS">.
+
+=back
+
+=head2 Plugin Events
+
+Each event is defined somewhere in the innotop source code. When innotop runs
+that code, it executes the callback function for each plugin that expressed its
+interest in the event. innotop passes some data for each event. The events are
+defined in the %event_listener_for variable, and are as follows:
+
+=over
+
+=item extract_values($set, $cur, $pre, $tbl)
+
+This event occurs inside the function that extracts values from a data source.
+The arguments are the set of values, the current values, the previous values,
+and the table name.
+
+=item set_to_tbl
+
+Events are defined at many places in this subroutine, which is responsible for
+turning an arrayref of hashrefs into an arrayref of lines that can be printed to
+the screen. The events all pass the same data: an arrayref of rows and the name
+of the table being created. The events are set_to_tbl_pre_filter,
+set_to_tbl_pre_sort, set_to_tbl_pre_group, set_to_tbl_pre_colorize,
+set_to_tbl_pre_transform, set_to_tbl_pre_pivot, set_to_tbl_pre_create, and
+set_to_tbl_post_create.
+
+=item draw_screen($lines)
+
+This event occurs inside the subroutine that prints the lines to the screen.
+$lines is an arrayref of strings.
+
+=back
+
+=head2 Simple Plugin Example
+
+The easiest way to explain the plugin functionality is probably with a simple
+example. The following module adds a column to the beginning of every table and
+sets its value to 1.
+
+ use strict;
+ use warnings FATAL => 'all';
+
+ package Innotop::Plugin::Example;
+ # description: Adds an 'example' column to every table
+
+ sub new {
+ my ( $class, %vars ) = @_;
+ # Store reference to innotop's variables in $self
+ my $self = bless { %vars }, $class;
+
+ # Design the example column
+ my $col = {
+ hdr => 'Example',
+ just => '',
+ dec => 0,
+ num => 1,
+ label => 'Example',
+ src => 'example', # Get data from this column in the data source
+ tbl => '',
+ trans => [],
+ };
+
+ # Add the column to every table.
+ my $tbl_meta = $vars{tbl_meta};
+ foreach my $tbl ( values %$tbl_meta ) {
+ # Add the column to the list of defined columns
+ $tbl->{cols}->{example} = $col;
+ # Add the column to the list of visible columns
+ unshift @{$tbl->{visible}}, 'example';
+ }
+
+ # Be sure to return a reference to the object.
+ return $self;
+ }
+
+ # I'd like to be called when a data set is being rendered into a table, please.
+ sub register_for_events {
+ my ( $self ) = @_;
+ return qw(set_to_tbl_pre_filter);
+ }
+
+ # This method will be called when the event fires.
+ sub set_to_tbl_pre_filter {
+ my ( $self, $rows, $tbl ) = @_;
+ # Set the example column's data source to the value 1.
+ foreach my $row ( @$rows ) {
+ $row->{example} = 1;
+ }
+ }
+
+ 1;
+
+=head2 Plugin Editor
+
+The plugin editor lets you view the plugins innotop discovered and activate or
+deactivate them. Start the editor by pressing $ to start the configuration
+editor from any mode. Press the 'p' key to start the plugin editor. You'll see
+a list of plugins innotop discovered. You can use the 'j' and 'k' keys to move
+the highlight to the desired one, then press the * key to toggle it active or
+inactive. Exit the editor and restart innotop for the changes to take effect.
+
+=head1 SQL STATEMENTS
+
+innotop uses a limited set of SQL statements to retrieve data from MySQL for
+display. The statements are customized depending on the server version against
+which they are executed; for example, on MySQL 5 and newer, INNODB_STATUS
+executes "SHOW ENGINE INNODB STATUS", while on earlier versions it executes
+"SHOW INNODB STATUS". The statements are as follows:
+
+ Statement SQL executed
+ =================== ===============================
+ INNODB_STATUS SHOW [ENGINE] INNODB STATUS
+ KILL_CONNECTION KILL
+ KILL_QUERY KILL QUERY
+ OPEN_TABLES SHOW OPEN TABLES
+ PROCESSLIST SHOW FULL PROCESSLIST
+ SHOW_MASTER_LOGS SHOW MASTER LOGS
+ SHOW_MASTER_STATUS SHOW MASTER STATUS
+ SHOW_SLAVE_STATUS SHOW SLAVE STATUS
+ SHOW_STATUS SHOW [GLOBAL] STATUS
+ SHOW_VARIABLES SHOW [GLOBAL] VARIABLES
+
+=head1 DATA SOURCES
+
+Each time innotop extracts values to create a table (see L<"EXPRESSIONS"> and
+L<"TABLES">), it does so from a particular data source. Largely because of the
+complex data extracted from SHOW INNODB STATUS, this is slightly messy. SHOW
+INNODB STATUS contains a mixture of single values and repeated values that form
+nested data sets.
+
+Whenever innotop fetches data from MySQL, it adds two extra bits to each set:
+cxn and Uptime_hires. cxn is the name of the connection from which the data
+came. Uptime_hires is a high-resolution version of the server's Uptime status
+variable, which is important if your L<"interval"> setting is sub-second.
+
+Here are the kinds of data sources from which data is extracted:
+
+=over
+
+=item STATUS_VARIABLES
+
+This is the broadest category, into which the most kinds of data fall. It
+begins with the combination of SHOW STATUS and SHOW VARIABLES, but other sources
+may be included as needed, for example, SHOW MASTER STATUS and SHOW SLAVE
+STATUS, as well as many of the non-repeated values from SHOW INNODB STATUS.
+
+=item DEADLOCK_LOCKS
+
+This data is extracted from the transaction list in the LATEST DETECTED DEADLOCK
+section of SHOW INNODB STATUS. It is nested two levels deep: transactions, then
+locks.
+
+=item DEADLOCK_TRANSACTIONS
+
+This data is from the transaction list in the LATEST DETECTED DEADLOCK
+section of SHOW INNODB STATUS. It is nested one level deep.
+
+=item EXPLAIN
+
+This data is from the result set returned by EXPLAIN.
+
+=item INNODB_TRANSACTIONS
+
+This data is from the TRANSACTIONS section of SHOW INNODB STATUS.
+
+=item IO_THREADS
+
+This data is from the list of threads in the FILE I/O section of SHOW INNODB
+STATUS.
+
+=item INNODB_LOCKS
+
+This data is from the TRANSACTIONS section of SHOW INNODB STATUS and is nested
+two levels deep.
+
+=item OPEN_TABLES
+
+This data is from SHOW OPEN TABLES.
+
+=item PROCESSLIST
+
+This data is from SHOW FULL PROCESSLIST.
+
+=item OS_WAIT_ARRAY
+
+This data is from the SEMAPHORES section of SHOW INNODB STATUS and is nested one
+level deep. It comes from the lines that look like this:
+
+ --Thread 1568861104 has waited at btr0cur.c line 424 ....
+
+=back
+
+=head1 MYSQL PRIVILEGES
+
+=over
+
+=item *
+
+You must connect to MySQL as a user who has the SUPER privilege for many of the
+functions.
+
+=item *
+
+If you don't have the SUPER privilege, you can still run some functions, but you
+won't necessarily see all the same data.
+
+=item *
+
+You need the PROCESS privilege to see the list of currently running queries in Q
+mode.
+
+=item *
+
+You need special privileges to start and stop slave servers.
+
+=item *
+
+You need appropriate privileges to create and drop the deadlock tables if needed
+(see L<"SERVER CONNECTIONS">).
+
+=back
+
+=head1 SYSTEM REQUIREMENTS
+
+You need Perl to run innotop, of course. You also need a few Perl modules: DBI,
+DBD::mysql, Term::ReadKey, and Time::HiRes. These should be included with most
+Perl distributions; if they are not, I recommend installing versions packaged
+for your operating system rather than installing from CPAN. Term::ReadKey in
+particular has been known to cause problems if installed from CPAN.
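+
+If you're not sure whether the modules are installed, a tiny test script like
+the following will tell you (this is just a quick check, not part of innotop;
+a "Can't locate" error means the named module is missing):
+
+ use DBI;
+ use DBD::mysql;
+ use Term::ReadKey;
+ use Time::HiRes;
+ print "All required modules are installed.\n";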
+
+If you have Term::ANSIColor, innotop will use it to format headers more readably
+and compactly. (Under Microsoft Windows, you also need Win32::Console::ANSI for
+terminal formatting codes to be honored). If you install Term::ReadLine,
+preferably Term::ReadLine::Gnu, you'll get nice auto-completion support.
+
+I run innotop on Gentoo GNU/Linux, Debian and Ubuntu, and I've had feedback from
+people successfully running it on Red Hat, CentOS, Solaris, and Mac OSX. I
+don't see any reason why it won't work on other UNIX-ish operating systems, but
+I don't know for sure. It also runs on Windows under ActivePerl without
+problem.
+
+I use innotop on MySQL versions 3.23.58, 4.0.27, 4.1.0, 4.1.22, 5.0.26, 5.1.15,
+and 5.2.3. If it doesn't run correctly for you, that is a bug and I hope you
+report it.
+
+=head1 FILES
+
+$HOMEDIR/.innotop is used to store configuration information. Files include the
+configuration file innotop.ini, the core_dump file (which contains verbose
+error messages if L<"debug"> is enabled), and the plugins/ subdirectory.
+
+=head1 GLOSSARY OF TERMS
+
+=over
+
+=item tick
+
+A tick is a refresh event, when innotop re-fetches data from connections and
+displays it.
+
+=back
+
+=head1 ACKNOWLEDGEMENTS
+
+I'm grateful to the following people for various reasons, and hope I haven't
+forgotten to include anyone:
+
+Allen K. Smith,
+Aurimas Mikalauskas,
+Bartosz Fenski,
+Brian Miezejewski,
+Christian Hammers,
+Cyril Scetbon,
+Dane Miller,
+David Multer,
+Dr. Frank Ullrich,
+Giuseppe Maxia,
+Google.com Site Reliability Engineers,
+Jan Pieter Kunst,
+Jari Aalto,
+Jay Pipes,
+Jeremy Zawodny,
+Johan Idren,
+Kristian Kohntopp,
+Lenz Grimmer,
+Maciej Dobrzanski,
+Michiel Betel,
+MySQL AB,
+Paul McCullagh,
+Sebastien Estienne,
+Sourceforge.net,
+Steven Kreuzer,
+The Gentoo MySQL Team,
+Trevor Price,
+Yaar Schnitman,
+and probably more people I've neglected to include.
+
+(If I misspelled your name, it's probably because I'm afraid of putting
+international characters into this documentation; earlier versions of Perl might
+not be able to compile it then).
+
+=head1 COPYRIGHT, LICENSE AND WARRANTY
+
+This program is copyright (c) 2006 Baron Schwartz.
+Feedback and improvements are welcome.
+
+THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 2; OR the Perl Artistic License. On UNIX and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA.
+
+Execute innotop and press '!' to see this information at any time.
+
+=head1 AUTHOR
+
+Baron Schwartz.
+
+=head1 BUGS
+
+You can report bugs, ask for improvements, and get other help and support at
+L<http://sourceforge.net/projects/innotop>. There are mailing lists, forums,
+a bug tracker, etc. Please use these instead of contacting me directly, as it
+makes my job easier and benefits others if the discussions are permanent and
+public. Of course, if you need to contact me in private, please do.
+
+=cut
diff --git a/storage/xtradb/build/debian/additions/innotop/innotop.1 b/storage/xtradb/build/debian/additions/innotop/innotop.1
new file mode 100644
index 00000000000..ef708c3974c
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/innotop.1
@@ -0,0 +1,2086 @@
+.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sh \" Subsection heading
+.br
+.if t .Sp
+.ne 5
+.PP
+\fB\\$1\fR
+.PP
+..
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings. \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote. \*(C+ will
+.\" give a nicer C++. Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+. ds -- \(*W-
+. ds PI pi
+. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
+. ds L" ""
+. ds R" ""
+. ds C` ""
+. ds C' ""
+'br\}
+.el\{\
+. ds -- \|\(em\|
+. ds PI \(*p
+. ds L" ``
+. ds R" ''
+'br\}
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
+.\" entries marked with X<> in POD. Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.if \nF \{\
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
+..
+. nr % 0
+. rr F
+.\}
+.\"
+.\" For nroff, turn off justification. Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.hy 0
+.if n .na
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear. Run. Save yourself. No user-serviceable parts.
+. \" fudge factors for nroff and troff
+.if n \{\
+. ds #H 0
+. ds #V .8m
+. ds #F .3m
+. ds #[ \f1
+. ds #] \fP
+.\}
+.if t \{\
+. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+. ds #V .6m
+. ds #F 0
+. ds #[ \&
+. ds #] \&
+.\}
+. \" simple accents for nroff and troff
+.if n \{\
+. ds ' \&
+. ds ` \&
+. ds ^ \&
+. ds , \&
+. ds ~ ~
+. ds /
+.\}
+.if t \{\
+. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+. \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+. \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+. \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+. ds : e
+. ds 8 ss
+. ds o a
+. ds d- d\h'-1'\(ga
+. ds D- D\h'-1'\(hy
+. ds th \o'bp'
+. ds Th \o'LP'
+. ds ae ae
+. ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "INNOTOP 1p"
+.TH INNOTOP 1p "2007-11-09" "perl v5.8.8" "User Contributed Perl Documentation"
+.SH "NAME"
+innotop \- MySQL and InnoDB transaction/status monitor.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+To monitor servers normally:
+.PP
+.Vb 1
+\& innotop
+.Ve
+.PP
+To monitor InnoDB status information from a file:
+.PP
+.Vb 1
+\& innotop /var/log/mysql/mysqld.err
+.Ve
+.PP
+To run innotop non-interactively in a pipe-and-filter configuration:
+.PP
+.Vb 1
+\& innotop \-\-count 5 \-d 1 \-n
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+innotop monitors MySQL servers. Each of its modes shows you a different aspect
+of what's happening in the server. For example, there's a mode for monitoring
+replication, one for queries, and one for transactions. innotop refreshes its
+data periodically, so you see an updating view.
+.PP
+innotop has lots of features for power users, but you can start and run it with
+virtually no configuration. If you're just getting started, see
+\&\*(L"\s-1QUICK\-START\s0\*(R". Press '?' at any time while running innotop for
+context-sensitive help.
+.SH "QUICK-START"
+.IX Header "QUICK-START"
+To start innotop, open a terminal or command prompt. If you have installed
+innotop on your system, you should be able to just type \*(L"innotop\*(R" and press
+Enter; otherwise, you will need to change to innotop's directory and type \*(L"perl
+innotop\*(R".
+.PP
+The first thing innotop needs to know is how to connect to a MySQL server. You
+can just enter the hostname of the server, for example \*(L"localhost\*(R" or
+\&\*(L"127.0.0.1\*(R" if the server is on the same machine as innotop. After this innotop
+will prompt you for a \s-1DSN\s0 (data source name). You should be able to just accept
+the defaults by pressing Enter.
+.PP
+When innotop asks you about a table to use when resetting InnoDB deadlock
+information, just accept the default for now. This is an advanced feature you
+can configure later (see \*(L"D: InnoDB Deadlocks\*(R" for more).
+.PP
+If you have a .my.cnf file with your MySQL connection defaults, innotop can read
+it, and you won't need to specify a username and password if it's in that file.
+Otherwise, you should answer 'y' to the next couple of prompts.
+.PP
+After this, you should be connected, and innotop should show you something like
+the following:
+.PP
+.Vb 1
+\& InnoDB Txns (? for help) localhost, 01:11:19, InnoDB 10s :\-), 50 QPS,
+.Ve
+.PP
+.Vb 2
+\& CXN History Versions Undo Dirty Buf Used Bufs Txns MaxTxn
+\& localhost 7 2035 0 0 0.00% 92.19% 1 07:34
+.Ve
+.PP
+.Vb 5
+\& CXN ID User Host Txn Status Time Undo Query Tex
+\& localhost 98379 user1 webserver ACTIVE 07:34 0 SELECT `c
+\& localhost 98450 user1 webserver ACTIVE 01:06 0 INSERT IN
+\& localhost 97750 user1 webserver not starte 00:00 0
+\& localhost 98375 user1 appserver not starte 00:00 0
+.Ve
+.PP
+(This sample is truncated at the right so it will fit on a terminal when running
+\&'man innotop')
+.PP
+This sample comes from a quiet server with few transactions active. If your
+server is busy, you'll see more output. Notice the first line on the screen,
+which tells you what mode you're in and what server you're connected to. You
+can change to other modes with keystrokes; press 'Q' to switch to a list of
+currently running queries.
+.PP
+Press the '?' key to see what keys are active in the current mode. You can
+press any of these keys and innotop will either take the requested action or
+prompt you for more input. If your system has Term::ReadLine support, you can
+use \s-1TAB\s0 and other keys to auto-complete and edit input.
+.PP
+To quit innotop, press the 'q' key.
+.SH "OPTIONS"
+.IX Header "OPTIONS"
+innotop is mostly configured via its configuration file, but some of the
+configuration options can come from the command line. You can also specify a
+file to monitor for InnoDB status output; see \*(L"\s-1MONITORING\s0 A \s-1FILE\s0\*(R" for more
+details.
+.PP
+You can negate some options by prefixing the option name with \-\-no. For
+example, \-\-noinc (or \-\-no\-inc) negates \*(L"\-\-inc\*(R".
+.IP "\-\-help" 4
+.IX Item "--help"
+Print a summary of command-line usage and exit.
+.IP "\-\-color" 4
+.IX Item "--color"
+Enable or disable terminal coloring. Corresponds to the \*(L"color\*(R" config file
+setting.
+.IP "\-\-config" 4
+.IX Item "--config"
+Specifies a configuration file to read. This option is non\-sticky, that is to
+say it does not persist to the configuration file itself.
+.IP "\-\-nonint" 4
+.IX Item "--nonint"
+Enable non-interactive operation. See \*(L"\s-1NON\-INTERACTIVE\s0 \s-1OPERATION\s0\*(R" for more.
+.IP "\-\-count" 4
+.IX Item "--count"
+Refresh only the specified number of times (ticks) before exiting. Each refresh
+is a pause for \*(L"interval\*(R" seconds, followed by requesting data from MySQL
+connections and printing it to the terminal.
+.IP "\-\-delay" 4
+.IX Item "--delay"
+Specifies the amount of time to pause between ticks (refreshes). Corresponds to
+the configuration option \*(L"interval\*(R".
+.IP "\-\-mode" 4
+.IX Item "--mode"
+Specifies the mode in which innotop should start. Corresponds to the
+configuration option \*(L"mode\*(R".
+.IP "\-\-inc" 4
+.IX Item "--inc"
+Specifies whether innotop should display absolute numbers or relative numbers
+(offsets from their previous values). Corresponds to the configuration option
+\&\*(L"status_inc\*(R".
+.IP "\-\-version" 4
+.IX Item "--version"
+Output version information and exit.
+.SH "HOTKEYS"
+.IX Header "HOTKEYS"
+innotop is interactive, and you control it with key\-presses.
+.IP "\(bu" 4
+Uppercase keys switch between modes.
+.IP "\(bu" 4
+Lowercase keys initiate some action within the current mode.
+.IP "\(bu" 4
+Other keys do something special like change configuration or show the
+innotop license.
+.PP
+Press '?' at any time to see the currently active keys and what they do.
+.SH "MODES"
+.IX Header "MODES"
+Each of innotop's modes retrieves and displays a particular type of data from
+the servers you're monitoring. You switch between modes with uppercase keys.
+The following is a brief description of each mode, in alphabetical order. To
+switch to the mode, press the key listed in front of its heading in the
+following list:
+.IP "B: InnoDB Buffers" 4
+.IX Item "B: InnoDB Buffers"
+This mode displays information about the InnoDB buffer pool, page statistics,
+insert buffer, and adaptive hash index. The data comes from \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.Sp
+This mode contains the \*(L"buffer_pool\*(R", \*(L"page_statistics\*(R",
+\&\*(L"insert_buffers\*(R", and \*(L"adaptive_hash_index\*(R" tables by default.
+.IP "C: Command Summary" 4
+.IX Item "C: Command Summary"
+This mode is similar to mytop's Command Summary mode. It shows the
+\&\*(L"cmd_summary\*(R" table, which looks something like the following:
+.Sp
+.Vb 8
+\& Command Summary (? for help) localhost, 25+07:16:43, 2.45 QPS, 3 thd, 5.0.40
+\& _____________________ Command Summary _____________________
+\& Name Value Pct Last Incr Pct
+\& Select_scan 3244858 69.89% 2 100.00%
+\& Select_range 1354177 29.17% 0 0.00%
+\& Select_full_join 39479 0.85% 0 0.00%
+\& Select_full_range_join 4097 0.09% 0 0.00%
+\& Select_range_check 0 0.00% 0 0.00%
+.Ve
+.Sp
+The command summary table is built by extracting variables from
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R". The variables must be numeric and must match the prefix
+given by the \*(L"cmd_filter\*(R" configuration variable. The variables are then
+sorted by value descending and compared to the last variable, as shown above.
+The percentage columns are percentage of the total of all variables in the
+table, so you can see the relative weight of the variables.
+.Sp
+The example shows what you see if the prefix is \*(L"Select_\*(R". The default
+prefix is \*(L"Com_\*(R". You can choose a prefix with the 's' key.
+.Sp
+It's rather like running \s-1SHOW\s0 \s-1STATUS\s0 \s-1LIKE\s0 \*(L"prefix%\*(R" with memory and
+nice formatting.
+.Sp
+Values are aggregated across all servers. The Pct columns are not correctly
+aggregated across multiple servers. This is a known limitation of the grouping
+algorithm that may be fixed in the future.
+.IP "D: InnoDB Deadlocks" 4
+.IX Item "D: InnoDB Deadlocks"
+This mode shows the transactions involved in the last InnoDB deadlock. A second
+table shows the locks each transaction held and waited for. A deadlock is
+caused by a cycle in the waits-for graph, so there should be two locks held and
+one waited for unless the deadlock information is truncated.
+.Sp
+InnoDB puts deadlock information before some other information in the \s-1SHOW\s0
+\&\s-1INNODB\s0 \s-1STATUS\s0 output. If there are a lot of locks, the deadlock information can
+grow very large, and there is a limit on the size of the \s-1SHOW\s0 \s-1INNODB\s0
+\&\s-1STATUS\s0 output. A large deadlock can fill the entire output, or even be
+truncated, and prevent you from seeing other information at all. If you are
+running innotop in another mode, for example T mode, and suddenly you don't see
+anything, you might want to check and see if a deadlock has wiped out the data
+you need.
+.Sp
+If it has, you can create a small deadlock to replace the large one. Use the
+\&'w' key to 'wipe' the large deadlock with a small one. This will not work
+unless you have defined a deadlock table for the connection (see \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R").
+.Sp
+You can also configure innotop to automatically detect when a large deadlock
+needs to be replaced with a small one (see \*(L"auto_wipe_dl\*(R").
+.Sp
+This mode displays the \*(L"deadlock_transactions\*(R" and \*(L"deadlock_locks\*(R" tables
+by default.
+.IP "F: InnoDB Foreign Key Errors" 4
+.IX Item "F: InnoDB Foreign Key Errors"
+This mode shows the last InnoDB foreign key error information, such as the
+table where it happened, when and who and what query caused it, and so on.
+.Sp
+InnoDB has a huge variety of foreign key error messages, and many of them are
+just hard to parse. innotop doesn't always do the best job here, but there's
+so much code devoted to parsing this messy, unparseable output that innotop is
+likely never to be perfect in this regard. If innotop doesn't show you what
+you need to see, just look at the status text directly.
+.Sp
+This mode displays the \*(L"fk_error\*(R" table by default.
+.IP "I: InnoDB I/O Info" 4
+.IX Item "I: InnoDB I/O Info"
+This mode shows InnoDB's I/O statistics, including the I/O threads, pending I/O,
+file I/O miscellaneous, and log statistics. It displays the \*(L"io_threads\*(R",
+\&\*(L"pending_io\*(R", \*(L"file_io_misc\*(R", and \*(L"log_statistics\*(R" tables by default.
+.IP "L: Locks" 4
+.IX Item "L: Locks"
+This mode shows information about current locks. At the moment only InnoDB
+locks are supported, and by default you'll only see locks for which transactions
+are waiting. This information comes from the \s-1TRANSACTIONS\s0 section of the InnoDB
+status text. If you have a very busy server, you may have frequent lock waits;
+it helps to be able to see which tables and indexes are the \*(L"hot spot\*(R" for
+locks. If your server is running pretty well, this mode should show nothing.
+.Sp
+You can configure MySQL and innotop to monitor not only locks for which a
+transaction is waiting, but those currently held, too. You can do this with the
+InnoDB Lock Monitor (<http://dev.mysql.com/doc/en/innodb\-monitor.html>). It's
+not documented in the MySQL manual, but creating the lock monitor with the
+following statement also affects the output of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0, which innotop
+uses:
+.Sp
+.Vb 1
+\& CREATE TABLE innodb_lock_monitor(a int) ENGINE=INNODB;
+.Ve
+.Sp
+This causes InnoDB to print its output to the MySQL file every 16 seconds or so,
+as stated in the manual, but it also makes the normal \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 output
+include lock information, which innotop can parse and display (that's the
+undocumented feature).
+.Sp
+This means you can do what may have seemed impossible: to a limited extent
+(InnoDB truncates some information in the output), you can see which transaction
+holds the locks something else is waiting for. You can also enable and disable
+the InnoDB Lock Monitor with the key mappings in this mode.
+.Sp
+This mode displays the \*(L"innodb_locks\*(R" table by default. Here's a sample of
+the screen when one connection is waiting for locks another connection holds:
+.Sp
+.Vb 7
+\& _________________________________ InnoDB Locks __________________________
+\& CXN ID Type Waiting Wait Active Mode DB Table Index
+\& localhost 12 RECORD 1 00:10 00:10 X test t1 PRIMARY
+\& localhost 12 TABLE 0 00:10 00:10 IX test t1
+\& localhost 12 RECORD 1 00:10 00:10 X test t1 PRIMARY
+\& localhost 11 TABLE 0 00:00 00:25 IX test t1
+\& localhost 11 RECORD 0 00:00 00:25 X test t1 PRIMARY
+.Ve
+.Sp
+You can see the first connection, \s-1ID\s0 12, is waiting for a lock on the \s-1PRIMARY\s0
+key on test.t1, and has been waiting for 10 seconds. The second connection
+isn't waiting, because the Waiting column is 0, but it holds locks on the same
+index. That tells you connection 11 is blocking connection 12.
+.IP "M: Master/Slave Replication Status" 4
+.IX Item "M: Master/Slave Replication Status"
+This mode shows the output of \s-1SHOW\s0 \s-1SLAVE\s0 \s-1STATUS\s0 and \s-1SHOW\s0 \s-1MASTER\s0 \s-1STATUS\s0 in three
+tables. The first two divide the slave's status into \s-1SQL\s0 and I/O thread status,
+and the last shows master status. Filters are applied to eliminate non-slave
+servers from the slave tables, and non-master servers from the master table.
+.Sp
+This mode displays the \*(L"slave_sql_status\*(R", \*(L"slave_io_status\*(R", and
+\&\*(L"master_status\*(R" tables by default.
+.IP "O: Open Tables" 4
+.IX Item "O: Open Tables"
+This section comes from MySQL's \s-1SHOW\s0 \s-1OPEN\s0 \s-1TABLES\s0 command. By default it is
+filtered to show tables which are in use by one or more queries, so you can
+get a quick look at which tables are 'hot'. You can use this to guess which
+tables might be locked implicitly.
+.Sp
+This mode displays the \*(L"open_tables\*(R" table by default.
+.IP "Q: Query List" 4
+.IX Item "Q: Query List"
+This mode displays the output from \s-1SHOW\s0 \s-1FULL\s0 \s-1PROCESSLIST\s0, much like \fBmytop\fR's
+query list mode. This mode does \fBnot\fR show InnoDB-related information. This
+is probably one of the most useful modes for general usage.
+.Sp
+There is an informative header that shows general status information about
+your server. You can toggle it on and off with the 'h' key. By default,
+innotop hides inactive processes and its own process. You can toggle these on
+and off with the 'i' and 'a' keys.
+.Sp
+You can \s-1EXPLAIN\s0 a query from this mode with the 'e' key. This displays the
+query's full text, the results of \s-1EXPLAIN\s0, and in newer MySQL versions, even
+the optimized query resulting from \s-1EXPLAIN\s0 \s-1EXTENDED\s0. innotop also tries to
+rewrite certain queries to make them EXPLAIN\-able. For example, \s-1INSERT/SELECT\s0
+statements are rewritable.
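+.Sp
+The rewriting is internal to innotop, but the idea is roughly this: given a
+statement like the following (the table names are invented for this example),
+.Sp
+.Vb 1
+\& INSERT INTO t_archive SELECT * FROM t_live WHERE id < 100
+.Ve
+.Sp
+innotop can \s-1EXPLAIN\s0 just the \s-1SELECT\s0 part:
+.Sp
+.Vb 1
+\& SELECT * FROM t_live WHERE id < 100
+.Ve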
+.Sp
+This mode displays the \*(L"q_header\*(R" and \*(L"processlist\*(R" tables by default.
+.IP "R: InnoDB Row Operations and Semaphores" 4
+.IX Item "R: InnoDB Row Operations and Semaphores"
+This mode shows InnoDB row operations, row operation miscellaneous, semaphores,
+and information from the wait array. It displays the \*(L"row_operations\*(R",
+\&\*(L"row_operation_misc\*(R", \*(L"semaphores\*(R", and \*(L"wait_array\*(R" tables by default.
+.IP "S: Variables & Status" 4
+.IX Item "S: Variables & Status"
+This mode calculates statistics, such as queries per second, and prints them out
+in several different styles. You can show absolute values, or incremental values
+between ticks.
+.Sp
+You can switch between the views by pressing a key. The 's' key prints a
+single line each time the screen updates, in the style of \fBvmstat\fR. The 'g'
+key changes the view to a graph of the same numbers, sort of like \fBtload\fR.
+The 'v' key changes the view to a pivoted table of variable names on the left,
+with successive updates scrolling across the screen from left to right. You can
+choose how many updates to put on the screen with the \*(L"num_status_sets\*(R"
+configuration variable.
+.Sp
+Headers may be abbreviated to fit on the screen in interactive operation. You
+choose which variables to display with the 'c' key, which selects from
+predefined sets, or lets you create your own sets. You can edit the current set
+with the 'e' key.
+.Sp
+This mode doesn't really display any tables like other modes. Instead, it uses
+a table definition to extract and format the data, but it then transforms the
+result in special ways before outputting it. It uses the \*(L"var_status\*(R" table
+definition for this.
+.IP "T: InnoDB Transactions" 4
+.IX Item "T: InnoDB Transactions"
+This mode shows transactions from the InnoDB monitor's output, in \fBtop\fR\-like
+format. This mode is the reason I wrote innotop.
+.Sp
+You can kill queries or processes with the 'k' and 'x' keys, and \s-1EXPLAIN\s0 a query
+with the 'e' or 'f' keys. InnoDB doesn't print the full query in transactions,
+so explaining may not work right if the query is truncated.
+.Sp
+The informational header can be toggled on and off with the 'h' key. By
+default, innotop hides inactive transactions and its own transaction. You can
+toggle this on and off with the 'i' and 'a' keys.
+.Sp
+This mode displays the \*(L"t_header\*(R" and \*(L"innodb_transactions\*(R" tables by
+default.
+.SH "INNOTOP STATUS"
+.IX Header "INNOTOP STATUS"
+The first line innotop displays is a \*(L"status bar\*(R" of sorts. What it contains
+depends on the mode you're in, and what servers you're monitoring. The first
+few words are always the innotop mode, such as \*(L"InnoDB Txns\*(R" for T mode,
+followed by a reminder to press '?' for help at any time.
+.Sh "\s-1ONE\s0 \s-1SERVER\s0"
+.IX Subsection "ONE SERVER"
+The simplest case is when you're monitoring a single server. In this case, the
+name of the connection is next on the status line. This is the name you gave
+when you created the connection \*(-- most likely the MySQL server's hostname.
+This is followed by the server's uptime.
+.PP
+If you're in an InnoDB mode, such as T or B, the next word is \*(L"InnoDB\*(R" followed
+by some information about the \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 output used to render the
+screen. The first word is the number of seconds since the last \s-1SHOW\s0 \s-1INNODB\s0
+\&\s-1STATUS\s0, which InnoDB uses to calculate some per-second statistics. The next is
+a smiley face indicating whether the InnoDB output is truncated. If the smiley
+face is a :\-), all is well; there is no truncation. A :^| means the transaction
+list is so long that InnoDB has printed only some of the transactions. Finally,
+a frown :\-( means the output is incomplete, which is probably due to a deadlock
+printing too much lock information (see \*(L"D: InnoDB Deadlocks\*(R").
+.PP
+The next two words indicate the server's queries per second (\s-1QPS\s0) and how many
+threads (connections) exist. Finally, the server's version number is the last
+thing on the line.
+.Sh "\s-1MULTIPLE\s0 \s-1SERVERS\s0"
+.IX Subsection "MULTIPLE SERVERS"
+If you are monitoring multiple servers (see \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R"), the status
+line does not show any details about individual servers. Instead, it shows the
+names of the connections that are active. Again, these are connection names you
+specified, which are likely to be the server's hostname. A connection that has
+an error is prefixed with an exclamation point.
+.PP
+If you are monitoring a group of servers (see \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R"), the status
+line shows the name of the group. If any connection in the group has an
+error, the group's name is followed by the fraction of the connections that
+don't have errors.
+.PP
+See \*(L"\s-1ERROR\s0 \s-1HANDLING\s0\*(R" for more details about innotop's error handling.
+.Sh "\s-1MONITORING\s0 A \s-1FILE\s0"
+.IX Subsection "MONITORING A FILE"
+If you give a filename on the command line, innotop will not connect to \s-1ANY\s0
+servers at all. It will watch the specified file for InnoDB status output and
+use that as its data source. It will always show a single connection called
+\&'file'. And since it can't connect to a server, it can't determine how long the
+server it's monitoring has been up; so it calculates the server's uptime as time
+since innotop started running.
+.SH "SERVER ADMINISTRATION"
+.IX Header "SERVER ADMINISTRATION"
+While innotop is primarily a monitor that lets you watch and analyze your
+servers, it can also send commands to servers. The most frequently useful
+commands are killing queries and stopping or starting slaves.
+.PP
+You can kill a connection, or in newer versions of MySQL kill a query but not a
+connection, from \*(L"Q: Query List\*(R" and \*(L"T: InnoDB Transactions\*(R" modes.
+Press 'k' to issue a \s-1KILL\s0 command, or 'x' to issue a \s-1KILL\s0 \s-1QUERY\s0 command.
+innotop will prompt you for the server and/or connection \s-1ID\s0 to kill (innotop
+does not prompt you if there is only one possible choice for any input).
+innotop pre-selects the longest-running query, or the oldest connection.
+Confirm the command with 'y'.
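+.PP
+Behind the scenes these key mappings issue ordinary MySQL statements. Using
+connection \s-1ID\s0 12 from the earlier locks example, the two commands would look
+roughly like this:
+.PP
+.Vb 2
+\& KILL 12;
+\& KILL QUERY 12;
+.Ve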
+.PP
+In \*(L"M: Master/Slave Replication Status\*(R" mode, you can start and stop slaves
+with the 'a' and 'o' keys, respectively. You can send these commands to many
+slaves at once. innotop fills in a default command of \s-1START\s0 \s-1SLAVE\s0 or \s-1STOP\s0 \s-1SLAVE\s0
+for you, but you can actually edit the command and send anything you wish, such
+as \s-1SET\s0 \s-1GLOBAL\s0 SQL_SLAVE_SKIP_COUNTER=1 to make the slave skip one binlog event
+when it starts.
+.PP
+You can also ask innotop to calculate the earliest binlog in use by any slave
+and issue a \s-1PURGE\s0 \s-1MASTER\s0 \s-1LOGS\s0 on the master. Use the 'b' key for this. innotop
+will prompt you for a master to run the command on, then prompt you for the
+connection names of that master's slaves (there is no way for innotop to
+determine this reliably itself). innotop will find the minimum binlog in use by
+these slave connections and suggest it as the argument to \s-1PURGE\s0 \s-1MASTER\s0 \s-1LOGS\s0.
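+.PP
+The suggested statement is an ordinary \s-1PURGE\s0 \s-1MASTER\s0 \s-1LOGS\s0 command; the binlog
+name below is only a placeholder, since the real name depends on your master:
+.PP
+.Vb 1
+\& PURGE MASTER LOGS TO \(aqmysql\-bin.000123\(aq;
+.Ve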
+.SH "SERVER CONNECTIONS"
+.IX Header "SERVER CONNECTIONS"
+When you create a server connection, innotop asks you for a series of inputs, as
+follows:
+.IP "\s-1DSN\s0" 4
+.IX Item "DSN"
+A \s-1DSN\s0 is a Data Source Name, which is the initial argument passed to the \s-1DBI\s0
+module for connecting to a server. It is usually of the form
+.Sp
+.Vb 1
+\& DBI:mysql:;mysql_read_default_group=mysql;host=HOSTNAME
+.Ve
+.Sp
+Since this \s-1DSN\s0 is passed to the DBD::mysql driver, you should read the driver's
+documentation at \*(L"http://search.cpan.org/dist/DBD\-mysql/lib/DBD/mysql.pm\*(R" for
+the exact details on all the options you can pass the driver in the \s-1DSN\s0. You
+can read more about \s-1DBI\s0 at <http://dbi.perl.org/docs/>, and especially at
+<http://search.cpan.org/~timb/DBI/DBI.pm>.
+.Sp
+The mysql_read_default_group=mysql option lets the \s-1DBD\s0 driver read your MySQL
+options files, such as ~/.my.cnf on UNIX-ish systems. You can use this to avoid
+specifying a username or password for the connection.
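+.Sp
+For example, a minimal ~/.my.cnf that supplies credentials through the [mysql]
+group (which is read because of mysql_read_default_group=mysql) might look like
+this; the username and password are placeholders:
+.Sp
+.Vb 3
+\& [mysql]
+\& user=monitor
+\& password=secret
+.Ve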
+.IP "InnoDB Deadlock Table" 4
+.IX Item "InnoDB Deadlock Table"
+This optional item tells innotop a table name it can use to deliberately create
+a small deadlock (see \*(L"D: InnoDB Deadlocks\*(R"). If you specify this option,
+you just need to be sure the table doesn't exist, and that innotop can create
+and drop the table with the InnoDB storage engine. You can safely omit or just
+accept the default if you don't intend to use this.
+.IP "Username" 4
+.IX Item "Username"
+innotop will ask you if you want to specify a username. If you say 'y', it will
+then prompt you for a user name. If you have a MySQL option file that specifies
+your username, you don't have to enter one here.
+.Sp
+The username defaults to your login name on the system you're running innotop on.
+.IP "Password" 4
+.IX Item "Password"
+innotop will ask you if you want to specify a password. Like the username, the
+password is optional, but there's an additional prompt that asks if you want to
+save the password in the innotop configuration file. If you don't save it in
+the configuration file, innotop will prompt you for a password each time it
+starts. Passwords in the innotop configuration file are saved in plain text,
+not encrypted in any way.
+.PP
+Once you finish answering these questions, you should be connected to a server.
+But innotop isn't limited to monitoring a single server; you can define many
+server connections and switch between them by pressing the '@' key. See
+\&\*(L"\s-1SWITCHING\s0 \s-1BETWEEN\s0 \s-1CONNECTIONS\s0\*(R".
+.PP
+To create a new connection, press the '@' key and type the name of the new
+connection, then follow the steps given above.
+.SH "SERVER GROUPS"
+.IX Header "SERVER GROUPS"
+If you have multiple MySQL instances, you can put them into named groups, such
+as 'all', 'masters', and 'slaves', which innotop can monitor all together.
+.PP
+You can choose which group to monitor with the '#' key, and you can press the
+\&\s-1TAB\s0 key to switch to the next group. If you're not currently monitoring a
+group, pressing \s-1TAB\s0 selects the first group.
+.PP
+To create a group, press the '#' key and type the name of your new group, then
+type the names of the connections you want the group to contain.
+.SH "SWITCHING BETWEEN CONNECTIONS"
+.IX Header "SWITCHING BETWEEN CONNECTIONS"
+innotop lets you quickly switch which servers you're monitoring. The most basic
+way is by pressing the '@' key and typing the name(s) of the connection(s) you
+want to use. This setting is per\-mode, so you can monitor different connections
+in each mode, and innotop remembers which connections you choose.
+.PP
+You can quickly switch to the 'next' connection in alphabetical order with the
+\&'n' key. If you're monitoring a server group (see \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R") this will
+switch to the first connection.
+.PP
+You can also type many connection names, and innotop will fetch and display data
+from them all. Just separate the connection names with spaces, for example
+\&\*(L"server1 server2.\*(R" Again, if you type the name of a connection that doesn't
+exist, innotop will prompt you for connection information and create the
+connection.
+.PP
+Another way to monitor multiple connections at once is with server groups. You
+can use the \s-1TAB\s0 key to switch to the 'next' group in alphabetical order, or if
+you're not monitoring any groups, \s-1TAB\s0 will switch to the first group.
+.PP
+innotop does not fetch data in parallel from connections, so if you are
+monitoring a large group or many connections, you may notice increased delay
+between ticks.
+.PP
+When you monitor more than one connection, innotop's status bar changes. See
+\&\*(L"\s-1INNOTOP\s0 \s-1STATUS\s0\*(R".
+.SH "ERROR HANDLING"
+.IX Header "ERROR HANDLING"
+Error handling is not that important when monitoring a single connection, but is
+crucial when you have many active connections. A crashed server or lost
+connection should not crash innotop. As a result, innotop will continue to run
+even when there is an error; it just won't display any information from the
+connection that had an error. Because of this, innotop's behavior might confuse
+you. It's a feature, not a bug!
+.PP
+innotop does not continue to query connections that have errors, because they
+may slow innotop and make it hard to use, especially if the error is a problem
+connecting and causes a long time\-out. Instead, innotop retries the connection
+occasionally to see if the error still exists. If it does, innotop waits some
+number of ticks before retrying again. The wait time grows along the Fibonacci
+series, so innotop retries less and less frequently as time passes.
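+.PP
+As a rough, illustrative sketch in Perl (this is not innotop's own code), such
+a Fibonacci back-off could be computed like this:
+.PP
+.Vb 6
+\& my ( $wait, $next ) = ( 1, 1 );   # both counted in ticks
+\& sub next_retry_wait {
+\&     # advance along the Fibonacci series: 1, 2, 3, 5, 8, ...
+\&     ( $wait, $next ) = ( $next, $wait + $next );
+\&     return $wait;
+\& }
+.Ve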
+.PP
+Since errors might only happen in certain modes because of the \s-1SQL\s0 commands
+issued in those modes, innotop keeps track of which mode caused the error. If
+you switch to a different mode, innotop will retry the connection instead of
+waiting.
+.PP
+By default innotop will display the problem in red text at the bottom of the
+first table on the screen. You can disable this behavior with the
+\&\*(L"show_cxn_errors_in_tbl\*(R" configuration option, which is enabled by default.
+If the \*(L"debug\*(R" option is enabled, innotop will display the error at the
+bottom of every table, not just the first. And if \*(L"show_cxn_errors\*(R" is
+enabled, innotop will print the error text to \s-1STDOUT\s0 as well. Error messages
+might only display in the mode that caused the error, depending on the mode and
+whether innotop is avoiding querying that connection.
+.SH "NON-INTERACTIVE OPERATION"
+.IX Header "NON-INTERACTIVE OPERATION"
+You can run innotop in non-interactive mode, in which case it is entirely
+controlled from the configuration file and command-line options. To start
+innotop in non-interactive mode, give the \*(L"\-\-nonint\*(R" command-line option.
+This changes innotop's behavior in the following ways:
+.IP "\(bu" 4
+Certain Perl modules are not loaded. Term::ReadLine is not loaded, since
+innotop doesn't prompt interactively. Term::ANSIColor and Win32::Console::ANSI
+modules are not loaded. Term::ReadKey is still used, since innotop may have to
+prompt for connection passwords when starting up.
+.IP "\(bu" 4
+innotop does not clear the screen after each tick.
+.IP "\(bu" 4
+innotop does not persist any changes to the configuration file.
+.IP "\(bu" 4
+If \*(L"\-\-count\*(R" is given and innotop is in incremental mode (see \*(L"status_inc\*(R"
+and \*(L"\-\-inc\*(R"), innotop actually refreshes one more time than specified so it
+can print incremental statistics. This suppresses output during the first
+tick, so innotop may appear to hang.
+.IP "\(bu" 4
+innotop only displays the first table in each mode. This is so the output can
+be easily processed with other command-line utilities such as awk and sed. To
+change which tables display in each mode, see \*(L"\s-1TABLES\s0\*(R". Since \*(L"Q: Query List\*(R" mode is so important, innotop automatically disables the \*(L"q_header\*(R"
+table. This ensures you'll see the \*(L"processlist\*(R" table, even if you have
+innotop configured to show the q_header table during interactive operation.
+Similarly, in \*(L"T: InnoDB Transactions\*(R" mode, the \*(L"t_header\*(R" table is
+suppressed so you see only the \*(L"innodb_transactions\*(R" table.
+.IP "\(bu" 4
+All output is tab-separated instead of being column-aligned with whitespace, and
+innotop prints the full contents of each table instead of only printing one
+screenful at a time.
+.IP "\(bu" 4
+innotop only prints column headers once instead of every tick (see
+\&\*(L"hide_hdr\*(R"). innotop does not print table captions (see
+\&\*(L"display_table_captions\*(R"). innotop ensures there are no empty lines in the
+output.
+.IP "\(bu" 4
+innotop does not honor the \*(L"shorten\*(R" transformation, which normally shortens
+some numbers to human-readable formats.
+.IP "\(bu" 4
+innotop does not print a status line (see \*(L"\s-1INNOTOP\s0 \s-1STATUS\s0\*(R").
+.SH "CONFIGURING"
+.IX Header "CONFIGURING"
+Nearly everything about innotop is configurable. You can change most things
+with built-in commands, but you can also edit the configuration file directly.
+.PP
+While running innotop, press the '$' key to bring up the configuration editing
+dialog. Press another key to select the type of data you want to edit:
+.IP "S: Statement Sleep Times" 4
+.IX Item "S: Statement Sleep Times"
+Edits \s-1SQL\s0 statement sleep delays, which make innotop pause for the specified
+amount of time after executing a statement. See \*(L"\s-1SQL\s0 \s-1STATEMENTS\s0\*(R" for a
+definition of each statement and what it does. By default innotop does not
+delay after any statements.
+.Sp
+This feature is included so you can customize the side-effects caused by
+monitoring your server. You may not see any effects, but some innotop users
+have noticed that certain MySQL versions under very high load with InnoDB
+enabled take longer than usual to execute \s-1SHOW\s0 \s-1GLOBAL\s0 \s-1STATUS\s0. If innotop calls
+\&\s-1SHOW\s0 \s-1FULL\s0 \s-1PROCESSLIST\s0 immediately afterward, the processlist contains more
+queries than the machine actually averages at any given moment. Configuring
+innotop to pause briefly after calling \s-1SHOW\s0 \s-1GLOBAL\s0 \s-1STATUS\s0 alleviates this
+effect.
+.Sp
+Sleep times are stored in the \*(L"stmt_sleep_times\*(R" section of the configuration
+file. Fractional-second sleeps are supported, subject to your hardware's
+limitations.
+.IP "c: Edit Columns" 4
+.IX Item "c: Edit Columns"
+Starts the table editor on one of the displayed tables. See \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R".
+An alternative way to start the table editor without entering the configuration
+dialog is with the '^' key.
+.IP "g: General Configuration" 4
+.IX Item "g: General Configuration"
+Starts the configuration editor to edit global and mode-specific configuration
+variables (see \*(L"\s-1MODES\s0\*(R"). innotop prompts you to choose a variable from among
+the global and mode-specific ones depending on the current mode.
+.IP "k: Row-Coloring Rules" 4
+.IX Item "k: Row-Coloring Rules"
+Starts the row-coloring rules editor on one of the displayed table(s). See
+\&\*(L"\s-1COLORS\s0\*(R" for details.
+.IP "p: Manage Plugins" 4
+.IX Item "p: Manage Plugins"
+Starts the plugin configuration editor. See \*(L"\s-1PLUGINS\s0\*(R" for details.
+.IP "s: Server Groups" 4
+.IX Item "s: Server Groups"
+Lets you create and edit server groups. See \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R".
+.IP "t: Choose Displayed Tables" 4
+.IX Item "t: Choose Displayed Tables"
+Lets you choose which tables to display in this mode. See \*(L"\s-1MODES\s0\*(R" and
+\&\*(L"\s-1TABLES\s0\*(R".
+.SH "CONFIGURATION FILE"
+.IX Header "CONFIGURATION FILE"
+innotop's default configuration file location is \f(CW$HOME\fR/.innotop, but it can
+be overridden with the \*(L"\-\-config\*(R" command-line option. You can edit the
+file by hand safely, but do so while innotop is not running: innotop reads the
+configuration file when it starts and writes it out again when it exits, so any
+hand edits you make while innotop is running will be lost.
+.PP
+innotop doesn't store its entire configuration in the configuration file. It
+holds a large set of built-in defaults in memory, and the configuration file
+only overrides those defaults. When you customize a default setting, innotop
+notices and stores the customization in the file.
+This keeps the file size down, makes it easier to edit, and makes upgrades
+easier.
+.PP
+A configuration file can be made read\-only. See \*(L"readonly\*(R".
+.PP
+The configuration file is arranged into sections like an \s-1INI\s0 file. Each
+section begins with [section\-name] and ends with [/section\-name]. Each
+section's entries have a different syntax depending on the data they need to
+store. You can put comments in the file; any line that begins with a #
+character is a comment. innotop ignores comments when it reads the file, so it
+won't write them back out when it exits. In a read-only configuration file,
+though, comments persist and can still be useful.
+.PP
+The first line in the file is innotop's version number. This lets innotop
+notice when the file format is not backwards\-compatible, and upgrade smoothly
+without destroying your customized configuration.
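+.PP
+Putting that together, a heavily trimmed configuration file might look roughly
+like the following; the version line, variable names, and values shown here are
+only illustrative:
+.PP
+.Vb 6
+\& version=1.6.0
+\& # any line beginning with a hash is a comment
+\& [general]
+\& mode=T
+\& status_inc=0
+\& [/general]
+.Ve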
+.PP
+The following list describes each section of the configuration file and the data
+it contains:
+.IP "general" 4
+.IX Item "general"
+The 'general' section contains global configuration variables and variables that
+may be mode\-specific, but don't belong in any other section. The syntax is a
+simple key=value list. innotop writes a comment above each value to help you
+edit the file by hand.
+.RS 4
+.IP "S_func" 4
+.IX Item "S_func"
+Controls S mode presentation (see \*(L"S: Variables & Status\*(R"). If g, values are
+graphed; if s, values are like vmstat; if p, values are in a pivoted table.
+.IP "S_set" 4
+.IX Item "S_set"
+Specifies which set of variables to display in \*(L"S: Variables & Status\*(R" mode.
+See \*(L"\s-1VARIABLE\s0 \s-1SETS\s0\*(R".
+.IP "auto_wipe_dl" 4
+.IX Item "auto_wipe_dl"
+Instructs innotop to automatically wipe large deadlocks when it notices them.
+When this happens you may notice a slight delay. At the next tick, you will
+usually see the information that was being truncated by the large deadlock.
+.IP "charset" 4
+.IX Item "charset"
+Specifies what kind of characters to allow through the \*(L"no_ctrl_char\*(R"
+transformation. This keeps non-printable characters from confusing a
+terminal when you monitor queries that contain binary data, such as images.
+.Sp
+The default is 'ascii', which considers anything outside normal \s-1ASCII\s0 to be a
+control character. The other allowable values are 'unicode' and 'none'. 'none'
+considers every character a control character, which can be useful for
+collapsing \s-1ALL\s0 text fields in queries.
+.IP "cmd_filter" 4
+.IX Item "cmd_filter"
+This is the prefix that filters variables in \*(L"C: Command Summary\*(R" mode.
+.IP "color" 4
+.IX Item "color"
+Whether terminal coloring is permitted.
+.IP "cxn_timeout" 4
+.IX Item "cxn_timeout"
+On MySQL versions 4.0.3 and newer, this variable is used to set the connection's
+timeout, so MySQL doesn't close the connection if it is not used for a while.
+This might happen because a connection isn't monitored in a particular mode, for
+example.
+.IP "debug" 4
+.IX Item "debug"
+This option enables more verbose errors and makes innotop more strict in some
+places. It can help in debugging filters and other user-defined code. It also
+makes innotop write a lot of information to \*(L"debugfile\*(R" when there is a
+crash.
+.IP "debugfile" 4
+.IX Item "debugfile"
+A file to which innotop will write information when there is a crash. See
+\&\*(L"\s-1FILES\s0\*(R".
+.IP "display_table_captions" 4
+.IX Item "display_table_captions"
+innotop displays a table caption above most tables. This variable suppresses or
+shows captions on all tables globally. Some tables are configured with the
+hide_caption property, which overrides this.
+.IP "global" 4
+.IX Item "global"
+Whether to show \s-1GLOBAL\s0 variables and status. innotop only tries to do this on
+servers which support the \s-1GLOBAL\s0 option to \s-1SHOW\s0 \s-1VARIABLES\s0 and \s-1SHOW\s0 \s-1STATUS\s0. In
+some MySQL versions, you need certain privileges to do this; if you don't have
+them, innotop will not be able to fetch any variable and status data. This
+configuration variable lets you run innotop and fetch what data you can even
+without the elevated privileges.
+.Sp
+I can no longer find or reproduce the situation where \s-1GLOBAL\s0 wasn't allowed, but
+I know there was one.
+.IP "graph_char" 4
+.IX Item "graph_char"
+Defines the character to use when drawing graphs in \*(L"S: Variables & Status\*(R"
+mode.
+.IP "header_highlight" 4
+.IX Item "header_highlight"
+Defines how to highlight column headers. This only works if Term::ANSIColor is
+available. Valid values are 'bold' and 'underline'.
+.IP "hide_hdr" 4
+.IX Item "hide_hdr"
+Hides column headers globally.
+.IP "interval" 4
+.IX Item "interval"
+The interval at which innotop will refresh its data (ticks). The interval is
+implemented as a sleep time between ticks, so the true interval will vary
+depending on how long it takes innotop to fetch and render data.
+.Sp
+This variable accepts fractions of a second.
+.IP "mode" 4
+.IX Item "mode"
+The mode in which innotop should start. Allowable arguments are the same as the
+key presses that select a mode interactively. See \*(L"\s-1MODES\s0\*(R".
+.IP "num_digits" 4
+.IX Item "num_digits"
+How many digits to show in fractional numbers and percents. This variable's
+range is between 0 and 9 and can be set directly from \*(L"S: Variables & Status\*(R"
+mode with the '+' and '\-' keys. It is used in the \*(L"set_precision\*(R",
+\&\*(L"shorten\*(R", and \*(L"percent\*(R" transformations.
+.IP "num_status_sets" 4
+.IX Item "num_status_sets"
+Controls how many sets of status variables to display in pivoted \*(L"S: Variables & Status\*(R" mode. It also controls the number of old sets of variables innotop
+keeps in its memory, so the larger this variable is, the more memory innotop
+uses.
+.IP "plugin_dir" 4
+.IX Item "plugin_dir"
+Specifies where plugins can be found. By default, innotop stores plugins in the
+\&'plugins' subdirectory of your innotop configuration directory.
+.IP "readonly" 4
+.IX Item "readonly"
+Whether the configuration file is readonly. This cannot be set interactively,
+because it would prevent itself from being written to the configuration file.
+.IP "show_cxn_errors" 4
+.IX Item "show_cxn_errors"
+Makes innotop print connection errors to \s-1STDOUT\s0. See \*(L"\s-1ERROR\s0 \s-1HANDLING\s0\*(R".
+.IP "show_cxn_errors_in_tbl" 4
+.IX Item "show_cxn_errors_in_tbl"
+Makes innotop display connection errors as rows in the first table on screen.
+See \*(L"\s-1ERROR\s0 \s-1HANDLING\s0\*(R".
+.IP "show_percent" 4
+.IX Item "show_percent"
+Adds a '%' character after the value returned by the \*(L"percent\*(R"
+transformation.
+.IP "show_statusbar" 4
+.IX Item "show_statusbar"
+Controls whether to show the status bar in the display. See \*(L"\s-1INNOTOP\s0 \s-1STATUS\s0\*(R".
+.IP "skip_innodb" 4
+.IX Item "skip_innodb"
+Disables fetching \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0, in case your server(s) do not have InnoDB
+enabled and you don't want innotop to try to fetch it. This can also be useful
+when you don't have the \s-1SUPER\s0 privilege, required to run \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.IP "status_inc" 4
+.IX Item "status_inc"
+Whether to show absolute or incremental values for status variables.
+Incremental values are calculated as an offset from the last value innotop saw
+for that variable. This is a global setting, but will probably become
+mode-specific at some point. Right now it is honored a bit inconsistently; some
+modes don't pay attention to it.
+.RE
+.RS 4
+.RE
+.IP "plugins" 4
+.IX Item "plugins"
+This section holds a list of package names of active plugins. If the plugin
+exists, innotop will activate it. See \*(L"\s-1PLUGINS\s0\*(R" for more information.
+.IP "filters" 4
+.IX Item "filters"
+This section holds user-defined filters (see \*(L"\s-1FILTERS\s0\*(R"). Each line is in the
+format filter_name=text='filter text' tbls='table list'.
+.Sp
+The filter text is the text of the subroutine's code. The table list is a list
+of tables to which the filter can apply. By default, user-defined filters apply
+to the table for which they were created, but you can manually override that by
+editing the definition in the configuration file.
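+.Sp
+For example, the five-minute processlist filter described in \*(L"\s-1FILTERS\s0\*(R"
+might be stored with a line like this (the filter name is whatever you chose
+when you created it):
+.Sp
+.Vb 1
+\& slow_queries=text=\(aq$set\->{time} > 300\(aq tbls=\(aqprocesslist\(aq
+.Ve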
+.IP "active_filters" 4
+.IX Item "active_filters"
+This section stores which filters are active on each table. Each line is in the
+format table_name=filter_list.
+.IP "tbl_meta" 4
+.IX Item "tbl_meta"
+This section stores user-defined or user-customized columns (see \*(L"\s-1COLUMNS\s0\*(R").
+Each line is in the format col_name=properties, where the properties are a
+name=quoted\-value list.
+.IP "connections" 4
+.IX Item "connections"
+This section holds the server connections you have defined. Each line is in the
+format name=properties, where the properties are a name=value list. The
+properties are self\-explanatory, and the only one that is treated specially is
+\&'pass' which is only present if 'savepass' is set. See \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R".
+.IP "active_connections" 4
+.IX Item "active_connections"
+This section holds a list of which connections are active in each mode. Each
+line is in the format mode_name=connection_list.
+.IP "server_groups" 4
+.IX Item "server_groups"
+This section holds server groups. Each line is in the format
+name=connection_list. See \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R".
+.IP "active_server_groups" 4
+.IX Item "active_server_groups"
+This section holds a list of which server group is active in each mode. Each
+line is in the format mode_name=server_group.
+.IP "max_values_seen" 4
+.IX Item "max_values_seen"
+This section holds the maximum values seen for variables. This is used to scale
+the graphs in \*(L"S: Variables & Status\*(R" mode. Each line is in the format
+name=value.
+.IP "active_columns" 4
+.IX Item "active_columns"
+This section holds table column lists. Each line is in the format
+tbl_name=column_list. See \*(L"\s-1COLUMNS\s0\*(R".
+.IP "sort_cols" 4
+.IX Item "sort_cols"
+This section holds the sort definition. Each line is in the format
+tbl_name=column_list. If a column is prefixed with '\-', that column sorts
+descending. See \*(L"\s-1SORTING\s0\*(R".
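+.Sp
+For example, to sort the \*(L"processlist\*(R" table by its time column descending
+and then by its cxn column ascending, the entry might look like this:
+.Sp
+.Vb 1
+\& processlist=\-time cxn
+.Ve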
+.IP "visible_tables" 4
+.IX Item "visible_tables"
+This section defines which tables are visible in each mode. Each line is in the
+format mode_name=table_list. See \*(L"\s-1TABLES\s0\*(R".
+.IP "varsets" 4
+.IX Item "varsets"
+This section defines variable sets for use in \*(L"S: Variables & Status\*(R" mode.
+Each line is in the format name=variable_list. See \*(L"\s-1VARIABLE\s0 \s-1SETS\s0\*(R".
+.IP "colors" 4
+.IX Item "colors"
+This section defines colorization rules. Each line is in the format
+tbl_name=property_list. See \*(L"\s-1COLORS\s0\*(R".
+.IP "stmt_sleep_times" 4
+.IX Item "stmt_sleep_times"
+This section contains statement sleep times. Each line is in the format
+statement_name=sleep_time. See \*(L"S: Statement Sleep Times\*(R".
+.IP "group_by" 4
+.IX Item "group_by"
+This section contains column lists for table group_by expressions. Each line is
+in the format tbl_name=column_list. See \*(L"\s-1GROUPING\s0\*(R".
+.SH "CUSTOMIZING"
+.IX Header "CUSTOMIZING"
+You can customize innotop a great deal. For example, you can:
+.IP "\(bu" 4
+Choose which tables to display, and in what order.
+.IP "\(bu" 4
+Choose which columns are in those tables, and create new columns.
+.IP "\(bu" 4
+Filter which rows display with built-in filters, user-defined filters, and
+quick\-filters.
+.IP "\(bu" 4
+Sort the rows to put important data first or group together related rows.
+.IP "\(bu" 4
+Highlight rows with color.
+.IP "\(bu" 4
+Customize the alignment, width, and formatting of columns, and apply
+transformations to columns to extract parts of their values or format the values
+as you wish (for example, shortening large numbers to familiar units).
+.IP "\(bu" 4
+Design your own expressions to extract and combine data as you need. This gives
+you unlimited flexibility.
+.PP
+All these and more are explained in the following sections.
+.Sh "\s-1TABLES\s0"
+.IX Subsection "TABLES"
+A table is what you'd expect: a collection of columns. It also has some other
+properties, such as a caption. Filters, sorting rules, and colorization rules
+belong to tables and are covered in later sections.
+.PP
+Internally, table meta-data is defined in a data structure called \f(CW%tbl_meta\fR.
+This hash holds all built-in table definitions, which contain a lot of default
+instructions to innotop. The meta-data includes the caption, a list of columns
+the user has customized, a list of columns, a list of visible columns, a list of
+filters, color rules, a sort-column list, sort direction, and some information
+about the table's data sources. Most of this is customizable via the table
+editor (see \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R").
+.PP
+You can choose which tables to show by pressing the '$' key. See \*(L"\s-1MODES\s0\*(R" and
+\&\*(L"\s-1TABLES\s0\*(R".
+.PP
+The table life-cycle is as follows:
+.IP "\(bu" 4
+Each table begins with a data source, which is an array of hashes. See below
+for details on data sources.
+.IP "\(bu" 4
+Each element of the data source becomes a row in the final table.
+.IP "\(bu" 4
+For each element in the data source, innotop extracts values from the source and
+creates a row. This row is another hash, which later steps will refer to as
+\&\f(CW$set\fR. The values innotop extracts are determined by the table's columns. Each
+column has an extraction subroutine, compiled from an expression (see
+\&\*(L"\s-1EXPRESSIONS\s0\*(R"). The resulting row is a hash whose keys are named the same as
+the column name.
+.IP "\(bu" 4
+innotop filters the rows, removing those that don't need to be displayed. See
+\&\*(L"\s-1FILTERS\s0\*(R".
+.IP "\(bu" 4
+innotop sorts the rows. See \*(L"\s-1SORTING\s0\*(R".
+.IP "\(bu" 4
+innotop groups the rows together, if specified. See \*(L"\s-1GROUPING\s0\*(R".
+.IP "\(bu" 4
+innotop colorizes the rows. See \*(L"\s-1COLORS\s0\*(R".
+.IP "\(bu" 4
+innotop transforms the column values in each row. See \*(L"\s-1TRANSFORMATIONS\s0\*(R".
+.IP "\(bu" 4
+innotop optionally pivots the rows (see \*(L"\s-1PIVOTING\s0\*(R"), then filters and sorts
+them.
+.IP "\(bu" 4
+innotop formats and justifies the rows as a table. During this step, innotop
+applies further formatting to the column values, including alignment, maximum
+and minimum widths. innotop also does final error checking to ensure there are
+no crashes due to undefined values. innotop then adds a caption if specified,
+and the table is ready to print.
+.PP
+The lifecycle is slightly different if the table is pivoted, as noted above. To
+clarify, if the table is pivoted, the process is extract, group, transform,
+pivot, filter, sort, create. If it's not pivoted, the process is extract,
+filter, sort, group, color, transform, create. This slightly convoluted process
+doesn't map all that well to \s-1SQL\s0, but pivoting complicates things pretty
+thoroughly. Roughly speaking, filtering and sorting happen as late as they need
+to in order to produce the result you expect, but as early as possible for
+efficiency.
+.PP
+Each built-in table is described below:
+.IP "adaptive_hash_index" 4
+.IX Item "adaptive_hash_index"
+Displays data about InnoDB's adaptive hash index. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "buffer_pool" 4
+.IX Item "buffer_pool"
+Displays data about InnoDB's buffer pool. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "cmd_summary" 4
+.IX Item "cmd_summary"
+Displays weighted status variables. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "deadlock_locks" 4
+.IX Item "deadlock_locks"
+Shows which locks were held and waited for by the last detected deadlock. Data
+source: \*(L"\s-1DEADLOCK_LOCKS\s0\*(R".
+.IP "deadlock_transactions" 4
+.IX Item "deadlock_transactions"
+Shows transactions involved in the last detected deadlock. Data source:
+\&\*(L"\s-1DEADLOCK_TRANSACTIONS\s0\*(R".
+.IP "explain" 4
+.IX Item "explain"
+Shows the output of \s-1EXPLAIN\s0. Data source: \*(L"\s-1EXPLAIN\s0\*(R".
+.IP "file_io_misc" 4
+.IX Item "file_io_misc"
+Displays data about InnoDB's file and I/O operations. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "fk_error" 4
+.IX Item "fk_error"
+Displays various data about InnoDB's last foreign key error. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "innodb_locks" 4
+.IX Item "innodb_locks"
+Displays InnoDB locks. Data source: \*(L"\s-1INNODB_LOCKS\s0\*(R".
+.IP "innodb_transactions" 4
+.IX Item "innodb_transactions"
+Displays data about InnoDB's current transactions. Data source:
+\&\*(L"\s-1INNODB_TRANSACTIONS\s0\*(R".
+.IP "insert_buffers" 4
+.IX Item "insert_buffers"
+Displays data about InnoDB's insert buffer. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "io_threads" 4
+.IX Item "io_threads"
+Displays data about InnoDB's I/O threads. Data source: \*(L"\s-1IO_THREADS\s0\*(R".
+.IP "log_statistics" 4
+.IX Item "log_statistics"
+Displays data about InnoDB's logging system. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "master_status" 4
+.IX Item "master_status"
+Displays replication master status. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "open_tables" 4
+.IX Item "open_tables"
+Displays open tables. Data source: \*(L"\s-1OPEN_TABLES\s0\*(R".
+.IP "page_statistics" 4
+.IX Item "page_statistics"
+Displays InnoDB page statistics. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "pending_io" 4
+.IX Item "pending_io"
+Displays InnoDB pending I/O operations. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "processlist" 4
+.IX Item "processlist"
+Displays current MySQL processes (threads/connections). Data source:
+\&\*(L"\s-1PROCESSLIST\s0\*(R".
+.IP "q_header" 4
+.IX Item "q_header"
+Displays various status values. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "row_operation_misc" 4
+.IX Item "row_operation_misc"
+Displays data about InnoDB's row operations. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "row_operations" 4
+.IX Item "row_operations"
+Displays data about InnoDB's row operations. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "semaphores" 4
+.IX Item "semaphores"
+Displays data about InnoDB's semaphores and mutexes. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "slave_io_status" 4
+.IX Item "slave_io_status"
+Displays data about the slave I/O thread. Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "slave_sql_status" 4
+.IX Item "slave_sql_status"
+Displays data about the slave \s-1SQL\s0 thread. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "t_header" 4
+.IX Item "t_header"
+Displays various InnoDB status values. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "var_status" 4
+.IX Item "var_status"
+Displays user-configurable data. Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "wait_array" 4
+.IX Item "wait_array"
+Displays data about InnoDB's \s-1OS\s0 wait array. Data source: \*(L"\s-1OS_WAIT_ARRAY\s0\*(R".
+.Sh "\s-1COLUMNS\s0"
+.IX Subsection "COLUMNS"
+Columns belong to tables. You can choose a table's columns by pressing the '^'
+key, which starts the \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R" and lets you choose and edit columns.
+Pressing 'e' from within the table editor lets you edit the column's properties:
+.IP "\(bu" 4
+hdr: a column header. This appears in the first row of the table.
+.IP "\(bu" 4
+just: justification. '\-' means left-justified and '' means right\-justified,
+just as with printf formatting codes (not a coincidence).
+.IP "\(bu" 4
+dec: whether to further align the column on the decimal point.
+.IP "\(bu" 4
+num: whether the column is numeric. This affects how values are sorted
+(lexically or numerically).
+.IP "\(bu" 4
+label: a small note about the column, which appears in dialogs that help the
+user choose columns.
+.IP "\(bu" 4
+src: an expression that innotop uses to extract the column's data from its
+source (see \*(L"\s-1DATA\s0 \s-1SOURCES\s0\*(R"). See \*(L"\s-1EXPRESSIONS\s0\*(R" for more on expressions.
+.IP "\(bu" 4
+minw: specifies a minimum display width. This helps stabilize the display,
+which makes it easier to read if the data is changing frequently.
+.IP "\(bu" 4
+maxw: similar to minw.
+.IP "\(bu" 4
+trans: a list of column transformations. See \*(L"\s-1TRANSFORMATIONS\s0\*(R".
+.IP "\(bu" 4
+agg: an aggregate function. See \*(L"\s-1GROUPING\s0\*(R". The default is \*(L"first\*(R".
+.IP "\(bu" 4
+aggonly: controls whether the column only shows when grouping is enabled on the
+table (see \*(L"\s-1GROUPING\s0\*(R"). By default, this is disabled. This means columns
+will always be shown by default, whether grouping is enabled or not. If a
+column's aggonly is set true, the column will appear when you toggle grouping on
+the table. Several columns are set this way, such as the count column on
+\&\*(L"processlist\*(R" and \*(L"innodb_transactions\*(R", so you don't see a count when the
+grouping isn't enabled, but you do when it is.
+.Sh "\s-1FILTERS\s0"
+.IX Subsection "FILTERS"
+Filters remove rows from the display. They behave much like a \s-1WHERE\s0 clause in
+\&\s-1SQL\s0. innotop has several built-in filters, which remove irrelevant information
+like inactive queries, but you can define your own as well. innotop also lets
+you create quick\-filters, which do not get saved to the configuration file, and
+are just an easy way to quickly view only some rows.
+.PP
+You can enable or disable a filter on any table. Press the '%' key (mnemonic: %
+looks kind of like a line being filtered between two circles) and choose which
+table you want to filter, if asked. You'll then see a list of possible filters
+and a list of filters currently enabled for that table. Type the names of
+filters you want to apply and press Enter.
+.PP
+\fIUSER-DEFINED \s-1FILTERS\s0\fR
+.IX Subsection "USER-DEFINED FILTERS"
+.PP
+If you type a name that doesn't exist, innotop will prompt you to create the
+filter. Filters are easy to create if you know Perl, and not hard if you don't.
+What you're doing is creating a subroutine that returns true if the row should
+be displayed. The row is a hash reference passed to your subroutine as \f(CW$set\fR.
+.PP
+For example, imagine you want to filter the processlist table so you only see
+queries that have been running more than five minutes. Type a new name for your
+filter, and when prompted for the subroutine body, press \s-1TAB\s0 to initiate your
+terminal's auto\-completion. You'll see the names of the columns in the
+\&\*(L"processlist\*(R" table (innotop generally tries to help you with auto-completion
+lists). You want to filter on the 'time' column. Type the text \*(L"$set\->{time} >
+300\*(R" to return true when the query is more than five minutes old. That's all
+you need to do.
+.PP
+In other words, the code you're typing is surrounded by an implicit context,
+which looks like this:
+.PP
+.Vb 4
+\& sub filter {
+\& my ( $set ) = @_;
+\& # YOUR CODE HERE
+\& }
+.Ve
+.PP
+If your filter doesn't work, or if something else suddenly behaves differently,
+you might have made an error in your filter, and innotop is silently catching
+the error. Try enabling \*(L"debug\*(R" to make innotop throw an error instead.
+.PP
+\fIQUICK-FILTERS\fR
+.IX Subsection "QUICK-FILTERS"
+.PP
+innotop's quick-filters are a shortcut to create a temporary filter that doesn't
+persist when you restart innotop. To create a quick\-filter, press the '/' key.
+innotop will prompt you for the column name and filter text. Again, you can use
+auto-completion on column names. The filter text can be just the text you want
+to \*(L"search for.\*(R" For example, to filter the \*(L"processlist\*(R" table on queries
+that refer to the products table, type '/' and then 'info product'.
+.PP
+The filter text can actually be any Perl regular expression, but of course a
+literal string like 'product' works fine as a regular expression.
+.PP
+Behind the scenes innotop compiles the quick-filter into a specially tagged
+filter that is otherwise like any other filter. It just isn't saved to the
+configuration file.
+.PP
+To clear quick\-filters, press the '\e' key and innotop will clear them all at
+once.
+.Sh "\s-1SORTING\s0"
+.IX Subsection "SORTING"
+innotop has sensible built-in defaults to sort the most important rows to the
+top of the table. Like anything else in innotop, you can customize how any
+table is sorted.
+.PP
+To start the sort dialog, start the \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R" with the '^' key, choose a
+table if necessary, and press the 's' key. You'll see a list of columns you can
+use in the sort expression and the current sort expression, if any. Enter a
+list of columns by which you want to sort and press Enter. If you want to
+reverse sort, prefix the column name with a minus sign. For example, if you
+want to sort by column a ascending, then column b descending, type 'a \-b'. You
+can also explicitly add a + in front of columns you want to sort ascending, but
+it's not required.
+.PP
+Some modes have keys mapped to open this dialog directly, and to quickly reverse
+sort direction. Press '?' as usual to see which keys are mapped in any mode.
+.Sh "\s-1GROUPING\s0"
+.IX Subsection "GROUPING"
+innotop can group, or aggregate, rows together (I use the terms
+interchangeably). This is quite similar to an \s-1SQL\s0 \s-1GROUP\s0 \s-1BY\s0 clause. You can
+specify to group on certain columns, or if you don't specify any, the entire set
+of rows is treated as one group. This is quite like \s-1SQL\s0 so far, but unlike \s-1SQL\s0,
+you can also select un-grouped columns. innotop actually aggregates every
+column. If you don't explicitly specify a grouping function, the default is
+\&'first'. This is basically a convenience so you don't have to specify an
+aggregate function for every column you want in the result.
+.PP
+You can quickly toggle grouping on a table with the '=' key, which toggles its
+aggregate property. This property doesn't persist to the config file.
+.PP
+The columns by which the table is grouped are specified in its group_by
+property. When you turn grouping on, innotop places the group_by columns at the
+far left of the table, even if they're not supposed to be visible. The rest of
+the visible columns appear in order after them.
+.PP
+Two tables have default group_by lists and a count column built in:
+\&\*(L"processlist\*(R" and \*(L"innodb_transactions\*(R". The grouping is by connection
+and status, so you can quickly see how many queries or transactions are in a
+given status on each server you're monitoring. The time columns are aggregated
+as a sum; other columns are left at the default 'first' aggregation.
+.PP
+By default, the table shown in \*(L"S: Variables & Status\*(R" mode also uses
+grouping so you can monitor variables and status across many servers. The
+default aggregation function in this mode is 'avg'.
+.PP
+Valid grouping functions are defined in the \f(CW%agg_funcs\fR hash. They include
+.IP "first" 4
+.IX Item "first"
+Returns the first element in the group.
+.IP "count" 4
+.IX Item "count"
+Returns the number of elements in the group, including undefined elements, much
+like \s-1SQL\s0's \s-1COUNT\s0(*).
+.IP "avg" 4
+.IX Item "avg"
+Returns the average of defined elements in the group.
+.IP "sum" 4
+.IX Item "sum"
+Returns the sum of elements in the group.
+.PP
+Here's an example of grouping at work. Suppose you have a very busy server with
+hundreds of open connections, and you want to see how many connections are in
+what status. Using the built-in grouping rules, you can press 'Q' to enter
+\&\*(L"Q: Query List\*(R" mode. Press '=' to toggle grouping (if necessary, select the
+\&\*(L"processlist\*(R" table when prompted).
+.PP
+Your display might now look like the following:
+.PP
+.Vb 1
+\& Query List (? for help) localhost, 32:33, 0.11 QPS, 1 thd, 5.0.38\-log
+.Ve
+.PP
+.Vb 5
+\& CXN Cmd Cnt ID User Host Time Query
+\& localhost Query 49 12933 webusr localhost 19:38 SELECT * FROM
+\& localhost Sending Da 23 2383 webusr localhost 12:43 SELECT col1,
+\& localhost Sleep 120 140 webusr localhost 5:18:12
+\& localhost Statistics 12 19213 webusr localhost 01:19 SELECT * FROM
+.Ve
+.PP
+That's actually quite a worrisome picture. You've got a lot of idle connections
+(Sleep), and some connections executing queries (Query and Sending Data).
+That's okay, but you also have a lot in Statistics status, collectively spending
+over a minute. That means the query optimizer is having a really hard time
+optimizing your statements. Something is wrong; it should normally take
+milliseconds to optimize queries. You might not have seen this pattern if you
+didn't look at your connections in aggregate. (This is a made-up example, but
+it can happen in real life).
+.Sh "\s-1PIVOTING\s0"
+.IX Subsection "PIVOTING"
+innotop can pivot a table for more compact display, similar to a Pivot Table in
+a spreadsheet (also known as a crosstab). Pivoting a table makes columns into
+rows. Assume you start with this table:
+.PP
+.Vb 4
+\& foo bar
+\& === ===
+\& 1 3
+\& 2 4
+.Ve
+.PP
+After pivoting, the table will look like this:
+.PP
+.Vb 4
+\& name set0 set1
+\& ==== ==== ====
+\& foo 1 2
+\& bar 3 4
+.Ve
+.PP
+To get reasonable results, you might need to group as well as pivot.
+innotop currently does this for \*(L"S: Variables & Status\*(R" mode.
+.Sh "\s-1COLORS\s0"
+.IX Subsection "COLORS"
+By default, innotop highlights rows with color so you can see at a glance which
+rows are more important. You can customize the colorization rules and add your
+own to any table. Open the table editor with the '^' key, choose a table if
+needed, and press 'o' to open the color editor dialog.
+.PP
+The color editor dialog displays the rules applied to the table, in the order
+they are evaluated. Each row is evaluated against each rule to see if the rule
+matches the row; if it does, the row gets the specified color, and no further
+rules are evaluated. The rules look like the following:
+.PP
+.Vb 9
+\& state eq Locked black on_red
+\& cmd eq Sleep white
+\& user eq system user white
+\& cmd eq Connect white
+\& cmd eq Binlog Dump white
+\& time > 600 red
+\& time > 120 yellow
+\& time > 60 green
+\& time > 30 cyan
+.Ve
+.PP
+This is the default rule set for the \*(L"processlist\*(R" table. In order of
+priority, these rules make locked queries black on a red background, \*(L"gray out\*(R"
+connections from replication and sleeping queries, and make queries turn from
+cyan to red as they run longer.
+.PP
+(For some reason, the \s-1ANSI\s0 color code \*(L"white\*(R" is actually a light gray. Your
+terminal's display may vary; experiment to find colors you like).
+.PP
+You can use keystrokes to move the rules up and down, which re-orders their
+priority. You can also delete rules and add new ones. If you add a new rule,
+innotop prompts you for the column, an operator for the comparison, a value
+against which to compare the column, and a color to assign if the rule matches.
+There is auto-completion and prompting at each step.
+.PP
+The value in the third step needs to be correctly quoted. innotop does not try
+to quote the value because it doesn't know whether it should treat the value as
+a string or a number. If you want to compare the column against a string, as
+for example in the first rule above, you should enter 'Locked' surrounded by
+quotes. If you get an error message about a bareword, you probably should have
+quoted something.
+.Sh "\s-1EXPRESSIONS\s0"
+.IX Subsection "EXPRESSIONS"
+Expressions are at the core of how innotop works, and are what enables you to
+extend innotop as you wish. Recall the table lifecycle explained in
+\&\*(L"\s-1TABLES\s0\*(R". Expressions are used in the earliest step, where it extracts
+values from a data source to form rows.
+.PP
+It does this by calling a subroutine for each column, passing it the source data
+set, a set of current values, and a set of previous values. These are all
+needed so the subroutine can calculate things like the difference between this
+tick and the previous tick.
+.PP
+The subroutines that extract the data from the set are compiled from
+expressions. This gives significantly more power than just naming the values to
+fill the columns, because it allows the column's value to be calculated from
+whatever data is necessary, but avoids the need to write complicated and lengthy
+Perl code.
+.PP
+innotop begins with a string of text that can look as simple as a value's name
+or as complicated as a full-fledged Perl expression. It looks at each
+\&'bareword' token in the string and decides whether it's supposed to be a key
+into the \f(CW$set\fR hash. A bareword is an unquoted value that isn't already
+surrounded by code-ish things like dollar signs or curly brackets. If innotop
+decides that the bareword isn't a function or other valid Perl code, it converts
+it into a hash access. After the whole string is processed, innotop compiles a
+subroutine, like this:
+.PP
+.Vb 5
+\& sub compute_column_value {
+\& my ( $set, $cur, $pre ) = @_;
+\& my $val = # EXPANDED STRING GOES HERE
+\& return $val;
+\& }
+.Ve
+.PP
+Here's a concrete example, taken from the header table \*(L"q_header\*(R" in \*(L"Q: Query List\*(R" mode. This expression calculates the qps, or Queries Per Second,
+column's values, from the values returned by \s-1SHOW\s0 \s-1STATUS:\s0
+.PP
+.Vb 1
+\& Questions/Uptime_hires
+.Ve
+.PP
+innotop decides both words are barewords, and transforms this expression into
+the following Perl code:
+.PP
+.Vb 1
+\& $set\->{Questions}/$set\->{Uptime_hires}
+.Ve
+.PP
+When surrounded by the rest of the subroutine's code, this is executable Perl
+that calculates a high-resolution queries-per-second value.
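+.PP
+As another illustration (this particular column is not built in), you could
+compute a key cache miss rate from two standard status variables with the
+expression
+.PP
+.Vb 1
+\& Key_reads/Key_read_requests
+.Ve
+.PP
+which innotop would expand into
+.PP
+.Vb 1
+\& $set\->{Key_reads}/$set\->{Key_read_requests}
+.Ve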
+.PP
+The arguments to the subroutine are named \f(CW$set\fR, \f(CW$cur\fR, and \f(CW$pre\fR. In most cases,
+\&\f(CW$set\fR and \f(CW$cur\fR will be the same values. However, if \*(L"status_inc\*(R" is set, \f(CW$cur\fR
+will not be the same as \f(CW$set\fR, because \f(CW$set\fR will already contain values that are
+the incremental difference between \f(CW$cur\fR and \f(CW$pre\fR.
+.PP
+Every column in innotop is computed by subroutines compiled in the same fashion.
+There is no difference between innotop's built-in columns and user-defined
+columns. This keeps things consistent and predictable.
+.Sh "\s-1TRANSFORMATIONS\s0"
+.IX Subsection "TRANSFORMATIONS"
+Transformations change how a value is rendered. For example, they can take a
+number of seconds and display it in H:M:S format. The following transformations
+are defined:
+.IP "commify" 4
+.IX Item "commify"
+Adds commas to large numbers every three decimal places.
+.IP "dulint_to_int" 4
+.IX Item "dulint_to_int"
+Accepts two unsigned integers and converts them into a single longlong. This is
+useful for certain operations with InnoDB, which uses two integers as
+transaction identifiers, for example.
+.IP "no_ctrl_char" 4
+.IX Item "no_ctrl_char"
+Removes quoted control characters from the value. This is affected by the
+\&\*(L"charset\*(R" configuration variable.
+.Sp
+This transformation only operates within quoted strings, for example, the
+values assigned in the \s-1SET\s0 clause of an \s-1UPDATE\s0 statement. It will not alter
+the \s-1UPDATE\s0 statement,
+but will collapse the quoted string to [\s-1BINARY\s0] or [\s-1TEXT\s0], depending on the
+charset.
+.IP "percent" 4
+.IX Item "percent"
+Converts a number to a percentage by multiplying it by 100, formatting it with
+\&\*(L"num_digits\*(R" digits after the decimal point, and optionally adding a percent
+sign (see \*(L"show_percent\*(R").
+.IP "secs_to_time" 4
+.IX Item "secs_to_time"
+Formats a number of seconds as time in days+hours:minutes:seconds format.
+.IP "set_precision" 4
+.IX Item "set_precision"
+Formats numbers with \*(L"num_digits\*(R" number of digits after the decimal point.
+.IP "shorten" 4
+.IX Item "shorten"
+Formats a number as a unit of 1024 (k/M/G/T) and with \*(L"num_digits\*(R" number of
+digits after the decimal point.
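+.PP
+To make a few of these concrete, here are some illustrative before-and-after
+values, assuming \*(L"num_digits\*(R" is set to 2:
+.PP
+.Vb 3
+\& commify       1234567  becomes  1,234,567
+\& secs_to_time  93784    becomes  1+02:03:04
+\& shorten       1234567  becomes  1.18M
+.Ve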
+.Sh "\s-1TABLE\s0 \s-1EDITOR\s0"
+.IX Subsection "TABLE EDITOR"
+The innotop table editor lets you customize tables with keystrokes. You start
+the table editor with the '^' key. If there's more than one table on the
+screen, it will prompt you to choose one of them. Once you do, innotop will
+show you something like this:
+.PP
+.Vb 1
+\& Editing table definition for Buffer Pool. Press ? for help, q to quit.
+.Ve
+.PP
+.Vb 9
+\& name hdr label src
+\& cxn CXN Connection from which cxn
+\& buf_pool_size Size Buffer pool size IB_bp_buf_poo
+\& buf_free Free Bufs Buffers free in the b IB_bp_buf_fre
+\& pages_total Pages Pages total IB_bp_pages_t
+\& pages_modified Dirty Pages Pages modified (dirty IB_bp_pages_m
+\& buf_pool_hit_rate Hit Rate Buffer pool hit rate IB_bp_buf_poo
+\& total_mem_alloc Memory Total memory allocate IB_bp_total_m
+\& add_pool_alloc Add\(aql Pool Additonal pool alloca IB_bp_add_poo
+.Ve
+.PP
+The first line shows which table you're editing, and reminds you again to press
+\&'?' for a list of key mappings. The rest is a tabular representation of the
+table's columns, because that's likely what you're trying to edit. However, you
+can edit more than just the table's columns; this screen can start the filter
+editor, color rule editor, and more.
+.PP
+Each row in the display shows a single column in the table you're editing, along
+with a couple of its properties such as its header and source expression (see
+\&\*(L"\s-1EXPRESSIONS\s0\*(R").
+.PP
+The key mappings are Vim\-style, as in many other places. Pressing 'j' and 'k'
+moves the highlight up or down. You can then (d)elete or (e)dit the highlighted
+column. You can also (a)dd a column to the table. This actually just activates
+one of the columns already defined for the table; it prompts you to choose from
+among the columns available but not currently displayed. Finally, you can
+re-order the columns with the '+' and '\-' keys.
+.PP
+You can do more than just edit the columns with the table editor; you can also
+edit other properties, such as the table's sort expression and group-by
+expression. Press '?' to see the full list, of course.
+.PP
+If you want to really customize and create your own column, as opposed to just
+activating a built-in one that's not currently displayed, press the (n)ew key,
+and innotop will prompt you for the information it needs:
+.IP "\(bu" 4
+The column name: this needs to be a word without any funny characters, e.g. just
+letters, numbers and underscores.
+.IP "\(bu" 4
+The column header: this is the label that appears at the top of the column, in
+the table header. This can have spaces and funny characters, but be careful not
+to make it too wide and waste space on\-screen.
+.IP "\(bu" 4
+The column's data source: this is an expression that determines what data from
+the source (see \*(L"\s-1TABLES\s0\*(R") innotop will put into the column. This can just be
+the name of an item in the source, or it can be a more complex expression, as
+described in \*(L"\s-1EXPRESSIONS\s0\*(R".
+.PP
+Once you've entered the required data, your table has a new column. There is no
+difference between this column and the built-in ones; it can have all the same
+properties and behaviors. innotop will write the column's definition to the
+configuration file, so it will persist across sessions.
+.PP
+Here's an example: suppose you want to track how many times your slaves have
+retried transactions. According to the MySQL manual, the
+Slave_retried_transactions status variable gives you that data: \*(L"The total
+number of times since startup that the replication slave \s-1SQL\s0 thread has retried
+transactions. This variable was added in version 5.0.4.\*(R" This is appropriate to
+add to the \*(L"slave_sql_status\*(R" table.
+.PP
+To add the column, switch to the replication-monitoring mode with the 'M' key,
+and press the '^' key to start the table editor. When prompted, choose
+slave_sql_status as the table, then press 'n' to create the column. Type
+\&'retries' as the column name, 'Retries' as the column header, and
+\&'Slave_retried_transactions' as the source. Now the column is created, and you
+see the table editor screen again. Press 'q' to exit the table editor, and
+you'll see your column at the end of the table.
+.SH "VARIABLE SETS"
+.IX Header "VARIABLE SETS"
+Variable sets are used in \*(L"S: Variables & Status\*(R" mode to make it easier
+to define which variables you want to monitor. Behind the scenes they are compiled to a
+list of expressions, and then into a column list so they can be treated just
+like columns in any other table, in terms of data extraction and
+transformations. However, you're protected from the tedious details by a syntax
+that ought to feel very natural to you: a \s-1SQL\s0 \s-1SELECT\s0 list.
+.PP
+The data source for variable sets, and indeed the entire S mode, is the
+combination of \s-1SHOW\s0 \s-1STATUS\s0, \s-1SHOW\s0 \s-1VARIABLES\s0, and \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0. Imagine
+that you had a huge table with one column per variable returned from those
+statements. That's the data source for variable sets. You can now query this
+data source just like you'd expect. For example:
+.PP
+.Vb 1
+\& Questions, Uptime, Questions/Uptime as QPS
+.Ve
+.PP
+Behind the scenes innotop will split that variable set into three expressions,
+compile them and turn them into a table definition, then extract as usual. This
+becomes a \*(L"variable set,\*(R" or a \*(L"list of variables you want to monitor.\*(R"
+.PP
+innotop lets you name and save your variable sets, and writes them to the
+configuration file. You can choose which variable set you want to see with the
+\&'c' key, or activate the next and previous sets with the '>' and '<' keys.
+There are many built-in variable sets as well, which should give you a good
+start for creating your own. Press 'e' to edit the current variable set, or
+just to see how it's defined. To create a new one, just press 'c' and type its
+name.
+.PP
+You may want to use some of the functions listed in \*(L"\s-1TRANSFORMATIONS\s0\*(R" to help
+format the results. In particular, \*(L"set_precision\*(R" is often useful to limit
+the number of digits you see. Extending the above example, here's how:
+.PP
+.Vb 1
+\& Questions, Uptime, set_precision(Questions/Uptime) as QPS
+.Ve
+.PP
+Actually, this still needs a little more work. If your \*(L"interval\*(R" is less
+than one second, you might be dividing by zero because Uptime is incremental in
+this mode by default. Instead, use Uptime_hires:
+.PP
+.Vb 1
+\& Questions, Uptime, set_precision(Questions/Uptime_hires) as QPS
+.Ve
+.PP
+This example is simple, but it shows how easy it is to choose which variables
+you want to monitor.
+.SH "PLUGINS"
+.IX Header "PLUGINS"
+innotop has a simple but powerful plugin mechanism by which you can extend
+or modify its existing functionality, and add new functionality. innotop's
+plugin functionality is event\-based: plugins register themselves to be called
+when events happen. They then have a chance to influence the event.
+.PP
+An innotop plugin is a Perl module placed in innotop's \*(L"plugin_dir\*(R"
+directory. On \s-1UNIX\s0 systems, you can place a symbolic link to the module instead
+of putting the actual file there. innotop automatically discovers the file. If
+there is a corresponding entry in the \*(L"plugins\*(R" configuration file section,
+innotop loads and activates the plugin.
+.PP
+The module must conform to innotop's plugin interface. Additionally, the source
+code of the module must be written in such a way that innotop can inspect the
+file and determine the package name and description.
+.Sh "Package Source Convention"
+.IX Subsection "Package Source Convention"
+innotop inspects the plugin module's source to determine the Perl package name.
+It looks for a line of the form \*(L"package Foo;\*(R" and if found, considers the
+plugin's package name to be Foo. Of course the package name can be any valid Perl
+package name, with double colons and so on.
+.PP
+It also looks for a description in the source code, to make the plugin editor
+more human\-friendly. The description is a comment line of the form \*(L"#
+description: Foo\*(R", where \*(L"Foo\*(R" is the text innotop will consider to be the
+plugin's description.
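+.PP
+A minimal sketch of such an inspection, assuming nothing about innotop\(aqs
+actual parsing code, might look like this in Perl:
+.PP
+.Vb 10
+\& # Hypothetical illustration: scan a plugin file for its package name
+\& # and \(aq# description:\(aq comment.
+\& my $file = \(aqMyPlugin.pm\(aq; # hypothetical path
+\& my ( $pkg, $desc );
+\& open my $fh, "<", $file or die "Cannot open $file: $!";
+\& while ( my $line = <$fh> ) {
+\&    $pkg = $1 if $line =~ m/^package +([A-Za-z0-9_:]+) *;/;
+\&    $desc = $1 if $line =~ m/^ *# *description: *(.+)/;
+\& }
+\& close $fh;
+.Ve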
+.Sh "Plugin Interface"
+.IX Subsection "Plugin Interface"
+The innotop plugin interface is quite simple: innotop expects the plugin to be
+an object-oriented module it can call certain methods on. The methods are
+.IP "new(%variables)" 4
+.IX Item "new(%variables)"
+This is the plugin's constructor. It is passed a hash of innotop's variables,
+which it can manipulate (see \*(L"Plugin Variables\*(R"). It must return a reference
+to the newly created plugin object.
+.Sp
+At construction time, innotop has only loaded the general configuration and
+created the default built-in variables with their default contents (which is
+quite a lot). Therefore, the state of the program is exactly as in the innotop
+source code, plus the configuration variables from the \*(L"general\*(R" section in
+the config file.
+.Sp
+If your plugin manipulates the variables, it is changing global data, which is
+shared by innotop and all plugins. Plugins are loaded in the order they're
+listed in the config file. Your plugin may load before or after another plugin,
+so there is a potential for conflict or interaction between plugins if they
+modify data other plugins use or modify.
+.IP "\fIregister_for_events()\fR" 4
+.IX Item "register_for_events()"
+This method must return a list of events in which the plugin is interested, if
+any. See \*(L"Plugin Events\*(R" for the defined events. If the plugin returns an
+event that's not defined, the event is ignored.
+.IP "event handlers" 4
+.IX Item "event handlers"
+The plugin must implement a method named the same as each event for which it has
+registered. In other words, if the plugin returns qw(foo bar) from
+\&\fIregister_for_events()\fR, it must have \fIfoo()\fR and \fIbar()\fR methods. These methods are
+callbacks for the events. See \*(L"Plugin Events\*(R" for more details about each
+event.
+.Sh "Plugin Variables"
+.IX Subsection "Plugin Variables"
+The plugin's constructor is passed a hash of innotop's variables, which it can
+manipulate. It is probably a good idea if the plugin object saves a copy of it
+for later use. The variables are defined in the innotop variable
+\&\f(CW%pluggable_vars\fR, and are as follows:
+.IP "action_for" 4
+.IX Item "action_for"
+A hashref of key mappings. These are innotop's global hot\-keys.
+.IP "agg_funcs" 4
+.IX Item "agg_funcs"
+A hashref of functions that can be used for grouping. See \*(L"\s-1GROUPING\s0\*(R".
+.IP "config" 4
+.IX Item "config"
+The global configuration hash.
+.IP "connections" 4
+.IX Item "connections"
+A hashref of connection specifications. These are just specifications of how to
+connect to a server.
+.IP "dbhs" 4
+.IX Item "dbhs"
+A hashref of innotop's database connections. These are actual \s-1DBI\s0 connection
+objects.
+.IP "filters" 4
+.IX Item "filters"
+A hashref of filters applied to table rows. See \*(L"\s-1FILTERS\s0\*(R" for more.
+.IP "modes" 4
+.IX Item "modes"
+A hashref of modes. See \*(L"\s-1MODES\s0\*(R" for more.
+.IP "server_groups" 4
+.IX Item "server_groups"
+A hashref of server groups. See \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R".
+.IP "tbl_meta" 4
+.IX Item "tbl_meta"
+A hashref of innotop's table meta\-data, with one entry per table (see
+\&\*(L"\s-1TABLES\s0\*(R" for more information).
+.IP "trans_funcs" 4
+.IX Item "trans_funcs"
+A hashref of transformation functions. See \*(L"\s-1TRANSFORMATIONS\s0\*(R".
+.IP "var_sets" 4
+.IX Item "var_sets"
+A hashref of variable sets. See \*(L"\s-1VARIABLE\s0 \s-1SETS\s0\*(R".
+.Sh "Plugin Events"
+.IX Subsection "Plugin Events"
+Each event is defined somewhere in the innotop source code. When innotop runs
+that code, it executes the callback function for each plugin that expressed its
+interest in the event. innotop passes some data for each event. The events are
+defined in the \f(CW%event_listener_for\fR variable, and are as follows:
+.ie n .IP "extract_values($set, $cur\fR, \f(CW$pre\fR, \f(CW$tbl)" 4
+.el .IP "extract_values($set, \f(CW$cur\fR, \f(CW$pre\fR, \f(CW$tbl\fR)" 4
+.IX Item "extract_values($set, $cur, $pre, $tbl)"
+This event occurs inside the function that extracts values from a data source.
+The arguments are the set of values, the current values, the previous values,
+and the table name.
+.IP "set_to_tbl" 4
+.IX Item "set_to_tbl"
+Events are defined at many places in this subroutine, which is responsible for
+turning an arrayref of hashrefs into an arrayref of lines that can be printed to
+the screen. The events all pass the same data: an arrayref of rows and the name
+of the table being created. The events are set_to_tbl_pre_filter,
+set_to_tbl_pre_sort, set_to_tbl_pre_group, set_to_tbl_pre_colorize,
+set_to_tbl_pre_transform, set_to_tbl_pre_pivot, set_to_tbl_pre_create,
+set_to_tbl_post_create.
+.IP "draw_screen($lines)" 4
+.IX Item "draw_screen($lines)"
+This event occurs inside the subroutine that prints the lines to the screen.
+\&\f(CW$lines\fR is an arrayref of strings.
+.Sh "Simple Plugin Example"
+.IX Subsection "Simple Plugin Example"
+The easiest way to explain the plugin functionality is probably with a simple
+example. The following module adds a column to the beginning of every table and
+sets its value to 1.
+.PP
+.Vb 2
+\& use strict;
+\& use warnings FATAL => \(aqall\(aq;
+.Ve
+.PP
+.Vb 2
+\& package Innotop::Plugin::Example;
+\& # description: Adds an \(aqexample\(aq column to every table
+.Ve
+.PP
+.Vb 4
+\& sub new {
+\& my ( $class, %vars ) = @_;
+\& # Store reference to innotop\(aqs variables in $self
+\& my $self = bless { %vars }, $class;
+.Ve
+.PP
+.Vb 11
+\& # Design the example column
+\& my $col = {
+\& hdr => \(aqExample\(aq,
+\& just => \(aq\(aq,
+\& dec => 0,
+\& num => 1,
+\& label => \(aqExample\(aq,
+\& src => \(aqexample\(aq, # Get data from this column in the data source
+\& tbl => \(aq\(aq,
+\& trans => [],
+\& };
+.Ve
+.PP
+.Vb 8
+\& # Add the column to every table.
+\& my $tbl_meta = $vars{tbl_meta};
+\& foreach my $tbl ( values %$tbl_meta ) {
+\& # Add the column to the list of defined columns
+\& $tbl\->{cols}\->{example} = $col;
+\& # Add the column to the list of visible columns
+\& unshift @{$tbl\->{visible}}, \(aqexample\(aq;
+\& }
+.Ve
+.PP
+.Vb 3
+\& # Be sure to return a reference to the object.
+\& return $self;
+\& }
+.Ve
+.PP
+.Vb 5
+\& # I\(aqd like to be called when a data set is being rendered into a table, please.
+\& sub register_for_events {
+\& my ( $self ) = @_;
+\& return qw(set_to_tbl_pre_filter);
+\& }
+.Ve
+.PP
+.Vb 8
+\& # This method will be called when the event fires.
+\& sub set_to_tbl_pre_filter {
+\& my ( $self, $rows, $tbl ) = @_;
+\& # Set the example column\(aqs data source to the value 1.
+\& foreach my $row ( @$rows ) {
+\& $row\->{example} = 1;
+\& }
+\& }
+.Ve
+.PP
+.Vb 1
+\& 1;
+.Ve
+.Sh "Plugin Editor"
+.IX Subsection "Plugin Editor"
+The plugin editor lets you view the plugins innotop discovered and activate or
+deactivate them. Press '$' from any mode to start the configuration editor,
+then press the 'p' key to start the plugin editor. You'll see
+a list of plugins innotop discovered. You can use the 'j' and 'k' keys to move
+the highlight to the desired one, then press the * key to toggle it active or
+inactive. Exit the editor and restart innotop for the changes to take effect.
+.SH "SQL STATEMENTS"
+.IX Header "SQL STATEMENTS"
+innotop uses a limited set of \s-1SQL\s0 statements to retrieve data from MySQL for
+display. The statements are customized depending on the server version against
+which they are executed; for example, on MySQL 5 and newer, \s-1INNODB_STATUS\s0
+executes \*(L"\s-1SHOW\s0 \s-1ENGINE\s0 \s-1INNODB\s0 \s-1STATUS\s0\*(R", while on earlier versions it executes
+\&\*(L"\s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0\*(R". The statements are as follows:
+.PP
+.Vb 12
+\& Statement SQL executed
+\& =================== ===============================
+\& INNODB_STATUS SHOW [ENGINE] INNODB STATUS
+\& KILL_CONNECTION KILL
+\& KILL_QUERY KILL QUERY
+\& OPEN_TABLES SHOW OPEN TABLES
+\& PROCESSLIST SHOW FULL PROCESSLIST
+\& SHOW_MASTER_LOGS SHOW MASTER LOGS
+\& SHOW_MASTER_STATUS SHOW MASTER STATUS
+\& SHOW_SLAVE_STATUS SHOW SLAVE STATUS
+\& SHOW_STATUS SHOW [GLOBAL] STATUS
+\& SHOW_VARIABLES SHOW [GLOBAL] VARIABLES
+.Ve
+.SH "DATA SOURCES"
+.IX Header "DATA SOURCES"
+Each time innotop extracts values to create a table (see \*(L"\s-1EXPRESSIONS\s0\*(R" and
+\&\*(L"\s-1TABLES\s0\*(R"), it does so from a particular data source. Largely because of the
+complex data extracted from \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0, this is slightly messy. \s-1SHOW\s0
+\&\s-1INNODB\s0 \s-1STATUS\s0 contains a mixture of single values and repeated values that form
+nested data sets.
+.PP
+Whenever innotop fetches data from MySQL, it adds two extra bits to each set:
+cxn and Uptime_hires. cxn is the name of the connection from which the data
+came. Uptime_hires is a high-resolution version of the server's Uptime status
+variable, which is important if your \*(L"interval\*(R" setting is sub\-second.
+.PP
+Here are the kinds of data sources from which data is extracted:
+.IP "\s-1STATUS_VARIABLES\s0" 4
+.IX Item "STATUS_VARIABLES"
+This is the broadest category, into which the most kinds of data fall. It
+begins with the combination of \s-1SHOW\s0 \s-1STATUS\s0 and \s-1SHOW\s0 \s-1VARIABLES\s0, but other sources
+may be included as needed, for example, \s-1SHOW\s0 \s-1MASTER\s0 \s-1STATUS\s0 and \s-1SHOW\s0 \s-1SLAVE\s0
+\&\s-1STATUS\s0, as well as many of the non-repeated values from \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.IP "\s-1DEADLOCK_LOCKS\s0" 4
+.IX Item "DEADLOCK_LOCKS"
+This data is extracted from the transaction list in the \s-1LATEST\s0 \s-1DETECTED\s0 \s-1DEADLOCK\s0
+section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0. It is nested two levels deep: transactions, then
+locks.
+.IP "\s-1DEADLOCK_TRANSACTIONS\s0" 4
+.IX Item "DEADLOCK_TRANSACTIONS"
+This data is from the transaction list in the \s-1LATEST\s0 \s-1DETECTED\s0 \s-1DEADLOCK\s0
+section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0. It is nested one level deep.
+.IP "\s-1EXPLAIN\s0" 4
+.IX Item "EXPLAIN"
+This data is from the result set returned by \s-1EXPLAIN\s0.
+.IP "\s-1INNODB_TRANSACTIONS\s0" 4
+.IX Item "INNODB_TRANSACTIONS"
+This data is from the \s-1TRANSACTIONS\s0 section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.IP "\s-1IO_THREADS\s0" 4
+.IX Item "IO_THREADS"
+This data is from the list of threads in the \s-1FILE\s0 I/O section of \s-1SHOW\s0 \s-1INNODB\s0
+\&\s-1STATUS\s0.
+.IP "\s-1INNODB_LOCKS\s0" 4
+.IX Item "INNODB_LOCKS"
+This data is from the \s-1TRANSACTIONS\s0 section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 and is nested
+two levels deep.
+.IP "\s-1OPEN_TABLES\s0" 4
+.IX Item "OPEN_TABLES"
+This data is from \s-1SHOW\s0 \s-1OPEN\s0 \s-1TABLES\s0.
+.IP "\s-1PROCESSLIST\s0" 4
+.IX Item "PROCESSLIST"
+This data is from \s-1SHOW\s0 \s-1FULL\s0 \s-1PROCESSLIST\s0.
+.IP "\s-1OS_WAIT_ARRAY\s0" 4
+.IX Item "OS_WAIT_ARRAY"
+This data is from the \s-1SEMAPHORES\s0 section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 and is nested one
+level deep. It comes from the lines that look like this:
+.Sp
+.Vb 1
+\& \-\-Thread 1568861104 has waited at btr0cur.c line 424 ....
+.Ve
+.SH "MYSQL PRIVILEGES"
+.IX Header "MYSQL PRIVILEGES"
+.IP "\(bu" 4
+You must connect to MySQL as a user who has the \s-1SUPER\s0 privilege for many of the
+functions.
+.IP "\(bu" 4
+If you don't have the \s-1SUPER\s0 privilege, you can still run some functions, but you
+won't necessarily see all the same data.
+.IP "\(bu" 4
+You need the \s-1PROCESS\s0 privilege to see the list of currently running queries in Q
+mode.
+.IP "\(bu" 4
+You need special privileges to start and stop slave servers.
+.IP "\(bu" 4
+You need appropriate privileges to create and drop the deadlock tables if needed
+(see \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R").
+.SH "SYSTEM REQUIREMENTS"
+.IX Header "SYSTEM REQUIREMENTS"
+You need Perl to run innotop, of course. You also need a few Perl modules: \s-1DBI\s0,
+DBD::mysql, Term::ReadKey, and Time::HiRes. These should be included with most
+Perl distributions, but in case they are not, I recommend using versions
+distributed with your operating system or Perl distribution, not from \s-1CPAN\s0.
+Term::ReadKey in particular has been known to cause problems if installed from
+\&\s-1CPAN\s0.
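+.PP
+One quick way to check whether the required modules are already installed is to
+try loading them from the command line (this is a generic Perl technique, not
+something innotop\-specific):
+.PP
+.Vb 2
+\& perl \-MDBI \-MDBD::mysql \-MTerm::ReadKey \-MTime::HiRes \-e 1
+\& # Prints nothing if all four modules load; otherwise reports the missing one.
+.Ve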
+.PP
+If you have Term::ANSIColor, innotop will use it to format headers more readably
+and compactly. (Under Microsoft Windows, you also need Win32::Console::ANSI for
+terminal formatting codes to be honored). If you install Term::ReadLine,
+preferably Term::ReadLine::Gnu, you'll get nice auto-completion support.
+.PP
+I run innotop on Gentoo GNU/Linux, Debian and Ubuntu, and I've had feedback from
+people successfully running it on Red Hat, CentOS, Solaris, and Mac \s-1OSX\s0. I
+don't see any reason why it won't work on other UNIX-ish operating systems, but
+I don't know for sure. It also runs on Windows under ActivePerl without
+problem.
+.PP
+I use innotop on MySQL versions 3.23.58, 4.0.27, 4.1.0, 4.1.22, 5.0.26, 5.1.15,
+and 5.2.3. If it doesn't run correctly for you, that is a bug and I hope you
+report it.
+.SH "FILES"
+.IX Header "FILES"
+$HOMEDIR/.innotop is used to store configuration information. Files include the
+configuration file innotop.ini, the core_dump file which contains verbose error
+messages if \*(L"debug\*(R" is enabled, and the plugins/ subdirectory.
+.SH "GLOSSARY OF TERMS"
+.IX Header "GLOSSARY OF TERMS"
+.IP "tick" 4
+.IX Item "tick"
+A tick is a refresh event, when innotop re-fetches data from connections and
+displays it.
+.SH "ACKNOWLEDGEMENTS"
+.IX Header "ACKNOWLEDGEMENTS"
+I'm grateful to the following people for various reasons, and hope I haven't
+forgotten to include anyone:
+.PP
+Allen K. Smith,
+Aurimas Mikalauskas,
+Bartosz Fenski,
+Brian Miezejewski,
+Christian Hammers,
+Cyril Scetbon,
+Dane Miller,
+David Multer,
+Dr. Frank Ullrich,
+Giuseppe Maxia,
+Google.com Site Reliability Engineers,
+Jan Pieter Kunst,
+Jari Aalto,
+Jay Pipes,
+Jeremy Zawodny,
+Johan Idren,
+Kristian Kohntopp,
+Lenz Grimmer,
+Maciej Dobrzanski,
+Michiel Betel,
+MySQL \s-1AB\s0,
+Paul McCullagh,
+Sebastien Estienne,
+Sourceforge.net,
+Steven Kreuzer,
+The Gentoo MySQL Team,
+Trevor Price,
+Yaar Schnitman,
+and probably more people I've neglected to include.
+.PP
+(If I misspelled your name, it's probably because I'm afraid of putting
+international characters into this documentation; earlier versions of Perl might
+not be able to compile it then).
+.SH "COPYRIGHT, LICENSE AND WARRANTY"
+.IX Header "COPYRIGHT, LICENSE AND WARRANTY"
+This program is copyright (c) 2006 Baron Schwartz.
+Feedback and improvements are welcome.
+.PP
+\&\s-1THIS\s0 \s-1PROGRAM\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R" \s-1AND\s0 \s-1WITHOUT\s0 \s-1ANY\s0 \s-1EXPRESS\s0 \s-1OR\s0 \s-1IMPLIED\s0
+\&\s-1WARRANTIES\s0, \s-1INCLUDING\s0, \s-1WITHOUT\s0 \s-1LIMITATION\s0, \s-1THE\s0 \s-1IMPLIED\s0 \s-1WARRANTIES\s0 \s-1OF\s0
+\&\s-1MERCHANTIBILITY\s0 \s-1AND\s0 \s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0.
+.PP
+This program is free software; you can redistribute it and/or modify it under
+the terms of the \s-1GNU\s0 General Public License as published by the Free Software
+Foundation, version 2; \s-1OR\s0 the Perl Artistic License. On \s-1UNIX\s0 and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+.PP
+You should have received a copy of the \s-1GNU\s0 General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, \s-1MA\s0 02111\-1307 \s-1USA\s0.
+.PP
+Execute innotop and press '!' to see this information at any time.
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+Baron Schwartz.
+.SH "BUGS"
+.IX Header "BUGS"
+You can report bugs, ask for improvements, and get other help and support at
+<http://sourceforge.net/projects/innotop>. There are mailing lists, forums,
+a bug tracker, etc. Please use these instead of contacting me directly, as it
+makes my job easier and benefits others if the discussions are permanent and
+public. Of course, if you need to contact me in private, please do.
diff --git a/storage/xtradb/build/debian/additions/msql2mysql.1 b/storage/xtradb/build/debian/additions/msql2mysql.1
new file mode 100644
index 00000000000..8fe05e7415d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/msql2mysql.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+msql2mysql \- MySQL importer for msql style data.
+.SH SYNOPSIS
+msql2mysql [options]
+.SH DESCRIPTION
+This program imports old msql database files.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/my.cnf b/storage/xtradb/build/debian/additions/my.cnf
new file mode 100644
index 00000000000..997523b9c2f
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/my.cnf
@@ -0,0 +1,129 @@
+#
+# The MySQL database server configuration file.
+#
+# You can copy this to one of:
+# - "/etc/mysql/my.cnf" to set global options,
+# - "~/.my.cnf" to set user-specific options.
+#
+# One can use all long options that the program supports.
+# Run program with --help to get a list of available options and with
+# --print-defaults to see which it would actually understand and use.
+#
+# For explanations see
+# http://dev.mysql.com/doc/mysql/en/server-system-variables.html
+
+# This will be passed to all mysql clients
+# It has been reported that passwords should be enclosed with ticks/quotes
+# especially if they contain "#" chars...
+# Remember to edit /etc/mysql/debian.cnf when changing the socket location.
+[client]
+port = 3306
+socket = /var/run/mysqld/mysqld.sock
+
+# Here are entries for some specific programs
+# The following values assume you have at least 32M ram
+
+# This was formerly known as [safe_mysqld]. Both versions are currently parsed.
+[mysqld_safe]
+socket = /var/run/mysqld/mysqld.sock
+nice = 0
+
+[mysqld]
+#
+# * Basic Settings
+#
+user = mysql
+pid-file = /var/run/mysqld/mysqld.pid
+socket = /var/run/mysqld/mysqld.sock
+port = 3306
+basedir = /usr
+datadir = /var/lib/mysql
+tmpdir = /tmp
+language = /usr/share/mysql/english
+skip-external-locking
+#
+# For compatibility to other Debian packages that still use
+# libmysqlclient10 and libmysqlclient12.
+old_passwords = 1
+#
+# Instead of skip-networking the default is now to listen only on
+# localhost, which is more compatible and is not less secure.
+bind-address = 127.0.0.1
+#
+# * Fine Tuning
+#
+key_buffer = 16M
+max_allowed_packet = 16M
+thread_stack = 128K
+thread_cache_size = 8
+# This replaces the startup script and checks MyISAM tables if needed
+# the first time they are touched
+myisam-recover = BACKUP
+#max_connections = 100
+#table_cache = 64
+#thread_concurrency = 10
+#
+# * Query Cache Configuration
+#
+query_cache_limit = 1M
+query_cache_size = 16M
+#
+# * Logging and Replication
+#
+# Both locations get rotated by the cronjob.
+# Be aware that this log type is a performance killer.
+# As of 5.1 you can enable the log at runtime!
+#log_type = FILE
+#general_log = /var/log/mysql/mysql.log
+#
+# Error logging goes to syslog due to /etc/mysql/conf.d/mysqld_safe_syslog.cnf.
+#
+# Here you can see queries with especially long duration
+#log_slow_queries = /var/log/mysql/mysql-slow.log
+#long_query_time = 2
+#log-queries-not-using-indexes
+#
+# The following can be used as easy to replay backup logs or for replication.
+# note: if you are setting up a replication slave, see README.Debian about
+# other settings you may need to change.
+#server-id = 1
+#log_bin = /var/log/mysql/mysql-bin.log
+expire_logs_days = 10
+max_binlog_size = 100M
+#binlog_do_db = include_database_name
+#binlog_ignore_db = include_database_name
+#
+# * InnoDB
+#
+# InnoDB is enabled by default with a 10MB datafile in /var/lib/mysql/.
+# Read the manual for more InnoDB related options. There are many!
+#
+# * Security Features
+#
+# Read the manual, too, if you want chroot!
+# chroot = /var/lib/mysql/
+#
+# For generating SSL certificates I recommend the OpenSSL GUI "tinyca".
+#
+# ssl-ca=/etc/mysql/cacert.pem
+# ssl-cert=/etc/mysql/server-cert.pem
+# ssl-key=/etc/mysql/server-key.pem
+
+
+
+[mysqldump]
+quick
+quote-names
+max_allowed_packet = 16M
+
+[mysql]
+#no-auto-rehash # faster start of mysql but no tab completion
+
+[isamchk]
+key_buffer = 16M
+
+#
+# * IMPORTANT: Additional settings that can override those from this file!
+# The files must end with '.cnf', otherwise they'll be ignored.
+#
+!includedir /etc/mysql/conf.d/
diff --git a/storage/xtradb/build/debian/additions/my_print_defaults.1 b/storage/xtradb/build/debian/additions/my_print_defaults.1
new file mode 100644
index 00000000000..ebef4157016
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/my_print_defaults.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+my_print_defaults \- MySQL helper script that prints defaults.
+.SH SYNOPSIS
+my_print_defaults [options]
+.SH DESCRIPTION
+Prints all arguments that are given to some program using the default files.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisam_ftdump.1 b/storage/xtradb/build/debian/additions/myisam_ftdump.1
new file mode 100644
index 00000000000..e2de358efcc
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisam_ftdump.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisam_ftdump \- Dumps full text tables.
+.SH SYNOPSIS
+myisam_ftdump [options]
+.SH DESCRIPTION
+Dumps information and contents of full text tables.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisamchk.1 b/storage/xtradb/build/debian/additions/myisamchk.1
new file mode 100644
index 00000000000..fe7f34961e0
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisamchk.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisamchk \- Checks MySQL myisam type databases.
+.SH SYNOPSIS
+myisamchk [options]
+.SH DESCRIPTION
+Description, check and repair of MyISAM tables.
+Used without options, all tables given on the command line will be checked for errors.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisamlog.1 b/storage/xtradb/build/debian/additions/myisamlog.1
new file mode 100644
index 00000000000..959d547df94
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisamlog.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisamlog \- MySQL helper script.
+.SH SYNOPSIS
+myisamlog [options]
+.SH DESCRIPTION
+Function unknown. Mail to ch@debian.org.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisampack.1 b/storage/xtradb/build/debian/additions/myisampack.1
new file mode 100644
index 00000000000..93168304a17
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisampack.1
@@ -0,0 +1,19 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisampack \- Compresses MySQL database files.
+.SH SYNOPSIS
+myisampack [options]
+.SH DESCRIPTION
+Pack a MyISAM-table to take much less space.
+Keys are not updated; you must run myisamchk -rq on the datafile
+afterwards to update the keys.
+You should give the .MYI file as the filename argument.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql-server.lintian-overrides b/storage/xtradb/build/debian/additions/mysql-server.lintian-overrides
new file mode 100644
index 00000000000..ae589c2472e
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql-server.lintian-overrides
@@ -0,0 +1,2 @@
+W: mysql-dfsg source: maintainer-script-lacks-debhelper-token debian/percona-xtradb-server.postinst
+W: percona-xtradb-server: possible-bashism-in-maintainer-script postinst:68 'p{("a".."z","A".."Z",0..9)[int(rand(62))]}'
diff --git a/storage/xtradb/build/debian/additions/mysql_config.1 b/storage/xtradb/build/debian/additions/mysql_config.1
new file mode 100644
index 00000000000..88095e22b9e
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_config.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_config \- MySQL compile settings.
+.SH SYNOPSIS
+mysql_config [options]
+.SH DESCRIPTION
+This program is only useful for people who want to compile against
+libmysqlclient.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_convert_table_format.1 b/storage/xtradb/build/debian/additions/mysql_convert_table_format.1
new file mode 100644
index 00000000000..3c23581df43
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_convert_table_format.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_convert_table_format \- MySQL table converter.
+.SH SYNOPSIS
+mysql_convert_table_format [options]
+.SH DESCRIPTION
+Conversion of MySQL tables to other table types.
+If no tables have been specified, all tables in the database will be converted.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_find_rows.1 b/storage/xtradb/build/debian/additions/mysql_find_rows.1
new file mode 100644
index 00000000000..35a70b1f960
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_find_rows.1
@@ -0,0 +1,18 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_find_rows \- MySQL shell script for searching in update logs.
+.SH SYNOPSIS
+mysql_find_rows [options]
+.SH DESCRIPTION
+Prints all SQL queries that match a regexp or contain a 'use
+database' or 'set ..' command to stdout. A SQL query may contain
+newlines. This is useful for finding things in a MySQL update log.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_fix_extensions.1 b/storage/xtradb/build/debian/additions/mysql_fix_extensions.1
new file mode 100644
index 00000000000..3f0a028ca3f
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_fix_extensions.1
@@ -0,0 +1,18 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_fix_extensions \- Corrects MySQL database file names.
+.SH SYNOPSIS
+mysql_fix_extensions <datadir>
+.SH DESCRIPTION
+Makes .frm files lowercase and .MYI/.MYD/.ISM/.ISD files uppercase,
+which is useful when datafiles are copied from Windows.
+Does not work with RAID, InnoDB or BDB tables.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (8)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_install_db.1 b/storage/xtradb/build/debian/additions/mysql_install_db.1
new file mode 100644
index 00000000000..11f1f2967a2
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_install_db.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_install_db \- MySQL helper program.
+.SH SYNOPSIS
+mysql_install_db [options]
+.SH DESCRIPTION
+This program is normally not needed by any user.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_secure_installation.1 b/storage/xtradb/build/debian/additions/mysql_secure_installation.1
new file mode 100644
index 00000000000..d65b7f5d09d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_secure_installation.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_secure_installation \- Secures the MySQL access control lists.
+.SH SYNOPSIS
+mysql_secure_installation [options]
+.SH DESCRIPTION
+This interactive program suggests changes, such as removing anonymous users, that
+are supposed to make your installation more secure.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (8)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_setpermission.1 b/storage/xtradb/build/debian/additions/mysql_setpermission.1
new file mode 100644
index 00000000000..77167e0d58f
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_setpermission.1
@@ -0,0 +1,23 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_setpermission \- Adds MySQL users or changes passwords.
+.SH SYNOPSIS
+mysql_setpermission [options]
+.SH DESCRIPTION
+The permission setter is a little program which can help you add users
+or databases or change passwords in MySQL. Keep in mind that we don't
+check permissions which have already been set in MySQL. So if you can't
+connect to MySQL using the permission you just added, take a look at
+the permissions which have already been set in MySQL.
+
+The permission setter first reads your .my.cnf file in your Home
+directory if it exists.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_tableinfo.1 b/storage/xtradb/build/debian/additions/mysql_tableinfo.1
new file mode 100644
index 00000000000..1de4f5d5943
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_tableinfo.1
@@ -0,0 +1,322 @@
+.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sh \" Subsection heading
+.br
+.if t .Sp
+.ne 5
+.PP
+\fB\\$1\fR
+.PP
+..
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings. \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote. | will give a
+.\" real vertical bar. \*(C+ will give a nicer C++. Capital omega is used to
+.\" do unbreakable dashes and therefore won't be available. \*(C` and \*(C'
+.\" expand to `' in nroff, nothing in troff, for use with C<>.
+.tr \(*W-|\(bv\*(Tr
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+. ds -- \(*W-
+. ds PI pi
+. if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+. if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
+. ds L" ""
+. ds R" ""
+. ds C` ""
+. ds C' ""
+'br\}
+.el\{\
+. ds -- \|\(em\|
+. ds PI \(*p
+. ds L" ``
+. ds R" ''
+'br\}
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
+.\" entries marked with X<> in POD. Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.if \nF \{\
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
+..
+. nr % 0
+. rr F
+.\}
+.\"
+.\" For nroff, turn off justification. Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.hy 0
+.if n .na
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear. Run. Save yourself. No user-serviceable parts.
+. \" fudge factors for nroff and troff
+.if n \{\
+. ds #H 0
+. ds #V .8m
+. ds #F .3m
+. ds #[ \f1
+. ds #] \fP
+.\}
+.if t \{\
+. ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+. ds #V .6m
+. ds #F 0
+. ds #[ \&
+. ds #] \&
+.\}
+. \" simple accents for nroff and troff
+.if n \{\
+. ds ' \&
+. ds ` \&
+. ds ^ \&
+. ds , \&
+. ds ~ ~
+. ds /
+.\}
+.if t \{\
+. ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+. ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+. ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+. ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+. ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+. ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+. \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+. \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+. \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+. ds : e
+. ds 8 ss
+. ds o a
+. ds d- d\h'-1'\(ga
+. ds D- D\h'-1'\(hy
+. ds th \o'bp'
+. ds Th \o'LP'
+. ds ae ae
+. ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "MYSQL_TABLEINFO 1"
+.TH MYSQL_TABLEINFO 1 "2003-04-05" "perl v5.8.0" "User Contributed Perl Documentation"
+.SH "NAME"
+mysql_tableinfo \- creates and populates information tables with
+the output of SHOW DATABASES, SHOW TABLES (or SHOW TABLE STATUS),
+SHOW COLUMNS and SHOW INDEX.
+.PP
+This is version 1.1.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\& mysql_tableinfo [OPTIONS] database_to_write [database_like_wild] [table_like_wild]
+.Ve
+.PP
+.Vb 2
+\& Do not backquote (``) database_to_write,
+\& and do not quote ('') database_like_wild or table_like_wild
+.Ve
+.PP
+.Vb 1
+\& Examples:
+.Ve
+.PP
+.Vb 1
+\& mysql_tableinfo info
+.Ve
+.PP
+.Vb 1
+\& mysql_tableinfo info this_db
+.Ve
+.PP
+.Vb 1
+\& mysql_tableinfo info %a% b%
+.Ve
+.PP
+.Vb 1
+\& mysql_tableinfo info --clear-only
+.Ve
+.PP
+.Vb 1
+\& mysql_tableinfo info --col --idx --table-status
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+mysql_tableinfo asks a MySQL server for information about its
+databases, tables, table columns and indexes, and stores this
+in tables called `db`, `tbl` (or `tbl_status`), `col`, `idx`
+(with an optional prefix specified with \-\-prefix).
+After that, you can query these information tables, for example
+to build your admin scripts with \s-1SQL\s0 queries, like
+.PP
+\&\s-1SELECT\s0 \s-1CONCAT\s0(\*(L"\s-1CHECK\s0 \s-1TABLE\s0 \*(R",`database`,\*(L".\*(R",`table`,\*(L" \s-1EXTENDED\s0;\*(R")
+\&\s-1FROM\s0 info.tbl \s-1WHERE\s0 ... ;
+.PP
+as people usually do with some other \s-1RDBMS\s0
+(note: to increase the speed of your queries on the info tables,
+you may add some index on them).
+.PP
+The database_like_wild and table_like_wild arguments instruct the program
+to gather information only about databases and tables
+whose names match these patterns. If the info
+tables already exist, their rows matching the patterns are simply
+deleted and replaced by the new ones. That is,
+old rows not matching the patterns are not touched.
+If the database_like_wild and table_like_wild arguments
+are not specified on the command-line they default to \*(L"%\*(R".
+.PP
+The program :
+.PP
+\&\- does \s-1CREATE\s0 \s-1DATABASE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write
+where database_to_write is the database name specified on the command\-line.
+.PP
+\&\- does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`db`
+.PP
+\&\- fills database_to_write.`db` with the output of
+\&\s-1SHOW\s0 \s-1DATABASES\s0 \s-1LIKE\s0 database_like_wild
+.PP
+\&\- does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`tbl`
+(respectively database_to_write.`tbl_status`
+if the \-\-tbl\-status option is on)
+.PP
+\&\- for every found database,
+fills database_to_write.`tbl` (respectively database_to_write.`tbl_status`)
+with the output of
+\&\s-1SHOW\s0 \s-1TABLES\s0 \s-1FROM\s0 found_db \s-1LIKE\s0 table_like_wild
+(respectively \s-1SHOW\s0 \s-1TABLE\s0 \s-1STATUS\s0 \s-1FROM\s0 found_db \s-1LIKE\s0 table_like_wild)
+.PP
+\&\- if the \-\-col option is on,
+ * does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`col`
+ * for every found table,
+ fills database_to_write.`col` with the output of
+ \s-1SHOW\s0 \s-1COLUMNS\s0 \s-1FROM\s0 found_tbl \s-1FROM\s0 found_db
+.PP
+\&\- if the \-\-idx option is on,
+ * does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`idx`
+ * for every found table,
+ fills database_to_write.`idx` with the output of
+ \s-1SHOW\s0 \s-1INDEX\s0 \s-1FROM\s0 found_tbl \s-1FROM\s0 found_db
+.PP
+Some options may modify this general scheme (see below).
+.PP
+As mentioned, the contents of the info tables are the output of
+\&\s-1SHOW\s0 commands. In fact the contents are slightly more complete :
+.PP
+\&\- the `tbl` (or `tbl_status`) info table
+ has an extra column which contains the database name,
+.PP
+\&\- the `col` info table
+ has an extra column which contains the table name,
+ and an extra column which contains, for each described column,
+ the number of this column in the table owning it (this extra column
+ is called `Seq_in_table`). `Seq_in_table` makes it possible for you
+ to retrieve your columns in sorted order, when you are querying
+ the `col` table.
+.PP
+\&\- the `idx` info table
+ has an extra column which contains the database name.
+.PP
+Caution: info tables contain certain columns (e.g.
+Database, Table, Null...) whose names, as they are MySQL reserved words,
+need to be backquoted (`...`) when used in \s-1SQL\s0 statements.
+.PP
+Caution: as information fetching and info tables filling happen at the
+same time, info tables may contain inaccurate information about
+themselves.
+.SH "OPTIONS"
+.IX Header "OPTIONS"
+.IP "\-\-clear" 4
+.IX Item "--clear"
+Does \s-1DROP\s0 \s-1TABLE\s0 on the info tables (only those that the program is
+going to fill, for example if you do not use \-\-col it won't drop
+the `col` table) and proceeds normally. Does not drop database_to_write.
+.IP "\-\-clear\-only" 4
+.IX Item "--clear-only"
+Same as \-\-clear but exits after the DROPs.
+.IP "\-\-col" 4
+.IX Item "--col"
+Adds columns information (into table `col`).
+.IP "\-\-idx" 4
+.IX Item "--idx"
+Adds index information (into table `idx`).
+.IP "\-\-prefix prefix" 4
+.IX Item "--prefix prefix"
+The info tables are named from the concatenation of prefix and,
+respectively, db, tbl (or tbl_status), col, idx. Do not quote ('')
+or backquote (``) prefix.
+.IP "\-q, \-\-quiet" 4
+.IX Item "-q, --quiet"
+Does not warn you about what the script is going to do (\s-1DROP\s0 \s-1TABLE\s0 etc)
+and does not ask for a confirmation before starting.
+.IP "\-\-tbl\-status" 4
+.IX Item "--tbl-status"
+Instead of using \s-1SHOW\s0 \s-1TABLES\s0, uses \s-1SHOW\s0 \s-1TABLE\s0 \s-1STATUS\s0
+(much more complete information, but slower).
+.IP "\-\-help" 4
+.IX Item "--help"
+Display the help screen and exit
+.IP "\-u, \-\-user=#" 4
+.IX Item "-u, --user=#"
+user for database login if not current user. Give a user
+who has sufficient privileges (\s-1CREATE\s0, ...).
+.IP "\-p, \-\-password=# (INSECURE)" 4
+.IX Item "-p, --password=# (INSECURE)"
+password to use when connecting to server.
+WARNING: Providing a password on command line is insecure as it is visible through /proc to anyone for a short time.
+.IP "\-h, \-\-host=#" 4
+.IX Item "-h, --host=#"
+host to connect to
+.IP "\-P, \-\-port=#" 4
+.IX Item "-P, --port=#"
+port to use when connecting to server
+.IP "\-S, \-\-socket=#" 4
+.IX Item "-S, --socket=#"
+\&\s-1UNIX\s0 domain socket to use when connecting to server
+.SH "WARRANTY"
+.IX Header "WARRANTY"
+This software is free and comes without warranty of any kind. You
+should never trust backup software without studying the code yourself.
+Study the code inside this script and only rely on it if \fIyou\fR believe
+that it does the right thing for you.
+.Sp
+Patches adding bug fixes, documentation and new features are welcome.
+.SH "TO DO"
+.IX Header "TO DO"
+Use extended inserts to be faster (for servers with many databases
+or tables). But to do that, it must take net\-buffer\-length into account.
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+2002\-06\-18 Guilhem Bichot (guilhem.bichot@mines\-paris.org)
+.Sp
+And all the authors of mysqlhotcopy, which served as a model for
+the structure of the program.
diff --git a/storage/xtradb/build/debian/additions/mysql_waitpid.1 b/storage/xtradb/build/debian/additions/mysql_waitpid.1
new file mode 100644
index 00000000000..f6877865ba8
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_waitpid.1
@@ -0,0 +1,20 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_waitpid \- Waits a specified number of seconds for a PID to terminate.
+.SH SYNOPSIS
+mysql_waitpid [options] <pid> <seconds>
+.SH DESCRIPTION
+Waits for a program whose process ID is #pid to
+terminate within #time seconds. If the program terminates within
+this time, or if the #pid no longer exists, value 0 is returned.
+Otherwise 1 is returned. Both #pid and #time must be positive
+integer arguments.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlbinlog.1 b/storage/xtradb/build/debian/additions/mysqlbinlog.1
new file mode 100644
index 00000000000..fcdf2a083f4
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlbinlog.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlbinlog \- Dumps MySQL binary logs.
+.SH SYNOPSIS
+mysqlbinlog [options]
+.SH DESCRIPTION
+Dumps a MySQL binary log in a format usable for viewing or for piping to
+the mysql command line client.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlbug.1 b/storage/xtradb/build/debian/additions/mysqlbug.1
new file mode 100644
index 00000000000..133330dd897
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlbug.1
@@ -0,0 +1,14 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlbug \- MySQL bug reporting tool.
+.SH SYNOPSIS
+mysqlbug [options]
+.SH DESCRIPTION
+Interactive bug reporting tool. Use reportbug on Debian systems.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlcheck.1 b/storage/xtradb/build/debian/additions/mysqlcheck.1
new file mode 100644
index 00000000000..b36ba2d1eb1
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlcheck.1
@@ -0,0 +1,28 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlcheck \- MySQL program for repairing, checking and optimizing tables.
+.SH SYNOPSIS
+mysqlcheck | mysqlanalyze | mysqloptimize [options]
+.SH DESCRIPTION
+This program can be used to CHECK (-c,-m,-C), REPAIR (-r), ANALYZE (-a)
+or OPTIMIZE (-o) tables. Some of the options (like -e or -q) can be
+used at the same time. It works on MyISAM and in some cases on BDB tables.
+Please consult the MySQL manual for the latest information about the
+above. The options -c, -r, -a and -o are exclusive to each other, which
+means that the last one will be used if several were specified.
+
+The option -c will be used by default if none was specified. You
+can change the default behavior by making a symbolic link or
+copying this file somewhere with another name; the alternatives are:
+mysqlrepair: The default option will be -r
+mysqlanalyze: The default option will be -a
+mysqloptimize: The default option will be -o
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (8)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf b/storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf
new file mode 100644
index 00000000000..3b0445d6bd8
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf
@@ -0,0 +1,2 @@
+[mysqld_safe]
+syslog
diff --git a/storage/xtradb/build/debian/additions/mysqldumpslow.1 b/storage/xtradb/build/debian/additions/mysqldumpslow.1
new file mode 100644
index 00000000000..0431ef04cbb
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqldumpslow.1
@@ -0,0 +1,50 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqldumpslow \- Parse and summarize the MySQL slow query log.
+.SH SYNOPSIS
+mysqldumpslow [options]
+.SH DESCRIPTION
+This program parses and summarizes a 'slow query log'.
+
+.TP
+\fB\-v\fR
+verbose
+.TP
+\fB\-d\fR
+debug
+.TP
+\fB\-s=WORD\fR
+what to sort by (t, at, l, al, r, ar etc)
+.TP
+\fB\-r\fR
+reverse the sort order (largest last instead of first)
+.TP
+\fB\-t=NUMBER\fR
+just show the top n queries
+.TP
+\fB\-a\fR
+don't abstract all numbers to N and strings to 'S'
+.TP
+\fB\-n=NUMBER\fR
+abstract numbers with at least n digits within names
+.TP
+\fB\-g=WORD\fR
+grep: only consider stmts that include this string
+.TP
+\fB\-h=WORD\fR
+hostname of db server for *-slow.log filename (can be wildcard)
+.TP
+\fB\-i=WORD\fR
+name of server instance (if using mysql.server startup script)
+.TP
+\fB\-l\fR
+don't subtract lock time from total time
+
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org> based on
+the comments in the code.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlimport.1 b/storage/xtradb/build/debian/additions/mysqlimport.1
new file mode 100644
index 00000000000..9007307a328
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlimport.1
@@ -0,0 +1,20 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlimport \- Imports text files with MySQL database queries.
+.SH SYNOPSIS
+mysqlimport [options]
+.SH DESCRIPTION
+Loads tables from text files in various formats. The base name of the
+text file must be the name of the table that should be used.
+If one uses sockets to connect to the MySQL server, the server will open and
+read the text file directly. In other cases the client will open the text
+file. The SQL command 'LOAD DATA INFILE' is used to import the rows.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlmanager.1 b/storage/xtradb/build/debian/additions/mysqlmanager.1
new file mode 100644
index 00000000000..ebb69adbd09
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlmanager.1
@@ -0,0 +1,49 @@
+.TH mysql 1 "March 2005" "MySQL 4.1" "MySQL database"
+.SH NAME
+mysqlmanager \- Manages instances of MySQL server.
+.SH SYNOPSIS
+.B mysqlmanager
+[\fIOPTIONS\fR]
+.SH DESCRIPTION
+Manages instances of MySQL server.
+.TP
+\-?, \fB\-\-help\fR
+Display this help and exit.
+.TP
+\fB\-P\fR, \fB\-\-port=\fR#
+Port number to listen on.
+.TP
+\fB\-l\fR, \fB\-\-log\fR=\fIname\fR
+Path to log file.
+.TP
+\fB\-b\fR, \fB\-\-bind\-address=\fR#
+Address to listen on.
+.TP
+\fB\-B\fR, \fB\-\-tcp\-backlog=\fR#
+Size of TCP/IP listen queue.
+.TP
+\fB\-g\fR, \fB\-\-greeting\fR=\fIname\fR
+Set greeting on connect.
+.TP
+\fB\-m\fR, \fB\-\-max\-command\-len=\fR#
+Maximum command length.
+.TP
+\fB\-d\fR, \fB\-\-one\-thread\fR
+Use one thread (for debugging).
+.TP
+\fB\-C\fR, \fB\-\-connect\-retries=\fR#
+Number of attempts to establish MySQL connection.
+.TP
+\fB\-p\fR, \fB\-\-password\-file\fR=\fIname\fR
+Password file for manager.
+.TP
+\fB\-f\fR, \fB\-\-pid\-file\fR=\fIname\fR
+Pid file to use.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Output version information and exit.
+.SH "SEE ALSO"
+The full documentation for
+.B mysqlmanager
+is available in the package mysql-doc-4.1 or on the MySQL
+homepage www.mysql.com.
+.SH AUTHOR
+This manpage was created by Christian Hammers <ch@debian.org>
+using help2man.
diff --git a/storage/xtradb/build/debian/additions/mysqlreport b/storage/xtradb/build/debian/additions/mysqlreport
new file mode 100644
index 00000000000..402a5be835d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlreport
@@ -0,0 +1,1298 @@
+#!/usr/bin/perl -w
+
+# mysqlreport v3.5 Apr 16 2008
+# http://hackmysql.com/mysqlreport
+
+# mysqlreport makes an easy-to-read report of important MySQL status values.
+# Copyright 2006-2008 Daniel Nichter
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# The GNU General Public License is available at:
+# http://www.gnu.org/copyleft/gpl.html
+
+use strict;
+use File::Temp qw(tempfile);
+use DBI;
+use Getopt::Long;
+eval { require Term::ReadKey; };
+my $RK = ($@ ? 0 : 1);
+
+sub have_op;
+
+my $WIN = ($^O eq 'MSWin32' ? 1 : 0);
+my %op;
+my %mycnf; # ~/.my.cnf
+my ($tmpfile_fh, $tmpfile);
+my ($stat_name, $stat_val, $stat_label);
+my $MySQL_version;
+my (%stats, %vars); # SHOW STATUS, SHOW VARIABLES
+my (%DMS_vals, %Com_vals, %ib_vals);
+my ($dbh, $query);
+my ($questions, $key_read_ratio, $key_write_ratio, $dms, $slow_query_t);
+my ($key_cache_block_size, $key_buffer_used, $key_buffer_usage);
+my ($qc_mem_used, $qc_hi_r, $qc_ip_r); # Query Cache
+my $have_innodb_vals;
+my ($ib_bp_used, $ib_bp_total, $ib_bp_read_ratio);
+my ($relative_live, $relative_infiles);
+my $real_uptime;
+my (%stats_present, %stats_past); # For relative reports
+
+GetOptions (
+ \%op,
+ "user=s",
+ "password:s",
+ "host=s",
+ "port=s",
+ "socket=s",
+ "no-mycnf",
+ "infile|in=s",
+ "outfile=s",
+ "flush-status",
+ "email=s",
+ "r|relative:i",
+ "c|report-count=i",
+ "detach",
+ "help|?",
+ "debug"
+);
+
+show_help_and_exit() if $op{'help'};
+
+get_user_mycnf() unless $op{'no-mycnf'};
+
+# Command line options override ~/.my.cnf
+$mycnf{'host'} = $op{'host'} if have_op 'host';
+$mycnf{'port'} = $op{'port'} if have_op 'port';
+$mycnf{'socket'} = $op{'socket'} if have_op 'socket';
+$mycnf{'user'} = $op{'user'} if have_op 'user';
+
+$mycnf{'user'} ||= $ENV{'USER'};
+
+if(exists $op{'password'})
+{
+ if($op{'password'} eq '') # Prompt for password
+ {
+ Term::ReadKey::ReadMode(2) if $RK;
+ print "Password for database user $mycnf{'user'}: ";
+ chomp($mycnf{'pass'} = <STDIN>);
+ Term::ReadKey::ReadMode(0), print "\n" if $RK;
+ }
+ else { $mycnf{'pass'} = $op{'password'}; } # Use password given on command line
+}
+
+$op{'com'} ||= 3;
+$op{'c'} ||= 1; # Used in collect_reports() if --r given integer value
+
+$relative_live = 0;
+$relative_infiles = 0;
+
+if(defined $op{'r'})
+{
+ if($op{r}) { $relative_live = 1; } # if -r was given an integer value
+ else { $relative_infiles = 1; }
+}
+
+# The report is written to a tmp file first.
+# Later it will be moved to $op{'outfile'} or emailed $op{'email'} if needed.
+($tmpfile_fh, $tmpfile) = tempfile() or die "Cannot open temporary file for writing: $!\n";
+
+if($op{'detach'})
+{
+ $SIG{'TERM'} = 'sig_handler';
+
+ if(fork())
+ {
+ print "mysqlreport has forked and detached.\n";
+ print "While running detached, mysqlreport writes reports to '$tmpfile'.\n";
+
+ exit;
+ }
+
+ open(STDIN, "</dev/null");
+ open(STDOUT, "> $tmpfile") or die "Cannot dup STDOUT: $!\n";
+ open(STDERR, "> $tmpfile") or die "Cannot dup STDERR: $!\n";
+}
+
+select $tmpfile_fh;
+$| = 1 if ($op{'detach'} || $relative_live);
+
+print "tmp file: $tmpfile\n" if $op{debug};
+
+# Connect to MySQL
+if(!$op{'infile'} && !$relative_infiles)
+{
+ connect_to_MySQL();
+}
+
+$have_innodb_vals = 1; # This might be set to 0 later in get_MySQL_version()
+
+if(defined $op{'r'})
+{
+ if($relative_live)
+ {
+ print STDERR "mysqlreport is writing relative reports to '$tmpfile'.\n" unless $op{'detach'};
+ get_MySQL_version();
+ collect_reports();
+ }
+
+ if($relative_infiles) { read_relative_infiles(); }
+}
+else
+{
+ if(!$op{'infile'})
+ {
+ get_MySQL_version();
+ get_vals();
+ get_vars();
+ }
+ else
+ {
+ read_infile($op{'infile'});
+ }
+
+ get_Com_values();
+
+ set_myisam_vals();
+ set_ib_vals() if $have_innodb_vals;
+
+ write_report();
+}
+
+exit_tasks_and_cleanup();
+
+exit;
+
+#
+# Subroutines
+#
+sub show_help_and_exit
+{
+ print <<"HELP";
+mysqlreport v3.5 Apr 16 2008
+mysqlreport makes an easy-to-read report of important MySQL status values.
+
+Command line options (abbreviations work):
+ --user USER Connect to MySQL as USER
+ --password PASS Use PASS or prompt for MySQL user's password
+ --host ADDRESS Connect to MySQL at ADDRESS
+ --port PORT Connect to MySQL at PORT
+ --socket SOCKET Connect to MySQL at SOCKET
+ --no-mycnf Don't read ~/.my.cnf
+ --infile FILE Read status values from FILE instead of MySQL
+ --outfile FILE Write report to FILE
+ --email ADDRESS Email report to ADDRESS (doesn't work on Windows)
+ --flush-status Issue FLUSH STATUS; after getting current values
+ --relative X Generate relative reports. If X is an integer,
+ reports are live from the MySQL server X seconds apart.
+ If X is a list of infiles (file1 file2 etc.),
+ reports are generated from the infiles in the order
+ that they are given.
+ --report-count N Collect N number of live relative reports (default 1)
+ --detach Fork and detach from terminal (run in background)
+ --help Prints this
+ --debug Print debugging information
+
+Visit http://hackmysql.com/mysqlreport for more information.
+HELP
+
+ exit;
+}
+
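+# Note on get_user_mycnf() below: it is a deliberately loose parser that
+# keeps every "name = value" line of ~/.my.cnf, regardless of which
+# [section] it appears in.  Illustrative lines it would pick up
+# (the values are made up):
+#   user     = monitor
+#   password = "secret"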
+sub get_user_mycnf
+{
+ print "get_user_mycnf\n" if $op{debug};
+
+ return if $WIN;
+ open MYCNF, "$ENV{HOME}/.my.cnf" or return;
+ while(<MYCNF>)
+ {
+ if(/^(.+?)\s*=\s*"?(.+?)"?\s*$/)
+ {
+ $mycnf{$1} = $2;
+ print "get_user_mycnf: read '$1 = $2'\n" if $op{debug};
+ }
+ }
+ $mycnf{'pass'} ||= $mycnf{'password'} if exists $mycnf{'password'};
+ close MYCNF;
+}
+
+sub connect_to_MySQL
+{
+ print "connect_to_MySQL\n" if $op{debug};
+
+ my $dsn;
+
+ if($mycnf{'socket'} && -S $mycnf{'socket'})
+ {
+ $dsn = "DBI:mysql:mysql_socket=$mycnf{socket}";
+ }
+ elsif($mycnf{'host'})
+ {
+ $dsn = "DBI:mysql:host=$mycnf{host}" . ($mycnf{port} ? ";port=$mycnf{port}" : "");
+ }
+ else
+ {
+ $dsn = "DBI:mysql:host=localhost";
+ }
+
+ print "connect_to_MySQL: DBI DSN: $dsn\n" if $op{debug};
+
+ $dbh = DBI->connect($dsn, $mycnf{'user'}, $mycnf{'pass'}) or die;
+}
+
+sub collect_reports
+{
+ print "collect_reports\n" if $op{debug};
+
+ my $i;
+
+ get_vals();
+ get_vars();
+
+ get_Com_values();
+
+ %stats_past = %stats;
+
+ set_myisam_vals();
+ set_ib_vals() if $have_innodb_vals;
+
+ print "#\n# Beginning report, 0 0:0:0\n#\n";
+
+ write_report();
+
+ for($i = 0; $i < $op{'c'}; $i++)
+ {
+ $dbh->disconnect();
+
+ sleep($op{'r'});
+
+ connect_to_MySQL();
+
+ print "\n#\n# Interval report " , $i + 1 , ", +", sec_to_dhms(($i + 1) * $op{'r'}), "\n#\n";
+
+ get_vals();
+
+ write_relative_report();
+ }
+}
+
+sub read_relative_infiles
+{
+ print "read_relative_infiles\n" if $op{debug};
+
+ my $slurp; # Used to check infiles for multiple sets of status values
+ my $n_stats; # Number of multiple sets of status values in an infile
+ my $infile;
+ my $report_n; # Report number
+
+ $report_n = 1;
+
+ foreach $infile (@ARGV)
+ {
+ # Read all of infile into $slurp
+ open INFILE, "< $infile" or warn and next;
+ $slurp = do { local $/; <INFILE> };
+ close INFILE;
+
+ $n_stats = 0;
+
+ # Count number of status value sets
+ $n_stats++ while $slurp =~ /Aborted_clients/g;
+
+ print "read_relative_infiles: found $n_stats sets of status values in file '$infile'\n"
+ if $op{debug};
+
+ if($n_stats == 1)
+ {
+ read_infile($infile);
+ relative_infile_report($report_n++);
+ }
+
+ if($n_stats > 1)
+ {
+ my @tmpfile_fh;
+ my @tmpfile_name;
+ my $i;
+ my $stat_n; # Status value set number
+
+ # Create a tmp file for each set of status values
+ for($i = 0; $i < $n_stats; $i++)
+ {
+ my ($fh, $name) = tempfile()
+ or die "read_relative_infiles: cannot open temporary file for writing: $!\n";
+
+ push(@tmpfile_fh, $fh);
+ push(@tmpfile_name, $name);
+
+ print "read_relative_infiles: created tmp file '$name' for set $i\n" if $op{debug};
+ }
+
+ $i = 0;
+ $stat_n = 0;
+
+ select $tmpfile_fh[$i];
+
+ # Read infile again and copy each set of status values to separate tmp files
+ open INFILE, "< $infile" or warn and next;
+ while(<INFILE>)
+ {
+ next if /^\+/;
+ next if /^$/;
+
+ # The infile must begin with the system variable values.
+ # Therefore, the first occurrence of Aborted_clients indicates the beginning
+ # of the first set of status values if no sets have occurred yet ($stat_n == 0).
+ # In this case, the following status values are printed to the current fh,
+ # along with the system variable values read thus far, until Aborted_clients
+ # occurs again. Then begins the second and subsequent sets of status values.
+
+ if(/Aborted_clients/)
+ {
+ print and next if $stat_n++ == 0;
+ select $tmpfile_fh[++$i];
+ }
+
+ print;
+ }
+ close INFILE;
+
+ # Re-select the main tmp file into which the reports are being written.
+ select $tmpfile_fh;
+
+ for($i = 0; $i < $n_stats; $i++)
+ {
+ close $tmpfile_fh[$i];
+
+ print "read_relative_infiles: reading set $i tmp file '$tmpfile_name[$i]'\n"
+ if $op{debug};
+
+ read_infile($tmpfile_name[$i]);
+ relative_infile_report($report_n++);
+
+ if($WIN) { `del $tmpfile_name[$i]`; }
+ else { `rm -f $tmpfile_name[$i]`; }
+
+ print "read_relative_infiles: deleted set $i tmp file '$tmpfile_name[$i]'\n"
+ if $op{debug};
+ }
+
+ } # if($n_stats > 1)
+ } # foreach $infile (@ARGV)
+}
+
+sub relative_infile_report
+{
+ print "relative_infile_report\n" if $op{debug};
+
+ my $report_n = shift;
+
+ if($report_n == 1)
+ {
+ get_Com_values();
+
+ %stats_past = %stats;
+
+ set_myisam_vals();
+ set_ib_vals() if $have_innodb_vals;
+
+ print "#\n# Beginning report, 0 0:0:0\n#\n";
+
+ write_report();
+ }
+ else
+ {
+ print "\n#\n# Interval report ", $report_n - 1, ", +",
+ sec_to_dhms($stats{Uptime} - $stats_past{Uptime}),
+ "\n#\n";
+
+ write_relative_report();
+ }
+}
+
+sub get_vals
+{
+ print "get_vals\n" if $op{debug};
+
+ my @row;
+
+ # Get status values
+ if($MySQL_version >= 50002)
+ {
+ $query = $dbh->prepare("SHOW GLOBAL STATUS;");
+ }
+ else
+ {
+ $query = $dbh->prepare("SHOW STATUS;");
+ }
+ $query->execute();
+ while(@row = $query->fetchrow_array()) { $stats{$row[0]} = $row[1]; }
+
+ $real_uptime = $stats{'Uptime'};
+}
+
+sub get_vars
+{
+ print "get_vars\n" if $op{debug};
+
+ my @row;
+
+ # Get server system variables
+ $query = $dbh->prepare("SHOW VARIABLES;");
+ $query->execute();
+ while(@row = $query->fetchrow_array()) { $vars{$row[0]} = $row[1]; }
+
+ # table_cache was renamed to table_open_cache in MySQL 5.1.3
+ if($MySQL_version >= 50103)
+ {
+ $vars{'table_cache'} = $vars{'table_open_cache'};
+ }
+}
+
+sub read_infile
+{
+ print "read_infile\n" if $op{debug};
+
+ my $infile = shift;
+
+ # Default required system variable values if not set in INFILE.
+ # As of mysqlreport v3.5 the direct output from SHOW VARIABLES;
+ # can be put into INFILE instead. See http://hackmysql.com/mysqlreportdoc
+ # for details.
+ $vars{'version'} = "0.0.0" if !exists $vars{'version'};
+ $vars{'table_cache'} = 64 if !exists $vars{'table_cache'};
+ $vars{'max_connections'} = 100 if !exists $vars{'max_connections'};
+ $vars{'key_buffer_size'} = 8388600 if !exists $vars{'key_buffer_size'}; # 8M
+ $vars{'thread_cache_size'} = 0 if !exists $vars{'thread_cache_size'};
+ $vars{'tmp_table_size'} = 0 if !exists $vars{'tmp_table_size'};
+ $vars{'long_query_time'} = '?' if !exists $vars{'long_query_time'};
+ $vars{'log_slow_queries'} = '?' if !exists $vars{'log_slow_queries'};
+
+ # One should also add:
+ # key_cache_block_size
+ # query_cache_size
+ # to INFILE if needed.
+
+ open INFILE, "< $infile" or die "Cannot open INFILE '$infile': $!\n";
+
+ while(<INFILE>)
+ {
+ last if !defined $_;
+
+ next if /^\+/; # skip divider lines
+ next if /^$/; # skip blank lines
+
+ next until /(Aborted_clients|back_log|=)/;
+
+ if($1 eq 'Aborted_clients') # status values
+ {
+ print "read_infile: start stats\n" if $op{debug};
+
+ while($_)
+ {
+ chomp;
+ if(/([A-Za-z_]+)[\s\t|]+(\d+)/)
+ {
+ $stats{$1} = $2;
+ print "read_infile: save $1 = $2\n" if $op{debug};
+ }
+ else { print "read_infile: ignore '$_'\n" if $op{debug}; }
+
+ last if $1 eq 'Uptime'; # exit while() if end of status values
+ $_ = <INFILE>; # otherwise, read next line of status values
+ }
+ }
+ elsif($1 eq 'back_log') # system variable values
+ {
+ print "read_infile: start vars\n" if $op{debug};
+
+ while($_)
+ {
+ chomp;
+ if(/([A-Za-z_]+)[\s\t|]+([\w\.\-]+)/) # This will exclude some vars
+ { # like pid_file which we don't need
+ $vars{$1} = $2;
+ print "read_infile: save $1 = $2\n" if $op{debug};
+ }
+ else { print "read_infile: ignore '$_'\n" if $op{debug}; }
+
+ last if $1 eq 'wait_timeout'; # exit while() if end of vars
+ $_ = <INFILE>; # otherwise, read next line of vars
+ }
+ }
+ elsif($1 eq '=') # old style, manually added system variable values
+ {
+ print "read_infile: start old vars\n" if $op{debug};
+
+ while($_ && $_ =~ /=/)
+ {
+ chomp;
+ if(/^\s*(\w+)\s*=\s*([0-9.]+)(M*)\s*$/) # e.g.: key_buffer_size = 128M
+ {
+ $vars{$1} = ($3 ? $2 * 1024 * 1024 : $2);
+ print "read_infile: read '$_' as $1 = $vars{$1}\n" if $op{debug};
+ }
+ else { print "read_infile: ignore '$_'\n" if $op{debug}; }
+
+ $_ = <INFILE>; # otherwise, read next line of old vars
+ }
+
+ redo;
+ }
+ else
+ {
+ print "read_infile: unrecognized line: '$_'\n" if $op{debug};
+ }
+ }
+
+ close INFILE;
+
+ $real_uptime = $stats{'Uptime'};
+
+ $vars{'table_cache'} = $vars{'table_open_cache'} if exists $vars{'table_open_cache'};
+
+ get_MySQL_version();
+}
+
+sub get_MySQL_version
+{
+ print "get_MySQL_version\n" if $op{debug};
+
+ return if $MySQL_version;
+
+ my ($major, $minor, $patch);
+
+ if($op{'infile'} || $relative_infiles)
+ {
+ ($major, $minor, $patch) = ($vars{'version'} =~ /(\d{1,2})\.(\d{1,2})\.(\d{1,2})/);
+ }
+ else
+ {
+ my @row;
+
+ $query = $dbh->prepare("SHOW VARIABLES LIKE 'version';");
+ $query->execute();
+ @row = $query->fetchrow_array();
+ ($major, $minor, $patch) = ($row[1] =~ /(\d{1,2})\.(\d{1,2})\.(\d{1,2})/);
+ }
+
+ $MySQL_version = sprintf("%d%02d%02d", $major, $minor, $patch);
+
+ # Innodb_ status values were added in 5.0.2
+ if($MySQL_version < 50002)
+ {
+ $have_innodb_vals = 0;
+ print "get_MySQL_version: no InnoDB reports because MySQL version is older than 5.0.2\n" if $op{debug};
+ }
+}
+
+sub set_myisam_vals
+{
+ print "set_myisam_vals\n" if $op{debug};
+
+ $questions = $stats{'Questions'};
+
+ $key_read_ratio = sprintf "%.2f",
+ ($stats{'Key_read_requests'} ?
+ 100 - ($stats{'Key_reads'} / $stats{'Key_read_requests'}) * 100 :
+ 0);
+
+ $key_write_ratio = sprintf "%.2f",
+ ($stats{'Key_write_requests'} ?
+ 100 - ($stats{'Key_writes'} / $stats{'Key_write_requests'}) * 100 :
+ 0);
+
+ $key_cache_block_size = (defined $vars{'key_cache_block_size'} ?
+ $vars{'key_cache_block_size'} :
+ 1024);
+
+ $key_buffer_used = $stats{'Key_blocks_used'} * $key_cache_block_size;
+
+ if(defined $stats{'Key_blocks_unused'}) # MySQL 4.1.2+
+ {
+ $key_buffer_usage = $vars{'key_buffer_size'} -
+ ($stats{'Key_blocks_unused'} * $key_cache_block_size);
+ }
+ else { $key_buffer_usage = -1; }
+
+ # Data Manipulation Statements: http://dev.mysql.com/doc/refman/5.0/en/data-manipulation.html
+ %DMS_vals =
+ (
+ SELECT => $stats{'Com_select'},
+ INSERT => $stats{'Com_insert'} + $stats{'Com_insert_select'},
+ REPLACE => $stats{'Com_replace'} + $stats{'Com_replace_select'},
+ UPDATE => $stats{'Com_update'} +
+ (exists $stats{'Com_update_multi'} ? $stats{'Com_update_multi'} : 0),
+ DELETE => $stats{'Com_delete'} +
+ (exists $stats{'Com_delete_multi'} ? $stats{'Com_delete_multi'} : 0)
+ );
+
+ $dms = $DMS_vals{SELECT} + $DMS_vals{INSERT} + $DMS_vals{REPLACE} + $DMS_vals{UPDATE} + $DMS_vals{DELETE};
+
+ $slow_query_t = format_u_time($vars{long_query_time});
+
+}
+
+sub set_ib_vals
+{
+ print "set_ib_vals\n" if $op{debug};
+
+ $ib_bp_used = ($stats{'Innodb_buffer_pool_pages_total'} -
+ $stats{'Innodb_buffer_pool_pages_free'}) *
+ $stats{'Innodb_page_size'};
+
+ $ib_bp_total = $stats{'Innodb_buffer_pool_pages_total'} * $stats{'Innodb_page_size'};
+
+ $ib_bp_read_ratio = sprintf "%.2f",
+ ($stats{'Innodb_buffer_pool_read_requests'} ?
+ 100 - ($stats{'Innodb_buffer_pool_reads'} /
+ $stats{'Innodb_buffer_pool_read_requests'}) * 100 :
+ 0);
+}
+
+sub write_relative_report
+{
+ print "write_relative_report\n" if $op{debug};
+
+ %stats_present = %stats;
+
+ for(keys %stats)
+ {
+ if($stats_past{$_} =~ /\d+/)
+ {
+ if($stats_present{$_} >= $stats_past{$_}) # Avoid negative values
+ {
+ $stats{$_} = $stats_present{$_} - $stats_past{$_};
+ }
+ }
+ }
+
+ # These values are either "at present" or "high water marks".
+ # Therefore, it is more logical to not relativize these values.
+ # Doing otherwise causes strange and misleading values.
+ $stats{'Key_blocks_used'} = $stats_present{'Key_blocks_used'};
+ $stats{'Open_tables'} = $stats_present{'Open_tables'};
+ $stats{'Max_used_connections'} = $stats_present{'Max_used_connections'};
+ $stats{'Threads_running'} = $stats_present{'Threads_running'};
+ $stats{'Threads_connected'} = $stats_present{'Threads_connected'};
+ $stats{'Threads_cached'} = $stats_present{'Threads_cached'};
+ $stats{'Qcache_free_blocks'} = $stats_present{'Qcache_free_blocks'};
+ $stats{'Qcache_total_blocks'} = $stats_present{'Qcache_total_blocks'};
+ $stats{'Qcache_free_memory'} = $stats_present{'Qcache_free_memory'};
+ if($have_innodb_vals)
+ {
+ $stats{'Innodb_page_size'} = $stats_present{'Innodb_page_size'};
+ $stats{'Innodb_buffer_pool_pages_data'} = $stats_present{'Innodb_buffer_pool_pages_data'};
+ $stats{'Innodb_buffer_pool_pages_dirty'} = $stats_present{'Innodb_buffer_pool_pages_dirty'};
+ $stats{'Innodb_buffer_pool_pages_free'} = $stats_present{'Innodb_buffer_pool_pages_free'};
+ $stats{'Innodb_buffer_pool_pages_latched'} = $stats_present{'Innodb_buffer_pool_pages_latched'};
+ $stats{'Innodb_buffer_pool_pages_misc'} = $stats_present{'Innodb_buffer_pool_pages_misc'};
+ $stats{'Innodb_buffer_pool_pages_total'} = $stats_present{'Innodb_buffer_pool_pages_total'};
+ $stats{'Innodb_data_pending_fsyncs'} = $stats_present{'Innodb_data_pending_fsyncs'};
+ $stats{'Innodb_data_pending_reads'} = $stats_present{'Innodb_data_pending_reads'};
+ $stats{'Innodb_data_pending_writes'} = $stats_present{'Innodb_data_pending_writes'};
+
+ # Innodb_row_lock_ values were added in MySQL 5.0.3
+ if($MySQL_version >= 50003)
+ {
+ $stats{'Innodb_row_lock_current_waits'} = $stats_present{'Innodb_row_lock_current_waits'};
+ $stats{'Innodb_row_lock_time_avg'} = $stats_present{'Innodb_row_lock_time_avg'};
+ $stats{'Innodb_row_lock_time_max'} = $stats_present{'Innodb_row_lock_time_max'};
+ }
+ }
+
+ get_Com_values();
+
+ %stats_past = %stats_present;
+
+ set_myisam_vals();
+ set_ib_vals() if $have_innodb_vals;
+
+ write_report();
+}
+
+sub write_report
+{
+ print "write_report\n" if $op{debug};
+
+ $~ = 'MYSQL_TIME', write;
+ $~ = 'KEY_BUFF_MAX', write;
+ if($key_buffer_usage != -1) { $~ = 'KEY_BUFF_USAGE', write }
+ $~ = 'KEY_RATIOS', write;
+ write_DTQ();
+ $~ = 'SLOW_DMS', write;
+ write_DMS();
+ write_Com();
+ $~ = 'SAS', write;
+ write_qcache();
+ $~ = 'REPORT_END', write;
+ $~ = 'TAB', write;
+
+ write_InnoDB() if $have_innodb_vals;
+}
+
+sub sec_to_dhms # Seconds to days hours:minutes:seconds
+{
+ my $s = shift;
+ my ($d, $h, $m) = (0, 0, 0);
+
+ return '0 0:0:0' if $s <= 0;
+
+ if($s >= 86400)
+ {
+ $d = int $s / 86400;
+ $s -= $d * 86400;
+ }
+
+ if($s >= 3600)
+ {
+ $h = int $s / 3600;
+ $s -= $h * 3600;
+ }
+
+ $m = int $s / 60;
+ $s -= $m * 60;
+
+ return "$d $h:$m:$s";
+}
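+# For example, sec_to_dhms(90061) returns "1 1:1:1"
+# (1 day, 1 hour, 1 minute, 1 second).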
+
+sub make_short
+{
+ my ($number, $kb, $d) = @_;
+ my $n = 0;
+ my $short;
+
+ $d ||= 2;
+
+ if($kb) { while ($number > 1023) { $number /= 1024; $n++; }; }
+ else { while ($number > 999) { $number /= 1000; $n++; }; }
+
+ $short = sprintf "%.${d}f%s", $number, ('','k','M','G','T')[$n];
+ if($short =~ /^(.+)\.(00)$/) { return $1; } # 12.00 -> 12 but not 12.00k -> 12k
+
+ return $short;
+}
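+# Illustrative return values: make_short(500) gives "500",
+# make_short(1234567) gives "1.23M", and make_short(8388600, 1)
+# (kilobyte scaling) gives "8.00M".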
+
+# What began as a simple but great idea has become the new standard:
+# long_query_time in microseconds. For MySQL 5.1.21+ and 6.0.4+ this
+# is now standard. For 4.1 and 5.0 patches, the architects of this
+# idea provide: http://www.mysqlperformanceblog.com/mysql-patches/
+# Relevant notes in MySQL manual:
+# http://dev.mysql.com/doc/refman/5.1/en/slow-query-log.html
+# http://dev.mysql.com/doc/refman/6.0/en/slow-query-log.html
+#
+# The format_u_time sub simply beautifies long_query_time.
+
+sub format_u_time # format microsecond (µ) time value
+{
+ # 0.000000 - 0.000999 = 0 - 999 µ
+ # 0.001000 - 0.999999 = 1 ms - 999.999 ms
+ # 1.000000 - n.nnnnnn = 1 s - n.nnnnn s
+
+ my $t = shift;
+ my $f; # formatted µ time
+ my $u = chr(($WIN ? 230 : 181));
+
+ $t = 0 if $t < 0;
+
+ if($t > 0 && $t <= 0.000999)
+ {
+ $f = ($t * 1000000) . " $u";
+ }
+ elsif($t >= 0.001000 && $t <= 0.999999)
+ {
+ $f = ($t * 1000) . ' ms';
+ }
+ elsif($t >= 1)
+ {
+ $f = ($t * 1) . ' s'; # * 1 to remove insignificant zeros
+ }
+ else
+ {
+ $f = 0; # $t should = 0 at this point
+ }
+
+ return $f;
+}
+
+sub perc # Percentage
+{
+ my($is, $of) = @_;
+ $is = 0 if (not defined $is);
+ return sprintf "%.2f", ($is * 100) / ($of ||= 1);
+}
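+# Example: perc(25, 200) returns "12.50".  A zero or undefined
+# denominator is treated as 1, so perc(5, 0) returns "500.00".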
+
+sub t # Time average per second
+{
+ my $val = shift;
+ return 0 if !$val;
+ return(make_short($val / $stats{'Uptime'}, 0, 1));
+}
+
+sub email_report # Email given report to $op{'email'}
+{
+ print "email_report\n" if $op{debug};
+
+ return if $WIN;
+
+ my $report = shift;
+
+ open SENDMAIL, "|/usr/sbin/sendmail -t";
+ print SENDMAIL "From: mysqlreport\n";
+ print SENDMAIL "To: $op{email}\n";
+ print SENDMAIL "Subject: MySQL status report on " . ($mycnf{'host'} || 'localhost') . "\n\n";
+ print SENDMAIL `cat $report`;
+ close SENDMAIL;
+}
+
+sub cat_report # Print given report to screen
+{
+ print "cat_report\n" if $op{debug};
+
+ my $report = shift;
+ my @report;
+
+ open REPORT, "< $report";
+ @report = <REPORT>;
+ close REPORT;
+ print @report;
+}
+
+sub get_Com_values
+{
+ print "get_Com_values\n" if $op{debug};
+
+ %Com_vals = ();
+
+ # Make copy of just the Com_ values
+ for(keys %stats)
+ {
+ if(grep /^Com_/, $_ and $stats{$_} > 0)
+ {
+ /^Com_(.*)/;
+ $Com_vals{$1} = $stats{$_};
+ }
+ }
+
+ # Remove DMS values
+ delete $Com_vals{'select'};
+ delete $Com_vals{'insert'};
+ delete $Com_vals{'insert_select'};
+ delete $Com_vals{'replace'};
+ delete $Com_vals{'replace_select'};
+ delete $Com_vals{'update'};
+ delete $Com_vals{'update_multi'} if exists $Com_vals{'update_multi'};
+ delete $Com_vals{'delete'};
+ delete $Com_vals{'delete_multi'} if exists $Com_vals{'delete_multi'};
+}
+
+sub write_DTQ # Write DTQ report in descending order by values
+{
+ print "write_DTQ\n" if $op{debug};
+
+ $~ = 'DTQ';
+
+ my %DTQ;
+ my $first = 1;
+
+ # Total Com values
+ $stat_val = 0;
+ for(values %Com_vals) { $stat_val += $_; }
+ $DTQ{'Com_'} = $stat_val;
+
+ $DTQ{'DMS'} = $dms;
+ $DTQ{'QC Hits'} = $stats{'Qcache_hits'} if $stats{'Qcache_hits'} != 0;
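+ # Rough estimate of COM_QUIT derived from Connections and Aborted_clients;
+ # see COM_QUIT and Questions at http://hackmysql.com/com_quit.  The exact
+ # adjustment below (-2 and /2) is the script's own heuristic.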
+ $DTQ{'COM_QUIT'} = int (($stats{'Connections'} - 2) - ($stats{'Aborted_clients'} / 2));
+
+ $stat_val = 0;
+ for(values %DTQ) { $stat_val += $_; }
+ if($questions != $stat_val)
+ {
+ $DTQ{($questions > $stat_val ? '+Unknown' : '-Unknown')} = abs $questions - $stat_val;
+ }
+
+ for(sort { $DTQ{$b} <=> $DTQ{$a} } keys(%DTQ))
+ {
+ if($first) { $stat_label = '%Total:'; $first = 0; }
+ else { $stat_label = ''; }
+
+ $stat_name = $_;
+ $stat_val = $DTQ{$_};
+ write;
+ }
+}
+
+sub write_DMS # Write DMS report in descending order by values
+{
+ print "write_DMS\n" if $op{debug};
+
+ $~ = 'DMS';
+
+ for(sort { $DMS_vals{$b} <=> $DMS_vals{$a} } keys(%DMS_vals))
+ {
+ $stat_name = $_;
+ $stat_val = $DMS_vals{$_};
+ write;
+ }
+}
+
+sub write_Com # Write COM report in descending order by values
+{
+ print "write_Com\n" if $op{debug};
+
+ my $i = $op{'com'};
+
+ $~ = 'COM_1';
+
+ # Total Com values and write first line of COM report
+ $stat_label = '%Total:' unless $op{'dtq'};
+ $stat_val = 0;
+ for(values %Com_vals) { $stat_val += $_; }
+ write;
+
+ $~ = 'COM_2';
+
+ # Sort remaining Com values, print only the top $op{'com'} number of values
+ for(sort { $Com_vals{$b} <=> $Com_vals{$a} } keys(%Com_vals))
+ {
+ $stat_name = $_;
+ $stat_val = $Com_vals{$_};
+ write;
+
+ last if !(--$i);
+ }
+}
+
+sub write_qcache
+{
+ print "write_qcache\n" if $op{debug};
+
+ # Query cache was added in 4.0.1, but have_query_cache was added in 4.0.2,
+ # ergo this method is slightly more reliable
+ return if not exists $vars{'query_cache_size'};
+ return if $vars{'query_cache_size'} == 0;
+
+ $qc_mem_used = $vars{'query_cache_size'} - $stats{'Qcache_free_memory'};
+ $qc_hi_r = sprintf "%.2f", $stats{'Qcache_hits'} / ($stats{'Qcache_inserts'} ||= 1);
+ $qc_ip_r = sprintf "%.2f", $stats{'Qcache_inserts'} / ($stats{'Qcache_lowmem_prunes'} ||= 1);
+
+ $~ = 'QCACHE';
+ write;
+}
+
+sub write_InnoDB
+{
+ print "write_InnoDB\n" if $op{debug};
+
+ return if not defined $stats{'Innodb_page_size'};
+
+ $stats{'Innodb_buffer_pool_pages_latched'} = 0 if not defined $stats{'Innodb_buffer_pool_pages_latched'};
+
+ $~ = 'IB';
+ write;
+
+ # Innodb_row_lock_ values were added in MySQL 5.0.3
+ if($MySQL_version >= 50003)
+ {
+ $~ = 'IB_LOCK';
+ write;
+ }
+
+ # Data, Pages, Rows
+ $~ = 'IB_DPR';
+ write;
+}
+
+sub have_op
+{
+ my $key = shift;
+ return 1 if (exists $op{$key} && $op{$key} ne '');
+ return 0;
+}
+
+sub sig_handler
+{
+ print "\nReceived signal at " , scalar localtime , "\n";
+ exit_tasks_and_cleanup();
+ exit;
+}
+
+sub exit_tasks_and_cleanup
+{
+ print "exit_tasks_and_cleanup\n" if $op{debug};
+
+ close $tmpfile_fh;
+ select STDOUT unless $op{'detach'};
+
+ email_report($tmpfile) if $op{'email'};
+
+ cat_report($tmpfile) unless $op{'detach'};
+
+ if($op{'outfile'})
+ {
+ if($WIN) { `move $tmpfile $op{outfile}`; }
+ else { `mv $tmpfile $op{outfile}`; }
+ }
+ else
+ {
+ if($WIN) { `del $tmpfile`; }
+ else { `rm -f $tmpfile`; }
+ }
+
+ if(!$op{'infile'} && !$relative_infiles)
+ {
+ if($op{'flush-status'})
+ {
+ $query = $dbh->prepare("FLUSH STATUS;");
+ $query->execute();
+ }
+
+ $query->finish();
+ $dbh->disconnect();
+ }
+}
+
+#
+# Formats
+#
+
+format MYSQL_TIME =
+MySQL @<<<<<<<<<<<<<<<< uptime @<<<<<<<<<<< @>>>>>>>>>>>>>>>>>>>>>>>>
+$vars{'version'}, sec_to_dhms($real_uptime), (($op{infile} || $relative_infiles) ? '' : scalar localtime)
+.
+
+format KEY_BUFF_MAX =
+
+__ Key _________________________________________________________________
+Buffer used @>>>>>> of @>>>>>> %Used: @>>>>>
+make_short($key_buffer_used, 1), make_short($vars{'key_buffer_size'}, 1), perc($key_buffer_used, $vars{'key_buffer_size'})
+.
+
+format KEY_BUFF_USAGE =
+ Current @>>>>>> %Usage: @>>>>>
+make_short($key_buffer_usage, 1), perc($key_buffer_usage, $vars{'key_buffer_size'})
+.
+
+format KEY_RATIOS =
+Write hit @>>>>>%
+$key_write_ratio
+Read hit @>>>>>%
+$key_read_ratio
+
+__ Questions ___________________________________________________________
+Total @>>>>>>>> @>>>>>/s
+make_short($questions), t($questions)
+.
+
+format DTQ =
+ @<<<<<<< @>>>>>>>> @>>>>>/s @>>>>>> @>>>>>
+$stat_name, make_short($stat_val), t($stat_val), $stat_label, perc($stat_val, $questions)
+.
+
+format SLOW_DMS =
+Slow @<<<<<<< @>>>>>> @>>>>>/s @>>>>> %DMS: @>>>>> Log: @>>
+$slow_query_t, make_short($stats{'Slow_queries'}), t($stats{'Slow_queries'}), perc($stats{'Slow_queries'}, $questions), perc($stats{'Slow_queries'}, $dms), $vars{'log_slow_queries'}
+DMS @>>>>>>>> @>>>>>/s @>>>>>
+make_short($dms), t($dms), perc($dms, $questions)
+.
+
+format DMS =
+ @<<<<<<< @>>>>>>>> @>>>>>/s @>>>>> @>>>>>
+$stat_name, make_short($stat_val), t($stat_val), perc($stat_val, $questions), perc($stat_val, $dms)
+.
+
+format COM_1 =
+Com_ @>>>>>>>> @>>>>>/s @>>>>>
+make_short($stat_val), t($stat_val), perc($stat_val, $questions)
+.
+
+format COM_2 =
+ @<<<<<<<<<< @>>>>>> @>>>>>/s @>>>>>
+$stat_name, make_short($stat_val), t($stat_val), perc($stat_val, $questions)
+.
+
+format SAS =
+
+__ SELECT and Sort _____________________________________________________
+Scan @>>>>>> @>>>>/s %SELECT: @>>>>>
+make_short($stats{'Select_scan'}), t($stats{'Select_scan'}), perc($stats{'Select_scan'}, $stats{'Com_select'})
+Range @>>>>>> @>>>>/s @>>>>>
+make_short($stats{'Select_range'}), t($stats{'Select_range'}), perc($stats{'Select_range'}, $stats{'Com_select'})
+Full join @>>>>>> @>>>>/s @>>>>>
+make_short($stats{'Select_full_join'}), t($stats{'Select_full_join'}), perc($stats{'Select_full_join'}, $stats{'Com_select'})
+Range check @>>>>>> @>>>>/s @>>>>>
+make_short($stats{'Select_range_check'}), t($stats{'Select_range_check'}), perc($stats{'Select_range_check'}, $stats{'Com_select'})
+Full rng join @>>>>>> @>>>>/s @>>>>>
+make_short($stats{'Select_full_range_join'}), t($stats{'Select_full_range_join'}), perc($stats{'Select_full_range_join'}, $stats{'Com_select'})
+Sort scan @>>>>>> @>>>>/s
+make_short($stats{'Sort_scan'}), t($stats{'Sort_scan'})
+Sort range @>>>>>> @>>>>/s
+make_short($stats{'Sort_range'}), t($stats{'Sort_range'})
+Sort mrg pass @>>>>>> @>>>>/s
+make_short($stats{'Sort_merge_passes'}), t($stats{'Sort_merge_passes'})
+.
+
+format QCACHE =
+
+__ Query Cache _________________________________________________________
+Memory usage @>>>>>> of @>>>>>> %Used: @>>>>>
+make_short($qc_mem_used, 1), make_short($vars{'query_cache_size'}, 1), perc($qc_mem_used, $vars{'query_cache_size'})
+Block Fragmnt @>>>>>%
+perc($stats{'Qcache_free_blocks'}, $stats{'Qcache_total_blocks'})
+Hits @>>>>>> @>>>>/s
+make_short($stats{'Qcache_hits'}), t($stats{'Qcache_hits'})
+Inserts @>>>>>> @>>>>/s
+make_short($stats{'Qcache_inserts'}), t($stats{'Qcache_inserts'})
+Insrt:Prune @>>>>>>:1 @>>>>/s
+make_short($qc_ip_r), t($stats{'Qcache_inserts'} - $stats{'Qcache_lowmem_prunes'})
+Hit:Insert @>>>>>>:1
+$qc_hi_r, t($qc_hi_r)
+.
+
+# Not really the end...
+format REPORT_END =
+
+__ Table Locks _________________________________________________________
+Waited @>>>>>>>> @>>>>>/s %Total: @>>>>>
+make_short($stats{'Table_locks_waited'}), t($stats{'Table_locks_waited'}), perc($stats{'Table_locks_waited'}, $stats{'Table_locks_waited'} + $stats{'Table_locks_immediate'});
+Immediate @>>>>>>>> @>>>>>/s
+make_short($stats{'Table_locks_immediate'}), t($stats{'Table_locks_immediate'})
+
+__ Tables ______________________________________________________________
+Open @>>>>>>>> of @>>> %Cache: @>>>>>
+$stats{'Open_tables'}, $vars{'table_cache'}, perc($stats{'Open_tables'}, $vars{'table_cache'})
+Opened @>>>>>>>> @>>>>>/s
+make_short($stats{'Opened_tables'}), t($stats{'Opened_tables'})
+
+__ Connections _________________________________________________________
+Max used @>>>>>>>> of @>>> %Max: @>>>>>
+$stats{'Max_used_connections'}, $vars{'max_connections'}, perc($stats{'Max_used_connections'}, $vars{'max_connections'})
+Total @>>>>>>>> @>>>>>/s
+make_short($stats{'Connections'}), t($stats{'Connections'})
+
+__ Created Temp ________________________________________________________
+Disk table @>>>>>>>> @>>>>>/s
+make_short($stats{'Created_tmp_disk_tables'}), t($stats{'Created_tmp_disk_tables'})
+Table @>>>>>>>> @>>>>>/s Size: @>>>>>
+make_short($stats{'Created_tmp_tables'}), t($stats{'Created_tmp_tables'}), make_short($vars{'tmp_table_size'}, 1, 1)
+File @>>>>>>>> @>>>>>/s
+make_short($stats{'Created_tmp_files'}), t($stats{'Created_tmp_files'})
+.
+
+format TAB =
+
+__ Threads _____________________________________________________________
+Running @>>>>>>>> of @>>>
+$stats{'Threads_running'}, $stats{'Threads_connected'}
+Cached @>>>>>>>> of @>>> %Hit: @>>>>>
+$stats{'Threads_cached'}, $vars{'thread_cache_size'}, make_short(100 - perc($stats{'Threads_created'}, $stats{'Connections'}))
+Created @>>>>>>>> @>>>>>/s
+make_short($stats{'Threads_created'}), t($stats{'Threads_created'})
+Slow @>>>>>>>> @>>>>>/s
+$stats{'Slow_launch_threads'}, t($stats{'Slow_launch_threads'})
+
+__ Aborted _____________________________________________________________
+Clients @>>>>>>>> @>>>>>/s
+make_short($stats{'Aborted_clients'}), t($stats{'Aborted_clients'})
+Connects @>>>>>>>> @>>>>>/s
+make_short($stats{'Aborted_connects'}), t($stats{'Aborted_connects'})
+
+__ Bytes _______________________________________________________________
+Sent @>>>>>>>> @>>>>>/s
+make_short($stats{'Bytes_sent'}), t($stats{'Bytes_sent'})
+Received @>>>>>>>> @>>>>>/s
+make_short($stats{'Bytes_received'}), t($stats{'Bytes_received'})
+.
+
+format IB =
+
+__ InnoDB Buffer Pool __________________________________________________
+Usage @>>>>>> of @>>>>>> %Used: @>>>>>
+make_short($ib_bp_used, 1), make_short($ib_bp_total, 1), perc($ib_bp_used, $ib_bp_total)
+Read hit @>>>>>%
+$ib_bp_read_ratio;
+Pages
+ Free @>>>>>>>> %Total: @>>>>>
+make_short($stats{'Innodb_buffer_pool_pages_free'}), perc($stats{'Innodb_buffer_pool_pages_free'}, $stats{'Innodb_buffer_pool_pages_total'})
+ Data @>>>>>>>> @>>>>> %Drty: @>>>>>
+make_short($stats{'Innodb_buffer_pool_pages_data'}), perc($stats{'Innodb_buffer_pool_pages_data'}, $stats{'Innodb_buffer_pool_pages_total'}), perc($stats{'Innodb_buffer_pool_pages_dirty'}, $stats{'Innodb_buffer_pool_pages_data'})
+ Misc @>>>>>>>> @>>>>>
+ $stats{'Innodb_buffer_pool_pages_misc'}, perc($stats{'Innodb_buffer_pool_pages_misc'}, $stats{'Innodb_buffer_pool_pages_total'})
+ Latched @>>>>>>>> @>>>>>
+$stats{'Innodb_buffer_pool_pages_latched'}, perc($stats{'Innodb_buffer_pool_pages_latched'}, $stats{'Innodb_buffer_pool_pages_total'})
+Reads @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_buffer_pool_read_requests'}), t($stats{'Innodb_buffer_pool_read_requests'})
+ From file @>>>>>>>> @>>>>>/s @>>>>>
+make_short($stats{'Innodb_buffer_pool_reads'}), t($stats{'Innodb_buffer_pool_reads'}), perc($stats{'Innodb_buffer_pool_reads'}, $stats{'Innodb_buffer_pool_read_requests'})
+ Ahead Rnd @>>>>>>>> @>>>>>/s
+$stats{'Innodb_buffer_pool_read_ahead_rnd'}, t($stats{'Innodb_buffer_pool_read_ahead_rnd'})
+ Ahead Sql @>>>>>>>> @>>>>>/s
+$stats{'Innodb_buffer_pool_read_ahead_seq'}, t($stats{'Innodb_buffer_pool_read_ahead_seq'})
+Writes @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_buffer_pool_write_requests'}), t($stats{'Innodb_buffer_pool_write_requests'})
+Flushes @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_buffer_pool_pages_flushed'}), t($stats{'Innodb_buffer_pool_pages_flushed'})
+Wait Free @>>>>>>>> @>>>>>/s
+$stats{'Innodb_buffer_pool_wait_free'}, t($stats{'Innodb_buffer_pool_wait_free'})
+.
+
+format IB_LOCK =
+
+__ InnoDB Lock _________________________________________________________
+Waits @>>>>>>>> @>>>>>/s
+$stats{'Innodb_row_lock_waits'}, t($stats{'Innodb_row_lock_waits'})
+Current @>>>>>>>>
+$stats{'Innodb_row_lock_current_waits'}
+Time acquiring
+ Total @>>>>>>>> ms
+$stats{'Innodb_row_lock_time'}
+ Average @>>>>>>>> ms
+$stats{'Innodb_row_lock_time_avg'}
+ Max @>>>>>>>> ms
+$stats{'Innodb_row_lock_time_max'}
+.
+
+format IB_DPR =
+
+__ InnoDB Data, Pages, Rows ____________________________________________
+Data
+ Reads @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_data_reads'}), t($stats{'Innodb_data_reads'})
+ Writes @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_data_writes'}), t($stats{'Innodb_data_writes'})
+ fsync @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_data_fsyncs'}), t($stats{'Innodb_data_fsyncs'})
+ Pending
+ Reads @>>>>>>>>
+$stats{'Innodb_data_pending_reads'}, t($stats{'Innodb_data_pending_reads'})
+ Writes @>>>>>>>>
+$stats{'Innodb_data_pending_writes'}, t($stats{'Innodb_data_pending_writes'})
+ fsync @>>>>>>>>
+$stats{'Innodb_data_pending_fsyncs'}, t($stats{'Innodb_data_pending_fsyncs'})
+
+Pages
+ Created @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_pages_created'}), t($stats{'Innodb_pages_created'})
+ Read @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_pages_read'}), t($stats{'Innodb_pages_read'})
+ Written @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_pages_written'}), t($stats{'Innodb_pages_written'})
+
+Rows
+ Deleted @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_rows_deleted'}), t($stats{'Innodb_rows_deleted'})
+ Inserted @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_rows_inserted'}), t($stats{'Innodb_rows_inserted'})
+ Read @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_rows_read'}), t($stats{'Innodb_rows_read'})
+ Updated @>>>>>>>> @>>>>>/s
+make_short($stats{'Innodb_rows_updated'}), t($stats{'Innodb_rows_updated'})
+.
diff --git a/storage/xtradb/build/debian/additions/mysqlreport.1 b/storage/xtradb/build/debian/additions/mysqlreport.1
new file mode 100644
index 00000000000..5ae6b9e3b92
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlreport.1
@@ -0,0 +1,180 @@
+.TH "mysqlreport" "1" "2.5 2006-09-01 (docrev 2006-05-19)" "Daniel Nichter" "MYSQL"
+.SH "NAME"
+.LP
+mysqlreport \- Makes a friendly report of important MySQL status values
+.SH "SYNTAX"
+.LP
+mysqlreport [\fIoptions\fP]
+.SH "DESCRIPTION"
+.LP
+mysqlreport makes a friendly report of important MySQL status values. Actually,
+it makes a friendly report of nearly every status value from SHOW STATUS.
+Unlike SHOW STATUS which simply dumps over 100 values to screen in one long
+list, mysqlreport interprets and formats the values and presents the basic
+values and many more inferred values in a human\-readable format. Numerous
+example reports are available at the mysqlreport web page at
+http://hackmysql.com/mysqlreport.
+
+The benefit of mysqlreport is that it allows you to very quickly see a wide
+array of performance indicators for your MySQL server which would otherwise
+need to be calculated by hand from all the various SHOW STATUS values. For
+example, the Index Read Ratio is an important value but it's not present in
+SHOW STATUS; it's an inferred value (the ratio of Key_reads to
+Key_read_requests).
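+For instance, with made\-up numbers, Key_reads = 1,000 and
+Key_read_requests = 100,000 give a ratio of 1%, which mysqlreport
+presents as a key read hit of 99.00%.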
+
+This documentation outlines all the command line options in mysqlreport, most
+of which control which reports are printed. This document does not address
+how to interpret these reports; that topic is covered in the document Guide
+To Understanding mysqlreport at http://hackmysql.com/mysqlreportguide.
+
+.SH "OPTIONS"
+Technically, command line options are in the form \-\-option, but \-option works
+too. All options can be abbreviated if the abbreviation is unique. For example,
+option \-\-host can be abbreviated \-\-ho but not \-\-h because \-\-h is ambiguous: it
+could mean \-\-host or \-\-help.
+
+.LP
+
+.TP
+\fB\-\-help\fR
+Output help information and exit.
+
+.TP
+\fB\-\-user USER\fR
+
+.TP
+\fB\-\-password\fR
+As of version 2.3 \-\-password can take the password on the
+command line like "\-\-password FOO". Using \-\-password
+alone without giving a password on the command line
+causes mysqlreport to prompt for a password.
+
+.TP
+\fB\-\-host ADDRESS\fR
+
+.TP
+\fB\-\-port PORT\fR
+
+.TP
+\fB\-\-socket SOCKET\fR
+
+.TP
+\fB\-\-no\-mycnf\fR
+\-\-no\-mycnf makes mysqlreport not read ~/.my.cnf which it does by default
+otherwise. \-\-user and \-\-password always override values from ~/.my.cnf.
+
+.TP
+\fB\-\-dtq\fR
+Print Distribution of Total Queries (DTQ) report (under
+Total in Questions report). Queries (or Questions) can
+be divided into four main areas: DMS (see \-\-dms below),
+Com_ (see \-\-com below), COM_QUIT (see COM_QUIT and
+Questions at http://hackmysql.com/com_quit), and
+Unknown. \-\-dtq lists the number of queries in each of
+these areas in descending order.
+
+.TP
+\fB\-\-dms\fR
+Print Data Manipulation Statements (DMS) report (under
+DMS in Questions report). DMS are those from the MySQL
+manual section 13.2. Data Manipulation Statements.
+(Currently, mysqlreport considers only SELECT, INSERT,
+REPLACE, UPDATE, and DELETE.) Each DMS is listed in
+descending order by count.
+
+.TP
+\fB\-\-com N\fR
+Print top N number of non\-DMS Com_ status values in
+descending order (after DMS in Questions report). If N
+is not given, default is 3. Such non\-DMS Com_ values
+include Com_change_db, Com_show_tables, Com_rollback,
+etc.
+
+.TP
+\fB\-\-sas\fR
+Print report for Select_ and Sort_ status values (after
+Questions report). See MySQL Select and Sort Status
+Variables at http://hackmysql.com/selectandsort.
+
+.TP
+\fB\-\-tab\fR
+Print Threads, Aborted, and Bytes status reports (after
+Created temp report). As of mysqlreport v2.3 the
+Threads report reports on all Threads_ status values.
+
+.TP
+\fB\-\-qcache\fR
+Print Query Cache report.
+.TP
+\fB\-\-all\fR
+Equivalent to "\-\-dtq \-\-dms \-\-com 3 \-\-sas \-\-qcache".
+(Notice \-\-tab is not invoked by \-\-all.)
+
+.TP
+\fB\-\-infile FILE\fR
+Instead of getting SHOW STATUS values from MySQL, read
+values from FILE. FILE is often a copy of the output of
+SHOW STATUS including formatting characters (|, +, \-).
+mysqlreport expects FILE to have the format
+" value number " where value is only alpha and
+underscore characters (A\-Z and _) and number is a
+positive integer. Anything before, between, or after
+value and number is ignored. mysqlreport also needs
+the following MySQL server variables: version,
+table_cache, max_connections, key_buffer_size,
+query_cache_size. These values can be specified in
+INFILE in the format "name = value" where name is one
+of the aforementioned server variables and value is a
+positive integer with or without a trailing M and
+possible periods (for version). For example, to specify
+an 18M key_buffer_size: key_buffer_size = 18M. Or, a
+256 table_cache: table_cache = 256. The M implies
+Megabytes not million, so 18M means 18,874,368 not
+18,000,000. If these server variables are not specified
+the following defaults are used (respectively) which
+may cause strange values to be reported: 0.0.0, 64,
+100, 8M, 0.
+
+.TP
+\fB\-\-outfile FILE\fR
+After printing the report to screen, print the report
+to FILE too. Internally, mysqlreport always writes the
+report to a temp file first: /tmp/mysqlreport.PID on
+*nix, c:\mysqlreport.PID on Windows (PID is the
+script's process ID). Then it prints the temp file to
+screen. Then if \-\-outfile is specified, the temp file
+is copied to OUTFILE. After \-\-email (below), the temp
+file is deleted.
+
+.TP
+\fB\-\-email ADDRESS\fR
+After printing the report to screen, email the report
+to ADDRESS. This option requires sendmail in
+/usr/sbin/, therefore it does not work on Windows.
+/usr/sbin/sendmail can be a sym link to qmail, for
+example, or any MTA that emulates sendmail's \-t
+command line option and operation. The FROM: field is
+"mysqlreport", SUBJECT: is "MySQL status report".
+
+.TP
+\fB\-\-flush\-status\fR
+Execute a "FLUSH STATUS;" after generating the reports.
+If you do not have permissions in MySQL to do this an
+error from DBD::mysql::st will be printed after the
+reports.
+
+.SH "AUTHORS"
+.LP
+Daniel Nichter
+
+If mysqlreport breaks, send me a message from
+http://hackmysql.com/feedback
+with the error.
+
+.SH "SEE ALSO"
+.LP
+mytop(1)
+.LP
+The comprehensive Guide To Understanding mysqlreport at
+http://hackmysql.com/mysqlreportguide.
+
diff --git a/storage/xtradb/build/debian/additions/mysqltest.1 b/storage/xtradb/build/debian/additions/mysqltest.1
new file mode 100644
index 00000000000..3469765fe3b
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqltest.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqltest \- Regression test program for MySQL.
+.SH SYNOPSIS
+mysqltest [options]
+.SH DESCRIPTION
+Runs a test against the mysql server and compares output with a results file.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/pack_isam.1 b/storage/xtradb/build/debian/additions/pack_isam.1
new file mode 100644
index 00000000000..cad153eedee
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/pack_isam.1
@@ -0,0 +1,19 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisampack \- Compresses MySQL database files.
+.SH SYNOPSIS
+myisampack [options]
+.SH DESCRIPTION
+Packs an ISAM table so that it takes much less space. Keys are not
+updated, so you must run isamchk -rq on any table that has keys after
+you have compressed it. You should give the .ISM file as the filename
+argument.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/resolve_stack_dump.1 b/storage/xtradb/build/debian/additions/resolve_stack_dump.1
new file mode 100644
index 00000000000..2a1e2770275
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/resolve_stack_dump.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+resolve_stack_dump \- MySQL helper program for reporting bugs.
+.SH SYNOPSIS
+resolve_stack_dump [options]
+.SH DESCRIPTION
+Resolves a numeric stack trace dump into symbols.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/resolveip.1 b/storage/xtradb/build/debian/additions/resolveip.1
new file mode 100644
index 00000000000..7aa9439394d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/resolveip.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+resolveip \- MySQL helper program to retrieve IP addresses.
+.SH SYNOPSIS
+resolveip [options]
+.SH DESCRIPTION
+Gets the hostname for a given IP address or the IP address for a given hostname.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/changelog b/storage/xtradb/build/debian/changelog
new file mode 100644
index 00000000000..6dc9b561634
--- /dev/null
+++ b/storage/xtradb/build/debian/changelog
@@ -0,0 +1,4186 @@
+percona-xtradb-dfsg-5.1 (5.1.36-1) experimental; urgency=low
+
+ [TODO]
+ * Link libmysqlclient.so to libmysqlclient_r.so to help applications
+ like Apache where some modules, like libaprutil, want to use the thread
+ safe library and some, like PHP, do not. As the client library just copies
+ data between client and server, we do not expect significant performance
+ losses. (thanks to Stefan Fritsch). Closes: #450535
+
+ Add the following to libmysqlclient16.links:
+ usr/lib/libmysqlclient_r.so.16.0.0 usr/lib/libmysqlclient.so.16.0.0
+
+ * Ex-maintainer upload :)
+ * New upstream release.
+ * SECURITY: Upstream fix for "mysql client does not escape strings in
+ --html mode." (CVE-2008-4456) Closes: #526254
+ * Upstream fixes REPEAT() function. Closes: #447028
+ * Upstream fixes problems when mixing ORDER and GROUP BY. Closes: #470854
+ * There were many innodb fixes in the last two years, probably
+ also for this unreproducible crash. Closes: #447713
+ * Removed amd64 specific -fPIC compiler option that was introduced
+ especially for building the NDB cluster module which is no longer
+ part of this package (thanks to Modestas Vainius). Closes: #508406
+ * Put /etc/mysql/conf.d to mysql-server-5.1.dirs (thanks to Alexander
+ Gerasiov). Closes: #515145
+ * Fixed mysql-test suite by adding 50_mysql-test__db_test.dpatch.
+ It now passes 100% of the tests again. Also Closes: #533999
+ * Preinst now prevents installation if an NDB configuration is detected.
+ * Applied Ubuntu patch that fixes privilege bootstrapping in postinst
+ (thanks to Mathias Gug). Closes: #535492
+ * Applied Ubuntu patch that sets the debconf prio for the root password
+ question to high and prevents it from being asked on 5.0 -> 5.1 upgrades
+ (thanks to Mathias Gug). Closes: #535500
+ * Removed the check for ISAM tables as the only supported upgrade path is
+ from lenny's MySQL-5.0.
+ * Added /etc/mysql/conf.d/mysqld_safe_syslog.cnf which enables mysqld_safe
+ to pipe all mysqld output into the syslog. The reason for not letting dpkg
+ handle it via a normal config file change was that my.cnf is usually
+ heavily tuned by the admin so the setting would get lost too easily.
+ * Updated mysqlreport to version 3.5 (including two minor patches by me).
+
+ -- Christian Hammers <ch@debian.org> Wed, 01 Jul 2009 20:54:58 +0200
+
+mysql-dfsg-5.1 (5.1.34-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Mon, 20 Apr 2009 20:23:10 +0200
+
+mysql-dfsg-5.1 (5.1.33-2) experimental; urgency=low
+
+ * Remove no longer active developers from uploaders field.
+ * Drop workaround for upgrades from MySQL 3.23, not necessary any more.
+
+ -- Norbert Tretkowski <nobse@debian.org> Tue, 07 Apr 2009 11:23:25 +0200
+
+mysql-dfsg-5.1 (5.1.33-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Thu, 02 Apr 2009 21:12:23 +0200
+
+mysql-dfsg-5.1 (5.1.32-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Fri, 06 Mar 2009 18:48:23 +0100
+
+mysql-dfsg-5.1 (5.1.31-2) experimental; urgency=low
+
+ * Update SSL certificates, and re-enable SSL related tests when running
+ the testsuite.
+
+ -- Norbert Tretkowski <nobse@debian.org> Tue, 10 Feb 2009 16:08:42 +0100
+
+mysql-dfsg-5.1 (5.1.31-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 08 Feb 2009 17:07:11 +0100
+
+mysql-dfsg-5.1 (5.1.30-2) experimental; urgency=low
+
+ * Drop MySQL Cluster support, it's deprecated since 5.1.24-RC.
+ * Fix FTBFS if build twice in a row. (closes: #487091)
+
+ -- Norbert Tretkowski <nobse@debian.org> Fri, 05 Dec 2008 21:04:55 +0100
+
+mysql-dfsg-5.1 (5.1.30-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Thu, 27 Nov 2008 09:09:55 +0100
+
+mysql-dfsg-5.1 (5.1.29rc-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Mon, 27 Oct 2008 20:00:43 +0100
+
+mysql-dfsg-5.1 (5.1.26rc-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Mon, 14 Jul 2008 21:46:59 +0200
+
+mysql-dfsg-5.1 (5.1.25rc-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sat, 21 Jun 2008 13:55:02 +0200
+
+mysql-dfsg-5.1 (5.1.24rc-1) experimental; urgency=low
+
+ * New upstream release.
+ * Ignore errors in testsuite on ia64 and s390.
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 16 Apr 2008 22:03:44 +0200
+
+mysql-dfsg-5.1 (5.1.23rc-1) experimental; urgency=low
+
+ * New upstream release.
+
+ [ Christian Hammers ]
+ * Add PIC support for NDB libraries on amd64 (thanks to Monty Taylor).
+ * Add extra information when aborting due to a detected downgrade (thanks to
+ Raphael Pinson).
+ * Move libndbclient.so.3 to its own package as it now has a version != 0
+ (thanks to Raphael Pinson for reminding me).
+
+ [ Monty Taylor ]
+ * Remove 85_ndb__staticlib.dpatch since we have a libndbclient package now.
+ * Add myself to the uploaders so that I don't get complaints about package
+ signing.
+ * Add libndbclient-dev package to go with libndbclient3.
+
+ [ Norbert Tretkowski ]
+ * Update patches:
+ + 41_scripts__mysql_install_db.sh__no_test.dpatch
+ * Drop patches:
+ + 70_upstream_debian__configure.dpatch
+ + 71_upstream_debian__Makefile.in.dpatch
+ + 99_TEMP_minmax.dpatch
+ * Remove Adam Conrad from uploaders on his request. Thanks for your work in
+ the past!
+ * Ignore errors in testsuite on amd64 and i386.
+
+ -- Norbert Tretkowski <nobse@debian.org> Fri, 29 Feb 2008 10:38:27 +0100
+
+mysql-dfsg-5.1 (5.1.22rc-1) experimental; urgency=low
+
+ * New upstream version.
+ * Let mysql-server-5.1 pre-depend on debconf as it uses it in the preinst.
+ * Fixed mysql-client-5.1 menu entry for upcoming menu policy 1.4.
+
+ -- Christian Hammers <ch@debian.org> Tue, 02 Oct 2007 22:45:37 +0200
+
+mysql-dfsg-5.1 (5.1.21beta-1) experimental; urgency=low
+
+ * My "Greetings from FrOSCon!" release.
+ * New upstream version.
+ * libmysqlclient.so.15 has been superseded by libmysqlclient.so.16.
+ * Renamed libmysqlclient15-dev to libmysqlclient-dev but added an empty
+ package libmysqlclient15-dev to ease the transition for packages with
+ a versioned build-dep to libmysqlclient15-dev which is something that
+ currently does not work with "Provides:".
+ * Synced with 5.0 branch up to subversion release r909.
+ * Commented out most of the compile conditionals in the hope that
+ all architectures can be build the same way.
+ * Added a lot of new binaries and manpages.
+ * Switched to plugin based engines.
+
+ -- Christian Hammers <ch@debian.org> Sat, 25 Aug 2007 14:24:40 +0200
+
+mysql-dfsg-5.1 (5.1.19beta-1) experimental; urgency=low
+
+ * New upstream release.
+
+ -- Christian Hammers <ch@debian.org> Mon, 11 Jun 2007 23:18:35 +0200
+
+mysql-dfsg-5.1 (5.1.16beta-4) experimental; urgency=high
+
+ * Merged with 5.0 r850:
+ * SECURITY:
+ In some previous versions mysql_install_db was not idempotent and
+ always created passwordless root accounts although it should do so
+ only on initial installs (thanks to Olaf van der Spek). Closes: #418672
+ * Added check for passwordless root accounts to debian-start.
+ * As MySQL-5.0 is, at least currently, incompatible with Kernel 2.4 the
+ installation is aborted for such old kernels. Debian Etch does not
+ support them anyway according to the release notes but this might be
+ unexpected and many production servers still have self-built ones
+ installed (thanks to Marc-Christian Petersen). See: #416841
+ * Adjusted TeX build-deps to texlive.
+ * Added innotop.
+ * Changed maintainer email address to
+ pkg-mysql-commits@lists.alioth.debian.org
+
+ -- Christian Hammers <ch@debian.org> Thu, 19 Apr 2007 19:29:29 +0200
+
+mysql-dfsg-5.1 (5.1.16beta-3) experimental; urgency=low
+
+ * Merged with 5.0 r837:
+ * Activated the blackhole engine as it's needed for replicating partition
+ designs (thanks to Cyril SCETBON).
+ * Fixed segfault on i486 systems without cpuid instruction (thanks to
+ Lennart Sorensen). Closes: #410474
+ * Only use of the non-essential debconf package in postrm if it is
+ still installed (thanks to Michael Ablassmeier). Closes: #416838
+
+ -- Christian Hammers <ch@debian.org> Sun, 18 Mar 2007 21:48:11 +0100
+
+mysql-dfsg-5.1 (5.1.16beta-2) experimental; urgency=low
+
+ * Merged with 5.0 r818:
+ * Fixed FTBFS on Sparc introduced with the "make -j" trick in
+ 5.0.32-8 (thanks to Frank Lichtenheld). Closes: #415026
+
+ -- Christian Hammers <ch@debian.org> Sun, 18 Mar 2007 21:20:11 +0100
+
+mysql-dfsg-5.1 (5.1.16beta-1) experimental; urgency=low
+
+ * New upstream release.
+ * SECURITY: Using an INFORMATION_SCHEMA table with ORDER BY in a subquery
+ could cause a server crash (CVE-2007-1420).
+ * Added temporary patch 90_TEMP_sqlparse-ifdef to avoid build problems.
+ * Merged with 5.0 r809:
+ * Updated mysqlreport to latest upstream (and patched --help usage
+ message and "return if qcache_size==0").
+ * Merged with 5.0 r798:
+ * Adapt MAKE_J to use the -j option with the number of available
+ processors. (thanks to Raphael Pinson).
+ * Merged with 5.0 r758:
+ * Changed minimum required version in dh_makeshlibs to 5.0.27-1 as
+ 5.0.26 had an ABI breakage in it!
+ This is the cause of Perl programs crashing with the following error:
+ Transactions not supported by database at /usr/lib/perl5/DBI.pm line 672
+ * Added some more comments to the default my.cnf.
+ * Added support for /etc/mysql/conf.d/.
+ * The debian-start script that runs on every server start now first upgrades
+ the system tables (if necessary) and then checks them, as the other way
+ around sometimes did not work (e.g. for MediaWiki). The script now uses
+ mysql_update instead of mysql_update_script as recommended. See: #409780
+
+ -- Christian Hammers <ch@debian.org> Fri, 2 Mar 2007 01:00:55 +0100
+
+mysql-dfsg-5.1 (5.1.15beta-1) experimental; urgency=low
+
+ * New upstream release.
+ [Monty Taylor]
+ * Removed patches/25_mysys__default.c - fixed upstream.
+ * Removed patches/26_client__mysql_upgrade.c - fixed upstream.
+ * Removed patches/29_scripts__mysqlbug.sh - fixed upstream.
+ * Removed patches/39_scripts__mysqld_safe.sh__port_dir - fixed upstream.
+ * Removed patches/42_scripts__mysqldumpslow__slowdir - fixed upstream.
+ * Removed patches/45_warn-CLI-passwords - fixed upstream.
+ * Removed patches/89_ndb__records.dpatch - fixed upstream.
+ * Removed patches/86_ndbapi_tc_selection.dpatch - fixed upstream.
+ [Christian Hammers]
+ * Synced with 5.0.32-4.
+ * mysql-server-5.0 pre-depends on adduser now and has --disabled-login
+ explicitly added to be on the safe side (thanks to the puiparts team).
+ Closes: #408362
+ * Corrected the terminology regarding NDB in the comments of all config
+ files and init scripts (thanks to Geert Vanderkelen of MySQL).
+
+ -- Christian Hammers <ch@debian.org> Wed, 7 Feb 2007 11:34:52 -0200
+
+mysql-dfsg-5.1 (5.1.14beta-2) experimental; urgency=low
+
+ [Christian Hammers]
+ * Readded 85_ndb__staticlib.dpatch with slight modifications.
+ * Backported debian-start scripts from 5.0.
+ [Monty Taylor]
+ * Now build-depends on bison.
+ * Updated to standards 3.7.2.
+ * Removed references to comp_err.
+ * build-depend on automake1.9 to match upstream
+ * Merged runlevel changes from 5.0.
+ * Added 26_client__mysql_upgrade.c.dpatch to fix a segfault in mysql_upgrade
+ when using a password. It's been fixed upstream in 5.1.15.
+ * Moved BDB check to sanity_checks() and added a note about deprecation.
+ * Use my_print_defaults instead of mysqld --print-defaults
+ * Changed NDB Data and Management node startup sequence. Prevented both
+ from restarting on upgrade to address rolling upgrade issues.
+ * Added a "start-initial" option to the Data Node init script to support
+ initial node starts.
+ * Added 86_ndbapi_tc_selection.dpatch to fix a bug that causes a segfault
+ when using the NdbApi. http://bugs.mysql.com/bug.php?id=24914
+ Fixed in 5.1.15
+ * Added 89_ndb__records.dpatch to fix
+ http://bugs.mysql.com/bug.php?id=25567, which causes a table scan per
+ table per query.
+
+ -- Christian Hammers <ch@debian.org> Wed, 31 Jan 2007 01:17:35 +0100
+
+mysql-dfsg-5.1 (5.1.14beta-1) experimental; urgency=low
+
+ * New upstream.
+ * Removed references to mysql_explain_log
+ * Changed context for patch to mysqld_multi.1
+ * Removed 70_kfreebsd.dpatch - applied to upstream
+ * Removed 87_ps_Hurd - applied to upstream
+ * Replaced --without-readline with --with-libedit in the configure options,
+ as --without-readline doesn't seem to do the right thing anymore.
+
+ -- Monty Taylor <mordred@inaugust.com> Wed, 10 Jan 2007 12:59:55 -0800
+
+mysql-dfsg-5.1 (5.1.11beta-1) experimental; urgency=low
+
+ * Starting new 5.1 branch!
+ * FIXME: Following patch couldn't be applied:
+ ## 85_ndb__staticlib.dpatch by <ch@debian.org>
+ * FIXME: Following patch couldn't be applied:
+ ## 86_PATH_MAX.dpatch
+
+ -- Christian Hammers <ch@debian.org> Sat, 29 Jul 2006 11:35:42 +0200
+
+mysql-dfsg-5.0 (5.0.51a-19) UNRELEASED; urgency=low
+
+ * New patch 50_fix_mysqldump2.dpatch from 5.0.60 to fix dumping databases
+ from mysql 4.0 server. (closes: #507789)
+ * Don't create a guest account during bootstrap. (closes: #463704)
+
+ -- Norbert Tretkowski <nobse@debian.org> Thu, 04 Dec 2008 23:07:19 +0100
+
+mysql-dfsg-5.0 (5.0.51a-18) testing-proposed-updates; urgency=high
+
+ * SECURITY:
+ Fix for CVE-2008-4098: Inadequate validation of paths used in DATA
+ DIRECTORY and INDEX DIRECTORY clauses of CREATE TABLE statements enabled
+ attackers to write to tables in other databases to which they could not
+ ordinarily have access.
+
+ -- Devin Carraway <devin@debian.org> Tue, 25 Nov 2008 05:38:45 +0000
+
+mysql-dfsg-5.0 (5.0.51a-17) testing-proposed-updates; urgency=low
+
+ * Don't use commented out passwords from debian.cnf. (closes: #453820)
+ * Update watch file to recognize releases > 5.0.45.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 02 Nov 2008 13:31:32 +0100
+
+mysql-dfsg-5.0 (5.0.51a-16) unstable; urgency=low
+
+ * New patch 60_rpl_test_failure.dpatch from 5.0.54 to fix a race condition
+ with the rpl_packet test in some cases. (closes: #501413)
+
+ -- Norbert Tretkowski <nobse@debian.org> Thu, 09 Oct 2008 08:50:43 +0200
+
+mysql-dfsg-5.0 (5.0.51a-15) unstable; urgency=high
+
+ * SECURITY:
+ Fix for CVE-2008-3963: An empty bit-string literal (b'') caused a server
+ crash. Now the value is parsed as an empty bit value (which is treated as
+ an empty string in string context or 0 in numeric context).
+ (closes: #498362)
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 14 Sep 2008 18:27:46 +0200
+
+mysql-dfsg-5.0 (5.0.51a-14) unstable; urgency=low
+
+ * Update debconf translations:
+ - Swedish, from Martin Bagge. (closes: #491688)
+ - Netherlands, from Thijs Kinkhorst. (closes: #492723)
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 07 Sep 2008 20:18:31 +0200
+
+mysql-dfsg-5.0 (5.0.51a-13) unstable; urgency=medium
+
+ * New patch 59_fix_relay_logs_corruption.dpatch from 5.0.56 to fix
+ corruption in relay logs. (closes: #463515)
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 03 Sep 2008 09:13:46 +0200
+
+mysql-dfsg-5.0 (5.0.51a-12) unstable; urgency=low
+
+ * Disable rpl_ndb_innodb_trans test when running the testsuite, fails
+ randomly on i386. (closes: #494238)
+
+ -- Norbert Tretkowski <nobse@debian.org> Sat, 09 Aug 2008 15:56:45 +0200
+
+mysql-dfsg-5.0 (5.0.51a-11) unstable; urgency=low
+
+ * Disable innodb_handler test when running the testsuite, fails randomly
+ on s390. (closes: #491363)
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 23 Jul 2008 08:34:51 +0200
+
+mysql-dfsg-5.0 (5.0.51a-10) unstable; urgency=high
+
+ * Merge testing-security upload to finally fix CVE-2008-2079, thanks to
+ Devin Carraway and Steffen Joeris. (closes: #480292)
+ * New patch 58_disable-ndb-backup-print.dpatch from 5.0.54 to disable
+ ndb_backup_print, ndb_alter_table and ndb_replace tests when running the
+ testsuite. (closes: #474893)
+ * Reenable error handling in testsuite on i386; disabling it was just a
+ workaround for the problem which is now fixed with the above patch.
+ * Update debconf translations:
+ - Vietnamese, from Clytie Siddall. (closes: #486443)
+ - Spanish, from Javier Fernández-Sanguino Peña. (closes: #488740)
+ - Slovak, from helix84. (closes: #489266)
+ * Make lintian happy:
+ - Fix build-dependency on -1 revision.
+ - Fix deprecated chown usage.
+ - Fix spelling error in description.
+
+ -- Norbert Tretkowski <nobse@debian.org> Tue, 15 Jul 2008 19:37:35 +0200
+
+mysql-dfsg-5.0 (5.0.51a-9+lenny2) testing-security; urgency=high
+
+ * Non-maintainer upload by the security team.
+ * Correct error number in symlink.test to avoid FTBFS on some archs.
+
+ -- Steffen Joeris <white@debian.org> Sun, 13 Jul 2008 11:44:57 +0000
+
+mysql-dfsg-5.0 (5.0.51a-9+lenny1) testing-security; urgency=high
+
+ * Non-maintainer upload by the security team.
+ * Correct and expand 92_SECURITY_CVE-2008-2079.dpatch to cover all symlinks
+ and check the output of fn_format(). (closes: #480292)
+ Fixes: CVE-2008-2079
+
+ -- Steffen Joeris <white@debian.org> Sat, 12 Jul 2008 05:30:39 +0000
+
+mysql-dfsg-5.0 (5.0.51a-9) unstable; urgency=low
+
+ * Ignore errors in testsuite on i386. (workaround for #474893)
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 25 Jun 2008 15:07:03 +0200
+
+mysql-dfsg-5.0 (5.0.51a-8) unstable; urgency=low
+
+ * New patch 80_fix_user_setup_on_localhost.dpatch from Daniel Hahler to fix
+ a duplicate key error when installing MySQL server on a host with hostname
+ localhost. (closes: #478319)
+ * Really fix build on non-linux systems, this time without producing a build
+ error on some architectures. (closes: #485971)
+ * Update debconf translations:
+ - French, from Christian Perrier. (closes: #478553)
+ - German, from Alwin Meschede. (closes: #478672)
+ - Italian, from Luca Monducci. (closes: #479363)
+ - Czech, from Miroslav Kure. (closes: #480924)
+ - Galician, from Jacobo Tarrio. (closes: #480965)
+ - Basque, from Piarres Beobide. (closes: #481840)
+ - Swedish, from Martin Bagge. (closes: #482466, #486307)
+ - Turkish, from Mert Dirik. (closes: #484704)
+ - Russian, from Yuri Kozlov. (closes: #486149)
+ - Finnish, from Esko Arajärvi. (closes: #486554)
+ - Portuguese, from Miguel Figueiredo. (closes: #486709)
+ - Romanian, from Eddy Petrișor. (closes: #486944)
+ - Japanese, from Hideki Yamane. (closes: #487270)
+
+ -- Norbert Tretkowski <nobse@debian.org> Sat, 21 Jun 2008 19:20:48 +0200
+
+mysql-dfsg-5.0 (5.0.51a-7) unstable; urgency=high
+
+ [ Norbert Tretkowski ]
+ * SECURITY:
+ Fix for CVE-2008-2079: It was possible to circumvent privileges through
+ the creation of MyISAM tables employing the DATA DIRECTORY and INDEX
+ DIRECTORY options to overwrite existing table files in the MySQL data
+ directory. Use of the MySQL data directory in DATA DIRECTORY and INDEX
+ DIRECTORY is now disallowed. Patch from openSUSE 11.0, thanks to Michal
+ Marek. (closes: #480292)
+ * Fix build on non-linux systems, like hurd-i386. (closes: #480362)
+ * Include symlinks for mysqlcheck. (closes: #480647)
+
+ [ Monty Taylor ]
+ * Remove ndb_cpcd, as it is only for the NDB test suite and not useful as a
+ public program.
+ * Fix debian-start.inc.sh for table names with characters needing quotes.
+ Thanks Felix Rublack! (closes: #480525, #481154, #481303, #484012)
+ * Delete mysql-common.README.Debian. Nothing in it was relevant, and the
+ useful information is in mysql-server anyway. (closes: #480940)
+ * Remove a spurious HOME= in logrotate script.
+
+ -- Norbert Tretkowski <nobse@debian.org> Thu, 05 Jun 2008 11:49:45 +0200
+
+mysql-dfsg-5.0 (5.0.51a-6) unstable; urgency=low
+
+ * Fix debian-start.inc.sh to not print the row counts of the tables
+ queried. (closes: #478256, #479697)
+
+ -- Monty Taylor <mordred@inaugust.com> Wed, 14 May 2008 00:47:46 -0700
+
+mysql-dfsg-5.0 (5.0.51a-5) unstable; urgency=medium
+
+ * New patch 57_fix_mysql_replication.dpatch from 5.0.54 to fix directory for
+ relay logs when using replication.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 27 Apr 2008 13:55:04 +0200
+
+mysql-dfsg-5.0 (5.0.51a-4) unstable; urgency=low
+
+ [ Monty Taylor ]
+ * Remove build of ndb docs, since they are not installed. Removed build deps
+ on TeX and doxygen since that's all they were there for.
+ * Replace script in check_for_crashed_tables with a myisam-recover option
+ and a script to trigger a check of those tables. (thanks HarrisonF and
+ kolbe)
+ * Replace direct calls to test suite with calls to the make targets used by
+ the MySQL build and qa teams for releases.
+ * Add --skip-ndbcluster to the postinst bootstrap command. It's really a
+ workaround for a bug in 5.1, but it's probably a good idea anyway since we
+ certainly don't need cluster to spin up, and if people have enabled
+ cluster in their my.cnf file, there could be postinst issues if cluster
+ isn't running.
+ * Remove reference to configure options that no longer exist.
+ * Add myself to uploaders.
+
+ [ Norbert Tretkowski ]
+ * New patch 56_fix_order_by.dpatch from Ubuntu to fix ORDER BY not working
+ with GROUP BY. (closes: #471737)
+ * Add note about filename extensions in the /etc/mysql/conf.d/ directory in
+ my.cnf. (closes: #461759)
+ * Confirm password on install, patch from Nicolas Valcárcel.
+ (closes: #471887)
+ * Remove Adam Conrad from uploaders on his request. Thanks for your work in
+ the past!
+ * Use lsb_release to detect distribution.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sat, 05 Apr 2008 21:51:43 +0200
+
+mysql-dfsg-5.0 (5.0.51a-3) unstable; urgency=low
+
+ * Disable patch 60_raise-max-keylength.dpatch in default build, but still
+ ship it in the source package.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 17 Feb 2008 18:54:42 +0100
+
+mysql-dfsg-5.0 (5.0.51a-2) unstable; urgency=low
+
+ * Replace 54_ssl-client-support.dpatch added in 5.0.51-2 with patch from
+ upstream.
+ * Ignore errors in testsuite on powerpc.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 17 Feb 2008 12:42:58 +0100
+
+mysql-dfsg-5.0 (5.0.51a-1) unstable; urgency=low
+
+ [ Norbert Tretkowski ]
+ * New upstream security hotfix release. Low priority upload anyway because
+ 5.0.51-3 already contained all security fixes.
+ * Remove patches:
+ + debian/patches/51_mysqlcheck-result.dpatch
+ + debian/patches/92_SECURITY_CVE-2007-6303.dpatch
+ + debian/patches/93_SECURITY_CVE-2007-6304.dpatch
+ + debian/patches/94_SECURITY_CVE-2008-0226+0227.dpatch
+ * Add recommendation on libhtml-template-perl to -server package, used by
+ ndb_size. (closes: #462265)
+ * New patch 60_raise-max-keylength.dpatch to raise the maximum key length to
+ 4005 bytes or 1335 UTF-8 characters. (closes: #463137)
+ * New patch 51_sort-order.dpatch from 5.0.52 to fix incorrect order when
+ using range conditions on 2 tables or more.
+ * Support DEB_BUILD_OPTIONS option 'nocheck' to skip tests.
+ * Update mysqlreport to 3.4a release.
+
+ [ Luk Claes ]
+ * Updated Japanese debconf translation. (closes: #462158)
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 06 Feb 2008 11:57:45 +0100
+
+mysql-dfsg-5.0 (5.0.51-3) unstable; urgency=high
+
+ * SECURITY:
+ Fix for CVE-2008-0226 and CVE-2008-0227: Three vulnerabilities in yaSSL
+ versions 1.7.5 and earlier were discovered that could lead to a server
+ crash or execution of unauthorized code. The exploit requires a server
+ with yaSSL enabled and TCP/IP connections enabled, but does not require
+ valid MySQL account credentials. The exploit does not apply to OpenSSL.
+ (closes: #460873)
+ * Fix LSB header in init scripts (patch from Petter Reinholdtsen).
+ (closes: #458798)
+ * Run testsuite on all archs, but ignore errors on alpha, arm, armel, hppa,
+ mipsel and sparc. (closes: #460402)
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 23 Jan 2008 11:37:11 +0100
+
+mysql-dfsg-5.0 (5.0.51-2) unstable; urgency=low
+
+ [ Monty Taylor ]
+ * Added --with-system-type to set the version_compile_os field.
+ * Cleaned up some lintian warnings.
+ * Removed 43_scripts__mysql_update__password.dpatch since we don't use
+ mysql_upgrade_shell anymore and use mysql_upgrade instead.
+ * Removed 88_mctype_attrib.dpatch; http://bugs.mysql.com/bug.php?id=25118 is
+ closed with http://lists.mysql.com/commits/24337
+ * Added mysql-community/mysql-enterprise virtual packages in provides and
+ conflicts to ease transitions between versions.
+
+ [ Norbert Tretkowski ]
+ * Add -fPIC to CFLAGS to allow other packages to be built against
+ libmysqld.a on amd64. (closes: #457915)
+ * New patch 55_testsuite-2008.dpatch to fix FTBFS in testsuite.
+ (closes: #458695)
+ * New patch 54_ssl-client-support.dpatch to fix SSL client support.
+ * Don't run testsuite on alpha, arm, hppa, mipsel and sparc.
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 02 Jan 2008 18:40:04 +0100
+
+mysql-dfsg-5.0 (5.0.51-1) unstable; urgency=low
+
+ * New upstream release.
+ + Fix a crash in mysql_client_test due to gcc 4.x optimizations.
+ (closes: #452558)
+ * Update patches:
+ + debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch
+ + debian/patches/89_ndb__staticlib.dpatch
+ * Run testsuite after build.
+ * Re-add manpages; they are licensed under the GPL now and redistribution is
+ permitted.
+ * Drop linux-libc-dev build-dependency, it's now being pulled by libc-dev
+ which is build-essential. (closes: #431018)
+ * Remove old optimizations for MySQL 3.23.x, they are no longer required.
+ (closes: #436552)
+ * Don't fail when upgrading mysql-common if $datadir is empty or not defined
+ (patch from Edward Allcutt). (closes: #453127)
+ * New patch from 5.0.52 to fix mysqldump showing 'null' as the type of
+ fields for a view with a bad definer. (closes: #454227)
+ * New patch from 5.0.52 to fix mysqlcheck test result.
+ * New patch from 5.0.52 to fix wrong optimization in ndb code when building
+ with gcc 4.2.x.
+ * New patch from 5.0.54 to fix wrong number output due to integer overflow
+ when building with gcc 4.2.x.
+ * New Finnish debconf translation from Esko Arajärvi. (closes: #448776)
+ * Update Basque debconf translation from Aitor Ibañez. (closes: #456193)
+ * Add Vcs-* and Homepage fields to source stanza in control file.
+ * Update mysqlreport to 3.2 release.
+ * Let mysql-server-5.0 pre-depend on debconf, because its preinst is using
+ it.
+ * Drop menu item for innotop.
+
+ -- Norbert Tretkowski <nobse@debian.org> Fri, 14 Dec 2007 09:59:36 +0100
+
+mysql-dfsg-5.0 (5.0.45-5) unstable; urgency=high
+
+ * SECURITY:
+ Fix for CVE-2007-6303: ALTER VIEW retained the original DEFINER value,
+ even when altered by another user, which could allow that user to gain the
+ access rights of the view. Now ALTER VIEW is allowed only to the original
+ definer or users with the SUPER privilege. (closes: #455737)
+ * SECURITY:
+ Fix for CVE-2007-6304: When using a FEDERATED table, the local server can
+ be forced to crash if the remote server returns a result with fewer columns
+ than expected.
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 12 Dec 2007 20:23:43 +0100
+
+mysql-dfsg-5.0 (5.0.45-4) unstable; urgency=high
+
+ * SECURITY:
+ Fix for CVE-2007-5969: Using RENAME TABLE against a table with explicit
+ DATA DIRECTORY and INDEX DIRECTORY options can be used to overwrite system
+ table information by replacing the file to which the symlink points.
+ (closes: #455010)
+
+ -- Norbert Tretkowski <nobse@debian.org> Sun, 09 Dec 2007 12:29:54 +0100
+
+mysql-dfsg-5.0 (5.0.45-3) unstable; urgency=high
+
+ * SECURITY:
+ Fix for CVE-2007-5925: The convert_search_mode_to_innobase function in
+ ha_innodb.cc in the InnoDB engine in MySQL 5.1.23-BK and earlier allows
+ remote authenticated users to cause a denial of service (database crash)
+ via a certain CONTAINS operation on an indexed column, which triggers an
+ assertion error. (closes: #451235)
+
+ -- Norbert Tretkowski <nobse@debian.org> Thu, 15 Nov 2007 18:40:11 +0100
+
+mysql-dfsg-5.0 (5.0.45-2) unstable; urgency=low
+
+ * Package is now team-maintained. (closes: #421026)
+
+ [ Sean Finney ]
+ * New/updated debconf translations:
+ - Spanish, from Javier Fernández-Sanguino Peña (closes: #426442).
+ - German, from Alwin Meschede (closes: #426545).
+ - Danish, from Claus Hindsgaul (closes: #426783).
+ - French, from Christian Perrier (closes: #430944).
+ * Add Recommends on libterm-readkey-perl for mysql-client-5.0 package, used
+ by mysqlreport add-on to mask password entry (closes: #438375).
+
+ [ Norbert Tretkowski ]
+ * Add myself to uploaders.
+ * Suggest usage of an UPDATE statement on the user table to change the mysql
+ root user password instead of using mysqladmin, to catch all root users
+ from all hosts. (closes: #435744)
+ * Remove information about a server crash during flush-logs when
+ expire_logs_days is enabled but log-bin is not; this bug was already fixed
+ in 5.0.32. (closes: #368547)
+ * Disable log_bin option in default config file and add a note to the NEWS
+ file. (closes: #349661)
+ * Fix FTBFS if built twice in a row. (closes: #442684)
+ * Remove check for buggy options from init script.
+ * Update innotop to 1.6.0 release.
+ * Add mysqlreport and innotop to mysql-client description.
+ * Use shorter server version string.
+
+ -- Norbert Tretkowski <nobse@debian.org> Wed, 14 Nov 2007 20:00:06 +0100
+
+mysql-dfsg-5.0 (5.0.45-1) unstable; urgency=low
+
+ * New upstream release.
+
+ [sean finney]
+ * removed patches that are incorporated into the latest release:
+ - 70_cpuid_on_i486.dpatch
+ - 91_SECURITY_CVE-2007-2691_alter-drop
+ * new patch 90_upstreamdebiandir.dpatch to keep a few lingering references
+ to the upstream ./debian dir out of the build, at least until we find
+ a nice way to collaborate on sharing the directory.
+ * updated CRUFT list to fix double-build breakage (closes: #424590).
+ * add conditional build-deps for linux-libc-dev to fix FTBFS for
+ non-linux arch's (closes: #431018).
+ * added notes to my.cnf and README.Debian about setting tmpdir when
+ configuring a replication slave. thanks to Rudy Gevaert for pointing
+ this out (closes: #431825).
+
+ -- sean finney <seanius@debian.org> Tue, 17 Jul 2007 23:50:33 +0200
+
+mysql-dfsg-5.0 (5.0.41a-1) unstable; urgency=high
+
+ [sean finney]
+ * SECURITY:
+ Fix for CVE-2007-2691: DROP/RENAME TABLE statements (closes: #424778).
+ [Christian Hammers]
+ * Removed all manpages from the source (therefore the "41a") as they
+ are not licensed under the GPL and redistribution is not permitted
+ (thanks to Mathias Gug). Closes: #430018
+ * Added linux-libc-dev to the build-depends as otherwise an illegal dependency
+ on asm/atomic.h is generated in /usr/include/mysql/my_global.h. Closes: #424276
+ [Christian Perrier]
+ * Debconf templates and debian/control reviewed by the debian-l10n-
+ english team as part of the Smith review project. Closes: #419974
+ * Debconf translation updates:
+ - French. Closes: #422187
+ - Galician. Closes: #420118
+ - Italian. Closes: #421349
+ - Brazilian Portuguese. Closes: #421516
+ - Arabic. Closes: #421751
+ - Czech. Closes: #421766
+ - Portuguese. Closes: #422428
+
+ -- Christian Hammers <ch@debian.org> Sun, 24 Jun 2007 21:12:42 +0200
+
+mysql-dfsg-5.0 (5.0.41-2) unstable; urgency=low
+
+ * the previous "translation changes" inadvertently introduced unrelated
+ changes in the package control file.
+
+ -- sean finney <seanius@debian.org> Sun, 13 May 2007 12:32:45 +0200
+
+mysql-dfsg-5.0 (5.0.41-1) unstable; urgency=low
+
+ * New upstream release
+ [sean finney]
+ * Bump the priority of the debconf prompt for the root password to high, to
+ ensure the question shows up in a default installation (closes: #418672).
+ * Debconf templates and debian/control reviewed by the debian-l10n-
+ english team as part of the Smith review project. Closes: #419974
+ * Debconf translation updates:
+ - French. Closes: #422187
+ - Galician. Closes: #420118
+ - Italian. Closes: #421349
+ - Brazilian Portuguese. Closes: #421516
+ - Arabic. Closes: #421751
+ - Czech. Closes: #421766
+ - Portuguese. Closes: #422428
+ * massaged the local PATH_MAX patch.
+ * removed temp sql parsing patch which has been incorporated upstream
+ * upstream no longer includes the mysql_create_system_tables command,
+ so removed our local patches for it.
+ * the following issues may have been fixed in a previous version of
+ mysql-server-5.0, but the exact version is not clear so they will be
+ marked as fixed in this version.
+ * lots of NDB-related fixes, including those related to problems with
+ AUTO_INCREMENT (closes: #310878).
+ * fix for "connections remaining in sleep state" (closes: #318011).
+ * fix for "denies queries randomly" (closes: #399602).
+ * problems indexing on char() binary fields were ISAM specific, which is
+ no longer supported (closes: #326698).
+ * fix for problems with "complicated joins" (closes: #348682).
+ * fix for problems with "flushing logs, server crash" (closes: #348682).
+ * fix for AUTO_INCREMENT and duplicate keys (closes: #416145).
+ * fix for "DROP FUNCTIONS doesn't work" (closes: #290670).
+
+ -- sean finney <seanius@debian.org> Sat, 12 May 2007 12:10:20 +0200
+
+mysql-dfsg-5.0 (5.0.38-3) unstable; urgency=low
+
+ * Added innotop.
+ * Changed maintainer email address to
+ pkg-mysql-commits@lists.alioth.debian.org
+
+ -- Christian Hammers <ch@debian.org> Thu, 19 Apr 2007 19:21:15 +0200
+
+mysql-dfsg-5.0 (5.0.38-2) unstable; urgency=high
+
+ * SECURITY:
+ In some previous versions mysql_install_db was not idempotent and
+ always created passwordless root accounts although it should do so
+ only on initial installs (thanks to Olaf van der Spek). Closes: #418672
+ * Added check for passwordless root accounts to debian-start.
+ * As MySQL-5.0 is, at least currently, incompatible with Kernel 2.4 the
+ installation is aborted for such old kernels. Debian Etch does not support
+ them anyway according to the release notes but this might be unexpected
+ and many production servers still have self-built ones installed (thanks
+ to Marc-Christian Petersen). See: #416841
+ * Adjusted TeX build-deps to texlive.
+
+ -- Christian Hammers <ch@debian.org> Tue, 17 Apr 2007 01:00:41 +0200
+
+mysql-dfsg-5.0 (5.0.38-1) unstable; urgency=low
+
+ * New upstream release.
+ * Activated the blackhole engine as it's needed for replicating partition
+ designs (thanks to Cyril SCETBON).
+ * Fixed segfault on i486 systems without cpuid instruction (thanks to
+ Lennart Sorensen). Closes: #410474
+ * Only use the non-essential debconf package in postrm if it is still
+ installed (thanks to Michael Ablassmeier). Closes: #416838
+
+ -- Christian Hammers <ch@debian.org> Thu, 5 Apr 2007 22:43:41 +0200
+
+mysql-dfsg-5.0 (5.0.36-1) unstable; urgency=low
+
+ * New upstream release.
+ Closes: #400460, #408159, #408533
+
+ -- Christian Hammers <ch@debian.org> Thu, 22 Mar 2007 22:16:31 +0100
+
+mysql-dfsg-5.0 (5.0.32-10) unstable; urgency=high
+
+ * Really fixed FTBFS on Sparc introduced with the "make -j" trick in
+ 5.0.32-8 (thanks to Frank Lichtenheld). Closes: #415026
+
+ -- Christian Hammers <ch@debian.org> Sun, 18 Mar 2007 20:52:33 +0100
+
+mysql-dfsg-5.0 (5.0.32-9) unstable; urgency=high
+
+ * Fixed FTBFS on Sparc introduced with the "make -j" trick in 5.0.32-8
+ (thanks to Frank Lichtenheld). Closes: #415026
+
+ -- Christian Hammers <ch@debian.org> Tue, 15 Mar 2007 18:55:42 +0100
+
+mysql-dfsg-5.0 (5.0.32-8) unstable; urgency=high
+
+ [Sean Finney]
+ * SECURITY:
+ - CVE-2007-1420: Single Row Subselect DoS. Specially crafted subselect
+ queries could crash the mysql server. Patch backported from upstream
+ changeset 19685 (46_CVE-2007-1420_subselect_dos.dpatch)
+ closes: #414790.
+ [Christian Hammers]
+ * Adapt MAKE_J to use the -j option with the number of available processors.
+ (thanks to Raphael Pinson).
+ * Updated mysqlreport to latest upstream (and patched --help usage message
+ and "return if qcache_size==0").
+
+ -- sean finney <seanius@debian.org> Wed, 14 Mar 2007 20:19:08 +0100
+
+mysql-dfsg-5.0 (5.0.32-7) unstable; urgency=low
+
+ * Updated French Debconf translation (thanks to Christian Perrier).
+ Closes: #411330
+ * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+ Closes: #411328
+ * Updated Portuguese Debconf translation (thanks to "Traduz").
+ Closes: #411339
+ * Updated Czech Debconf translation (thanks to Miroslav Kure).
+ Closes: #411341
+ * Added Norwegian Debconf translation (thanks to Bjorn Steensrud).
+ Closes: #411345
+ * Updated Spanish Debconf translation (thanks to Javier Fernandez-Sanguino
+ Pena). Closes: #411347
+ * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+ Closes: #411368
+ * Updated Swedish Debconf translation (thanks to Andreas Henriksson).
+ Closes: #411370
+ * Updated Italian Debconf translation (thanks to Luca Monducci).
+ Closes: #411377
+ * Updated Galician Debconf translation (thanks to Jacobo Tarrio).
+ Closes: #411379
+ * Updated Russian Debconf translation (thanks to Yuriy Talakan).
+ Closes: #411442
+ * Updated Basque Debconf translation (thanks to Piarres Beobide).
+ Closes: #411457
+ * Updated German Debconf translation (thanks to Alwin Meschede).
+ Closes: #411480
+ * Updated Dutch Debconf translation (thanks to Thijs Kinkhorst).
+ * Updated Brazilian Portuguese translation (thanks to Andre Luis Lopes).
+ Closes: #411536
+ * Updated Romanian Debconf translation (thanks to Stan Ioan-Eugen).
+ Closes: #411764
+
+ -- Christian Hammers <ch@debian.org> Fri, 16 Feb 2007 23:20:42 +0100
+
+mysql-dfsg-5.0 (5.0.32-6) unstable; urgency=low
+
+ * Changed wording in Debconf templates to better fit to the graphical
+ interface (thanks to Frank Kuester). Closes: #411165
+ * Lintian suggested style changes to some other Debconf questions.
+ * Removed accidental stdout output from init script.
+
+ -- Christian Hammers <ch@debian.org> Fri, 16 Feb 2007 20:29:18 +0100
+
+mysql-dfsg-5.0 (5.0.32-5) unstable; urgency=medium
+
+ * Backported upstream patch for a bug that crashed the server when using
+ certain join/group/limit combinations.
+ Users of the Joomla CMS seemed to be affected by this. Closes: #403721
+ * The debian-start script that runs on every server start now first upgrades
+ the system tables (if necessary) and then checks them, as the other way
+ around sometimes did not work (e.g. for MediaWiki). The script now uses
+ mysql_update instead of mysql_update_script as recommended. Closes: #409780
+ * Remove the Debconf generated config file in postrm.
+
+ -- Christian Hammers <ch@debian.org> Thu, 15 Feb 2007 04:47:04 +0100
+
+mysql-dfsg-5.0 (5.0.32-4) unstable; urgency=high
+
+ [Christian Hammers]
+ * Changed minimum required version in dh_makeshlibs to 5.0.27-1 as
+ 5.0.26 had an ABI breakage in it!
+ This is the cause of Perl programs crashing with the following error:
+ "Transactions not supported by database at /usr/lib/perl5/DBI.pm line 672"
+ * The old_passwords setting that is set according to a Debconf question is
+ now written to /etc/mysql/conf.d/old_passwords.cnf instead of directly to
+ the conffile /etc/mysql/my.cnf, which would be forbidden by policy (thanks
+ to Robert Bihlmeyer). Closes: #409750
+ * Added some more comments to the default my.cnf.
+ [Monty Taylor]
+ * Added bison to build dependencies.
+ * Added a "start-initial" option to the Data Node init script to support
+ initial node starts.
+ * Changed NDB Data and Management node startup sequence. Prevented both from
+ restarting on upgrade to address rolling upgrade issues.
+ * Updated build-depends to depend on automake1.9 instead of automake1.8
+ to match what upstream uses.
+
+ -- Christian Hammers <ch@debian.org> Wed, 31 Jan 2007 01:14:09 +0100
+
+mysql-dfsg-5.0 (5.0.32-3) unstable; urgency=high
+
+ * mysql-server-5.0 pre-depends on adduser now and has --disabled-login
+ explicitly added to be on the safe side (thanks to the puiparts team).
+ Closes: #408362
+ * Corrected the terminology regarding NDB in the comments of all config
+ files and init scripts (thanks to Geert Vanderkelen of MySQL).
+ * Updated Swedish Debconf translation (thanks to Andreas Henriksson).
+ Closes: #407859
+ * Updated Czech Debconf translation (thanks to Miroslav Kure).
+ Closes: #407809
+
+ -- Christian Hammers <ch@debian.org> Thu, 11 Jan 2007 11:18:47 +0100
+
+mysql-dfsg-5.0 (5.0.32-2) unstable; urgency=high
+
+ * The last upload suffered from a regression that made NDB totally
+ unusable and caused a dependency on libmysqlclient15-dev in the
+ mysql-server-5.0 package. The relevant 85_* patch was re-added again.
+ Closes: #406435
+ * Added lintian-overrides for an error that does not affect our packages.
+ There are now only warnings and not errors left.
+
+ -- Christian Hammers <ch@debian.org> Tue, 9 Jan 2007 23:55:10 +0100
+
+mysql-dfsg-5.0 (5.0.32-1) unstable; urgency=high
+
+ * New upstream version.
+ * SECURITY: mysql_fix_privilege_tables.sql altered the
+ table_privs.table_priv column to contain too few privileges, causing
+ loss of the CREATE VIEW and SHOW VIEW privileges. (MySQL Bug#20589)
+ * SECURITY (DoS): ALTER TABLE statements that performed both RENAME TO
+ and {ENABLE|DISABLE} KEYS operations caused a server crash. (MySQL
+ Bug#24089)
+ * SECURITY (DoS): LAST_DAY('0000-00-00') could cause a server crash.
+ (MySQL Bug#23653)
+ * SECURITY (DoS): Using EXPLAIN caused a server crash for queries that
+ selected from INFORMATION_SCHEMA in a subquery in the FROM clause.
+ (MySQL Bug#22413)
+ * SECURITY (DoS): Invalidating the query cache (e.g. when using stored procedures)
+ caused a server crash for INSERT INTO ... SELECT statements that
+ selected from a view. (MySQL Bug#20045)
+ * Using mysql_upgrade with a password crashed the server. Closes: #406229
+ * yaSSL crashed on pre-Pentium Intel and Cyrix CPUs. (MySQL Bug#21765)
+ Closes: #383759
+ * Lots of small fixes to the NDB cluster storage engine.
+ * Updated Japanese Debconf template (thanks to Hideki Yamane).
+ Closes: #405793
+ * Fixed comment regarding "mycheck" in debian-start (thanks to
+ Enrico Zini). Closes: #405787
+
+ -- Christian Hammers <ch@debian.org> Sat, 6 Jan 2007 14:26:20 +0100
+
+mysql-dfsg-5.0 (5.0.30-3) unstable; urgency=low
+
+ * Updated Brazilian Debconf translation (thanks to Andre Luis Lopes).
+ Closes: #403821
+ * Added Romanian Debconf translation (thanks to Stan Ioan-Eugen).
+ Closes: #403943
+ * Updated Spanish Debconf translation (thanks to Javier Fernandez-Sanguino
+ Pena). Closes: #404084
+ * Updated Galician Debconf translation (thanks to Jacobo Tarrio).
+ Closes: #404318
+ * Updated Dutch Debconf translation (thanks to Vincent Zweije).
+ Closes: #404566
+ * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+ Closes: #405018
+
+ -- Christian Hammers <ch@debian.org> Thu, 21 Dec 2006 21:35:09 +0100
+
+mysql-dfsg-5.0 (5.0.30-2) unstable; urgency=high
+
+ * Fixed upstream regression in header files that led to FTBFS for
+ mysql-admin, mysql-query-browser and probably other packages
+ (thanks to Andreas Henriksson). Closes: #403081, #403082
+ * Fixed some upstream scripts by replacing /etc by /etc/mysql (thanks to
+ Julien Antony). Closes: #401083
+ * Updated French Debconf translation (thanks to Christian Perrier).
+ Closes: #401434
+ * Added Spanish Debconf translation (thanks to Javier Fernandez-Sanguino
+ Pena). Closes: #401953
+ * Marked a Debconf question that is just a dummy and only internally
+ used as not-needing-translation. Closes: #403163
+ * Fixed mysqldumpslow patch to not remove the usage() function (thanks
+ to Monty Taylor).
+
+ -- Christian Hammers <ch@debian.org> Sun, 3 Dec 2006 19:20:10 +0100
+
+mysql-dfsg-5.0 (5.0.30-1) unstable; urgency=low
+
+ * New upstream version (switch to the MySQL Enterprise branch).
+ * Upstream bugfix for the Innodb performance bug:
+ "Very poor performance with multiple queries running
+ concurrently (Bug#15815)".
+ * Upstream bugfix for a possible server crash:
+ "Selecting from a MERGE table could result in a server crash if the
+ underlying tables had fewer indexes than the MERGE table itself
+ (Bug#22937)"
+ * Upstream bugfixes for a *lot* of NDB problems.
+ * Upstream bugfix for Innodb optimizer bug. Closes: #397597
+ * Updated Italian Debconf translation (thanks to Luca Monducci).
+ Closes: #401305
+ * Updated debian/watch file to MySQL Enterprise branch.
+
+ -- Christian Hammers <ch@debian.org> Sat, 2 Dec 2006 16:36:38 +0100
+
+mysql-dfsg-5.0 (5.0.27-2) unstable; urgency=medium
+
+ * Disabled YaSSL x86 assembler as it was reported to crash applications
+ like pam-mysql or proftpd-mysql which are linked against libmysqlclient
+ on i486 and Cyrix (i586) CPUs. Closes: #385147
+ * Adjusted mysql-server-4.1 priority to extra and section to oldlibs
+ according to the ftp masters overrides.
+ * Updated German Debconf translation (thanks to Alwin Meschede).
+ Closes: #400809
+
+ -- Christian Hammers <ch@debian.org> Wed, 22 Nov 2006 13:36:31 +0100
+
+mysql-dfsg-5.0 (5.0.27-1) unstable; urgency=medium
+
+ * New upstream version (but no code change; the only difference from 5.0.26
+ was a patch for the ABI change which Debian already included).
+ * When dist-upgrading from mysql-server-4.1/sarge, dpkg no longer
+ ask unnecessary "config file has changed" questions regarding
+ /etc/init.d/mysql, /etc/logrotate.d/mysql-server and
+ /etc/mysql/debian-start just because these files previously belonged
+ to mysql-server-4.1 and not to mysql-server-5.0.
+ To achieve this, mysql-server-5.0 now pre-depends on mysql-common which
+ provides current versions of those files.
+ * The automatic mysql_upgrade run now works with non-standard datadir
+ settings, too (thanks to Benjami Villoslada). Closes: #394607
+ * Debconf now asks if the old_passwords option is really needed.
+ * Improved explanations of the old_passwords variable in my.cnf.
+ * Removed possibly leftover cron script from MySQL-4.1 (thanks to
+ Mario Oyorzabal Salgado). Closes: #390889
+ * Postrm ignores failed "userdel mysql".
+ * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+ Closes: #398784
+ * Added Basque Debconf translation (thanks to Piarres Beobide).
+ Closes: #399045
+ * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+ Closes: #399074
+ * Updated German Debconf translation (thanks to Alwin Meschede).
+ Closes: #399087
+ * New Portuguese debconf translations from Miguel Figueiredo.
+ Closes: #398186
+
+ -- Christian Hammers <ch@debian.org> Tue, 7 Nov 2006 21:26:25 +0100
+
+mysql-dfsg-5.0 (5.0.26-3) unstable; urgency=high
+
+ [sean finney]
+ * Fix for the deadly ISAM trap. Now during upgrades we will do our
+ very best to convert pre-existing ISAM format tables using the
+ binaries from the previous package. Success is not guaranteed, but
+ this is probably as good as it gets. Note that this also necessitates
+ re-introducing an (empty transitional) mysql-server-4.1 package.
+ Closes: #354544, #354850
+ * Remove a couple of spurious and wrongly placed WARNING statements from
+ 45_warn-CLI-passwords.dpatch. Thanks to Dan Jacobsen for pointing these
+ out. Closes: #394262
+
+ -- sean finney <seanius@debian.org> Fri, 03 Nov 2006 18:34:46 +0100
+
+mysql-dfsg-5.0 (5.0.26-2) unstable; urgency=high
+
+ * Fixed FTBFS for Alpha by applying an upstream patch (thanks to Falk
+ Hueffner). Closes: #395921
+
+ -- Christian Hammers <ch@debian.org> Sat, 28 Oct 2006 20:13:46 +0200
+
+mysql-dfsg-5.0 (5.0.26-1) unstable; urgency=high
+
+ * SECURITY:
+ This combined release of 5.0.25 and 5.0.26 fixes a lot of possible server
+ crashes so it should get into Etch. Quoting the changelog (bug numbers are
+ bugs.mysql.com ones):
+ - character_set_results can be NULL to signify no conversion, but some
+ code did not check for NULL, resulting in a server crash. (Bug#21913)
+ - Using cursors with READ COMMITTED isolation level could cause InnoDB to
+ crash. (Bug#19834)
+ - Some prepared statements caused a server crash when executed a second
+ time. (Bug#21166)
+ - When DROP DATABASE or SHOW OPEN TABLES was issued while concurrently
+ issuing DROP TABLE (or RENAME TABLE, CREATE TABLE LIKE or any other
+ statement that required a name lock) in another connection, the server
+ crashed. (Bug#21216)
+ - Use of zero-length variable names caused a server crash. (Bug#20908)
+ - For InnoDB tables, the server could crash when executing NOT IN ()
+ subqueries. (Bug#21077)
+ - Repeated DROP TABLE statements in a stored procedure could sometimes
+ cause the server to crash. (Bug#19399)
+ - Performing an INSERT on a view that was defined using a SELECT that
+ specified a collation and a column alias caused the server to crash
+ (Bug#21086).
+ - A query of the form shown here caused the server to crash. (Bug#21007)
+ - NDB Cluster: Some queries involving joins on very large NDB tables could
+ crash the MySQL server. (Bug#21059)
+ - The character set was not being properly initialized for CAST() with a
+ type like CHAR(2) BINARY, which resulted in incorrect results or even a
+ server crash. (Bug#17903)
+ - For certain queries, the server incorrectly resolved a reference to an
+ aggregate function and crashed. (Bug#20868)
+ - The server crashed when using the range access method to execute a
+ subquery with an ORDER BY DESC clause. (Bug#20869)
+ - Triggers on tables in the mysql database caused a server crash. Triggers
+ for tables in this database now are disallowed. (Bug#18361)
+ - Using SELECT on a corrupt MyISAM table using the dynamic record format
+ could cause a server crash. (Bug#19835)
+ - Use of MIN() or MAX() with GROUP BY on a ucs2 column could cause a
+ server crash. (Bug#20076)
+ - Selecting from a MERGE table could result in a server crash if the
+ underlying tables had fewer indexes than the MERGE table itself.
+ (Bug#21617, Bug#22937)
+
+ * New upstream release.
+ - This bug would cause trouble for Sarge->Etch upgrades; it was supposed to
+ have been fixed in 5.0.16 but that apparently did not fix the whole
+ problem:
+ Using tables from MySQL 4.x in MySQL 5.x, in particular those with VARCHAR
+ fields and using INSERT DELAYED to update data in the table would result in
+ either data corruption or a server crash. (Bug#16611, Bug#16218, Bug#17294)
+ Closes: #386337
+ - Fixes data corruption as an automatic client reconnect used to set
+ the wrong character set. Closes: #365050
+ - Fixes an undefined ulong type in an include file. Closes: #389102
+ - Fixes wrong output format when using Unicode characters. Closes: #355302
+ - Fixes mysql_upgrade when using a password. Closes: #371841
+
+ [Christian Hammers]
+ * Removed --sysconfdir from debian/rules as it puts /etc/mysql/ at the
+ end of the my.cnf search path, thus overriding $HOME/my.cnf
+ (thanks to Christoph Biedl). Closes: #394992
+ * The provided patch from bug #385947 was wrong; the variable is called
+ BLOCKSIZE not BLOCK_SIZE according to "strings `which df`" (thanks to
+ Bruno Muller). Closes: #385947
+
+ [sean finney]
+ * new dutch debconf translations from Vincent Zweije (closes: #392809).
+ * new japanese debconf translations from Hideki Yamane (closes: #391625).
+ * new italian debconf translations from Luca Monducci (closes: #391741).
+ * new french debconf translations from Christian Perrier (closes: #393334).
+ * ran debconf-updatepo to merge the fuzzies into svn.
+ * massage the following patches so they continue to apply cleanly:
+ - 44_scripts__mysql_config__libs.dpatch to cleanly apply.
+ - 45_warn-CLI-passwords.dpatch
+ - 96_TEMP__libmysqlclient_ssl_symbols.dpatch (note, this patch might
+ no longer be needed, but is retained "just in case" after massaging it)
+ * the following patches have been incorporated upstream:
+ - 70_kfreebsd.dpatch
+ - 80_hurd_mach.dpatch
+ - 87_ps_Hurd.dpatch
+ - 90_TEMP__client__mysql_upgrade__O_EXEC.dpatch
+ - 91_TEMP__client__mysql_upgrade__password.dpatch
+ - 92_TEMP__client__mysql_upgrade__defaultgroups.dpatch
+ - 94_TEMP__CVE-2006-4227.dpatch
+ - 95_TEMP__CVE-2006-4226.dpatch
+ * the udf_example.cc has disappeared from the source code, but there's
+ a udf_example.c which seems to be a good example to use instead :)
+ * update documentation in the configuration to no longer reference
+ using my.cnf in the DATADIR, as it's never been the recommended
+ method for debian systems and hasn't worked since 5.0 was released
+ anyway (closes: #393868).
+
+ -- Christian Hammers <ch@debian.org> Wed, 25 Oct 2006 19:54:04 +0200
+
+mysql-dfsg-5.0 (5.0.24a-9) unstable; urgency=medium
+
+ * Having expire_logs_days enabled but log-bin not crashes the server. Using
+ both or none of those options is safe. To prevent this from happening during
+ the nightly log rotation via /etc/logrotate.d/mysql, the initscript checks
+ for this dangerous combination of options. See: #368547
+ * The Sarge package "mysql-server" which used to include the mysqld daemon
+ may still be in unselected-configured state (i.e. after a remove but not
+ purge) in which case its now obsolete cronscript has to be moved away
+ (thanks to Charles Lepple). Closes: #385669
+ * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+ Closes: #390315
+ * Updated French Debconf translation (thanks to Christian Perrier).
+ Closes: #390980
+
+ -- Christian Hammers <ch@debian.org> Tue, 3 Oct 2006 14:55:31 +0200
+
+mysql-dfsg-5.0 (5.0.24a-8) unstable; urgency=low
+
+ * (broken upload)
+
+ -- Christian Hammers <ch@debian.org> Tue, 3 Oct 2006 14:55:31 +0200
+
+mysql-dfsg-5.0 (5.0.24a-7) unstable; urgency=low
+
+ * Stopped mysql_config from announcing unnecessary library dependencies
+ which until now caused "NEEDED" dependencies in the "readelf -d" output
+ of libraries that only depend on libmysqlclient.so (thanks to Michal
+ Cihar). Closes: #390692
+
+ -- Christian Hammers <ch@debian.org> Sun, 1 Oct 2006 23:59:43 +0200
+
+mysql-dfsg-5.0 (5.0.24a-6) unstable; urgency=low
+
+ [sean finney]
+ * finally add support for setting a root password at install.
+ while this is not a random password as requested in one bug
+ report, we believe it is the best solution and provides a
+ means to set a random password via preseeding if it's really
+ desired (Closes: #316127, #298295).
+
+ -- sean finney <seanius@debian.org> Sun, 01 Oct 2006 23:34:30 +0200
+
+mysql-dfsg-5.0 (5.0.24a-5) unstable; urgency=low
+
+ * Added ${shlibs:Depends} to debian/control section libmysqlclient-dev as it
+ contains the experimental /usr/lib/mysql/libndbclient.so.0.0.0.
+ * Bumped standards version to 3.7.2.
+ * Added LSB info section to init scripts.
+ * Rephrased Debconf templates as suggested by lintian.
+ * Added benchmark suite in /usr/share/mysql/sql-bench/.
+ * The mysql.timezone* tables are now filled by the postinst script (thanks
+ to Mark Sheppard). Closes: #388491
+ * Moved Debconf install notes to README.Debian. Displaying them with
+ medium priority was a bug anyway. Closes: #388941
+ * Replaced /usr/bin/mysql_upgrade by /usr/bin/mysql_upgrade_shell in
+ /etc/mysql/debian-start.sh as it works without errors (thanks to Javier
+ Kohen). Closes: #389443
+
+ -- Christian Hammers <ch@debian.org> Wed, 20 Sep 2006 15:01:42 +0200
+
+mysql-dfsg-5.0 (5.0.24a-4) unstable; urgency=high
+
+ * libmysqlclient.so.15 from 5.0.24 accidentally exports some symbols that are
+ historically exported by OpenSSL's libcrypto.so. This bug was supposed to
+ be fixed in 5.0.24a but according to the mysql bug tracking system will
+ only be fixed in 5.0.25, so I backported the patch. People already reported
+ crashing apps due to this (thanks to Duncan Simpson). See also: #385348
+ Closes: #388262
+ * Fixed BLOCKSIZE to BLOCK_SIZE in initscript (thanks to Bruno Muller).
+ Closes: #385947
+ * Added hint to "--extended-insert=0" to mysqldump manpage (thanks to Martin
+ Schulze).
+ * Documented the meaning of "NDB" in README.Debian (thanks to Dan Jacobson).
+ Closes: #386274
+ * Added patch to build on hurd-i386 (thanks to Cyril Brulebois). Closes: #387369
+ * Fixed debian-start script to work together with the recent LSB modifications in
+ the initscript (thanks to wens). Closes: #387481
+ * Reverted tmpdir change in my.cnf back to /tmp to comply with FHS (thanks
+ to Alessandro Valente). Closes: #382778
+ * Added logcheck filter rule (thanks to Paul Wise). Closes: #381043
+ * I will definitely not disable InnoDB but added a note to the default my.cnf
+ that disabling it saves about 100MB virtual memory (thanks to Olivier
+ Berger). Closes: #384399
+ * Added thread_cache_size=8 to default my.cnf as this variable seems to have
+ a negligible memory footprint but can improve performance when lots of
+ threads connect simultaneously, as is often seen on web servers.
+
+ -- Christian Hammers <ch@debian.org> Mon, 4 Sep 2006 00:21:50 +0200
+
+mysql-dfsg-5.0 (5.0.24a-3) unstable; urgency=low
+
+ * Fixed potential tempfile problem in the newly added mysqlreport script.
+
+ -- Christian Hammers <ch@debian.org> Sun, 3 Sep 2006 23:17:24 +0200
+
+mysql-dfsg-5.0 (5.0.24a-2) unstable; urgency=low
+
+ * Added "mysqlreport" (GPL'ed) from hackmysql.com.
+ * Temporarily disabled the expire_logs_days option as it causes the server
+ to crash. See #368547
+ * Made output of init scripts LSB compliant (thanks to David Haerdeman).
+ Closes: #385874
+
+ -- Christian Hammers <ch@debian.org> Sun, 3 Sep 2006 19:06:53 +0200
+
+mysql-dfsg-5.0 (5.0.24a-1) unstable; urgency=high
+
+ * New upstream version.
+ * The shared library in the 5.0.24 upstream release accidentally exported
+ some symbols that are also exported by the OpenSSL libraries (notably
+ BN_bin2bn) causing unexpected behaviour in applications using these
+ functions (thanks to Peter Cernak). Closes: #385348
+ * Added note about possible crash on certain i486 clone CPUs.
+ * Made recipient address of startup mysqlcheck output configurable
+ (thanks to Mattias Guns). Closes: #385119
+
+ -- Christian Hammers <ch@debian.org> Mon, 28 Aug 2006 01:22:12 +0200
+
+mysql-dfsg-5.0 (5.0.24-3) unstable; urgency=high
+
+ * SECURITY:
+ CVE-2006-4226:
+ When run on case-sensitive filesystems, MySQL allows remote
+ authenticated users to create or access a database when the database
+ name differs only in case from a database for which they have
+ permissions.
+ CVE-2006-4227:
+ MySQL evaluates arguments of suid routines in the security context of
+ the routine's definer instead of the routine's caller, which allows
+ remote authenticated users to gain privileges through a routine that
+ has been made available using GRANT EXECUTE.
+ Thanks to Stefan Fritsch for reporting. Closes: #384798
+
+ -- Christian Hammers <ch@debian.org> Sat, 26 Aug 2006 04:55:17 +0200
+
+mysql-dfsg-5.0 (5.0.24-2) unstable; urgency=high
+
+ * 5.0.24-1 introduced an ABI incompatibility, which this patch reverts.
+ Programs compiled against 5.0.24-1 are not compatible with any other
+ version and need a rebuild.
+ This bug already caused a lot of segfaults and crashes in various
+ programs. Thanks to Chad MILLER from MySQL for quickly providing a patch.
+ The shlibdeps version has been increased to 5.0.24-2.
+ Closes: #384047, #384221, #383700
+
+ -- Christian Hammers <ch@debian.org> Fri, 25 Aug 2006 21:47:35 +0200
+
+mysql-dfsg-5.0 (5.0.24-1) unstable; urgency=high
+
+ * SECURITY: Upstream fixes a security bug which allows a user to continue
+ accessing a table using a MERGE TABLE after the right to direct access to
+ the database has been revoked (CVE-2006-4031, MySQL bug #15195).
+ (Well, they did not exactly fix it; they documented the behaviour and
+ allow the admin to disable merge tables altogether...). Closes: #380271
+ * SECURITY: Applied patch that fixes possibly insecure file handling
+ in the recently added mysql_upgrade binary file (MySQL bug #10320).
+ * New upstream version.
+ - Fixes nasty MySQL bug #19618 that leads to crashes when using
+ "SELECT ... WHERE ... not in (1, -1)" (e.g. vbulletin was affected).
+ - Fixes upstream bug #16803 so that linking ~/.mysql_history to /dev/null
+ now has the desired effect of having no history.
+ * Really fixed the runlevels. Closes: #377651
+ * Added patch for broken upstream handling of "host=" to mysql_upgrade.c.
+ * Adjusted /etc/mysql/debian-start to new mysql_upgrade.c
+
+ -- Christian Hammers <ch@debian.org> Tue, 8 Aug 2006 00:44:13 +0200
+
+mysql-dfsg-5.0 (5.0.22-5) unstable; urgency=low
+
+ * Added further line to the logcheck ignore files (thanks to Paul Wise).
+ Closes: #381038
+
+ -- Christian Hammers <ch@debian.org> Wed, 2 Aug 2006 00:28:50 +0200
+
+mysql-dfsg-5.0 (5.0.22-4) unstable; urgency=low
+
+ * Upstream fixes a bug in the (never released) version 5.0.23 which could
+ maybe be used to crash the server if the mysqlmanager daemon is in use
+ which is not yet the default in Debian. (CVE-2006-3486 *DISPUTED*)
+ * Changed runlevel priority of mysqld from 20 to 19 so that it gets started
+ before apache and proftpd etc. which might depend on an already running
+ database server (thanks to Martin Gruner). Closes: #377651
+ * Added patch which sets PATH_MAX in ndb (thanks to Cyril Brulebois).
+ Closes: #378949
+ * Activated YaSSL as licence issues are settled according to:
+ http://bugs.mysql.com/?id=16755. This also closes the FTBFS bug
+ regarding OpenSSL as its use is now discouraged. Closes: #368639
+ * Removed SSL-MINI-HOWTO as the official documentation is good enough now.
+ * mysql_upgrade no longer gives --password on the commandline which would
+ be insecure (thanks to Dean Gaudet). Closes: #379199
+ * Adjusted debian/patches/45* to make consecutive builds in the same source
+ tree possible (thanks to Bob Tanner). Closes: #368661
+ * mysql-server-5.0 is now suggesting tinyca as yaSSL is enabled and tinyca
+ was found to be really cool :)
+ * Moved tempdir from /tmp to /var/tmp as it will more likely have enough
+ free space, since /tmp is often on the root partition and /var or at least
+ /var/tmp is on a bigger one.
+
+ -- Christian Hammers <ch@debian.org> Mon, 10 Jul 2006 23:30:26 +0200
+
+mysql-dfsg-5.0 (5.0.22-3) unstable; urgency=low
+
+ * Added patch for MySQL bug #19618: "select x from x
+ where x not in(1,-1)" may crash the server" (thanks to
+ Ruben Puettmann).
+
+ -- Christian Hammers <ch@debian.org> Fri, 9 Jun 2006 01:41:44 +0200
+
+mysql-dfsg-5.0 (5.0.22-2) unstable; urgency=high
+
+ * Fixed debian-sys-maint related bug in postinst (thanks to
+ Jean-Christophe Dubacq). Closes: #369970
+ * The last upload was a security patch (which I did not know as I
+ uploaded before the announcement came). I now added the CVE id for
+ reference and set urgency to high as the last entry did not.
+
+ -- Christian Hammers <ch@debian.org> Wed, 31 May 2006 01:04:11 +0200
+
+mysql-dfsg-5.0 (5.0.22-1) unstable; urgency=low
+
+ * SECURITY: This upstream release fixes an SQL-injection with multibyte
+ encoding problem. (CVE-2006-2753)
+ * New upstream release.
+ * Upstream fixes REPAIR TABLE problem. Closes: #354300
+ * Upstream fixes problem that empty strings in varchar and text columns
+ are displayed as NULL. Closes: #368663
+
+ -- Christian Hammers <ch@debian.org> Tue, 30 May 2006 23:43:24 +0200
+
+mysql-dfsg-5.0 (5.0.21-4) unstable; urgency=low
+
+ * Added "BLOCKSIZE=" to the diskfree check (thanks to Farzad FARID).
+ Closes: #367027, #367083
+ * Further fixed mysql_upgrade upstream script (thanks to Andreas Pakulat)
+ Closes: #366155
+ * Adjusted the /proc test in debian/rules from /proc/1 to /proc/self
+ to make building on grsec systems possible (thanks to K. Rosenegger).
+ Closes: #366824
+ * Updated Russian Debconf translation (thanks to Yuriy Talakan).
+ Closes: #367141
+ * Updated Czech Debconf translation (thanks to Miroslav Kure).
+ Closes: #367160
+ * Updated Galician Debconf translation (thanks to Jacobo Tarrio).
+ Closes: #367384
+ * Updated Swedish Debconf translation (thanks to Daniel Nylander).
+ Closes: #368186
+
+ -- Christian Hammers <ch@debian.org> Wed, 10 May 2006 08:45:42 +0200
+
+mysql-dfsg-5.0 (5.0.21-3) unstable; urgency=low
+
+ * Fixed FTBFS problem which was caused by a patch that modifies Makefile.am
+ as well as Makefile.in and was not detected because my desktop was fast
+ enough to patch both files within the same second and so fooled automake.
+ (thanks to Blars Blarson for notifying me). Closes: #366534
+
+ -- Christian Hammers <ch@debian.org> Sat, 6 May 2006 19:03:58 +0200
+
+mysql-dfsg-5.0 (5.0.21-2) unstable; urgency=low
+
+ * Fixed bug in postinst that did not correctly rewrite
+ /etc/mysql/debian.cnf (thanks to Daniel Leidert).
+ Closes: #365433, #366155
+
+ -- Christian Hammers <ch@debian.org> Thu, 4 May 2006 02:37:03 +0200
+
+mysql-dfsg-5.0 (5.0.21-1) unstable; urgency=high
+
+ * SECURITY: New upstream release with some security relevant bugfixes:
+ * "Buffer over-read in check_connection with usernames lacking a
+ trailing null byte" (CVE-2006-1516)
+ * "Anonymous Login Handshake - Information Leakage" (CVE-2006-1517)
+ * "COM_TABLE_DUMP Information Leakage and Arbitrary command execution"
+ (CVE-2006-1518)
+ Closes: #365938, #365939
+ * Added diskfree check to the init script (thanks to Tim Baverstock).
+ Closes: #365460
+ * First amd64 upload!
+
+ -- Christian Hammers <ch@debian.org> Sat, 29 Apr 2006 04:31:27 +0200
+
+mysql-dfsg-5.0 (5.0.20a-2) unstable; urgency=low
+
+ * The new mysql_upgrade run which is started from /etc/mysql/debian-start
+ now uses the debian-sys-maint user for authentication (thanks to
+ Philipp). Closes: #364991
+ * Wrote patch debian/patches/43* which adds a password option to
+ mysql_update. See MySQL bug #19400.
+ * Added "Provides: libmysqlclient-dev" to libmysqlclient15-dev as I saw no
+ obvious reasons against it (problems should be documented in
+ debian/README.Maintainer!) (thanks to Olaf van der Spek). Closes: #364899
+ * Updated Netherlands debconf translation (thanks to Vincent Zweije)
+ Closes: #364464
+ * Updated French debconf translation (thanks to Christian Perrier)
+ Closes: #364401
+ * Updated Danish debconf translation (thanks to Claus Hindsgaul)
+ Closes: #365135
+
+ -- Christian Hammers <ch@debian.org> Wed, 26 Apr 2006 01:14:53 +0200
+
+mysql-dfsg-5.0 (5.0.20a-1) unstable; urgency=low
+
+ * New upstream release.
+ * Added the new mysql_upgrade script and added it to
+ /etc/mysql/debian-start (thanks to Alessandro Polverini).
+ The script is currently very noisy; that is a known bug and will be
+ fixed in the next release!
+ Closes: #363458
+ * No longer creates the "test" database. This actuallay had been tried
+ to archive before (at least patches) exists but apparently was not the
+ case in the last versions (thanks to Olaf van der Spek). Closes: #362126
+ * Reformatted libmysqlclient15off.NEWS.Debian to changelog format
+ (thanks to Peter Palfrader). Closes: #363062
+
+ -- Christian Hammers <ch@debian.org> Sat, 15 Apr 2006 13:05:22 +0200
+
+mysql-dfsg-5.0 (5.0.20-1) unstable; urgency=high
+
+ * Upstream contains a fix for a nasty bug (MySQL#18153) that users
+ already experienced and that caused corrupted triggers after
+ REPAIR/OPTIMIZE/ALTER TABLE statements.
+ (thanks to Jerome Despatis for pointing out)
+ * Added patch for the "updates on multiple tables is buggy after
+ upgrading from 4.1 to 5.0" problem which MySQL has committed
+ for the upcoming 5.0.21 release. Closes: #352704
+ * Added Netherlands debconf translation (thanks to Vincent Zweije).
+ Closes: #360443
+ * Added Galician debconf translation (thanks to Jacobo Tarrio).
+ Closes: #361257
+
+ -- Christian Hammers <ch@debian.org> Fri, 7 Apr 2006 00:00:43 +0200
+
+mysql-dfsg-5.0 (5.0.19-3) unstable; urgency=high
+
+ [ Christian Hammers ]
+ * Fixed libmysqlclient15.README.Debian regarding package name changes
+ (thanks to Leppo).
+ * Moved libheap.a etc. back to /usr/lib/mysql/ as their names are just
+ too generic. Closes: #353924
+ [ Sean Finney ]
+ * updated danish debconf translation, thanks to Claus Hindsgaul
+ (closes: #357424).
+ [ Adam Conrad ]
+ * Send stderr from 'find' in preinst to /dev/null to tidy up chatter.
+ * Backport patch for CVE-2006-0903 from the upcoming release to resolve
+ a log bypass vulnerability when using non-binary logs (closes: #359701)
+
+ -- Adam Conrad <adconrad@0c3.net> Tue, 4 Apr 2006 15:23:18 +1000
+
+mysql-dfsg-5.0 (5.0.19-2) unstable; urgency=medium
+
+ * New upstream release.
+ * Renamed package libmysqlclient15 to libmysqlclient15off due to
+ binary incompatible changes.
+ See /usr/share/doc/libmysqlclient15off/README.Debian
+ * Updated Czech debconf translation (thanks to Miroslav Kure).
+ Closes: #356503
+ * Updated French debconf translation (thanks to Christian Perrier).
+ Closes: #356332
+ * Improved README.Debian (thanks to Olaf van der Spek). Closes: #355702
+ * Fixed 5.0.18-8 changelog by saying in which package the NEWS.Debian
+ file is (thanks to Ross Boylan). Closes: #355978
+
+ -- Christian Hammers <ch@debian.org> Fri, 17 Mar 2006 02:32:19 +0100
+
+mysql-dfsg-5.0 (5.0.19-1) experimental; urgency=medium
+
+ * New upstream release.
+ * SECURITY: CVE-2006-3081: A bug where str_to_date(1,NULL) led to a
+ server crash has been fixed.
+ (this note has been added subsequently for reference)
+ * Renamed package libmysqlclient15 to libmysqlclient15off.
+ See /usr/share/doc/libmysqlclient15off/NEWS.Debian
+ * Updated Czech debconf translation (thanks to Miroslav Kure).
+ Closes: #356503
+ * Updated French debconf translation (thanks to Christian Perrier).
+ Closes: #356332
+ * Improved README.Debian (thanks to Olaf van der Spek). Closes: #355702
+ * Fixed 5.0.18-8 changelog by saying in which package the NEWS.Debian
+ file is (thanks to Ross Boylan). Closes: #355978
+
+ -- Christian Hammers <ch@debian.org> Tue, 14 Mar 2006 22:56:13 +0100
+
+mysql-dfsg-5.0 (5.0.18-9) unstable; urgency=medium
+
+ [ Christian Hammers ]
+ * When using apt-get the check for left-over ISAM tables can abort the
+ installation of mysql-server-5.0 but not prevent the mysql-server-4.1
+ package from getting removed. The only thing I can do is reflect this
+ in the Debconf notice that is shown and suggest reinstalling
+ mysql-server-4.1 for converting. See: #354850
+ * Suggest removing /etc/cron.daily/mysql-server in the last NEWS message
+ (thanks to Mourad De Clerck). Closes: #354111
+ * Added versioned symbols for kfreebsd and Hurd, too (thanks to Aurelien
+ Jarno and Michael Bank). Closes: #353971
+ * Added versioned symbols for kfreebsd, too (thanks to Aurelien Jarno).
+ Closes: #353971
+ [ Adam Conrad ]
+ * Add 39_scripts__mysqld_safe.sh__port_dir.dpatch to ensure that the
+ permissions on /var/run/mysqld are always correct, even on a tmpfs.
+
+ -- Christian Hammers <ch@debian.org> Mon, 6 Mar 2006 21:42:13 +0100
+
+mysql-dfsg-5.0 (5.0.18-8) unstable; urgency=low
+
+ * The rotation of the binary logs is now configured via
+ expire-logs-days in /etc/mysql/my.cnf and handled completely
+ by the server and no longer configured in debian-log-rotate.conf
+ and handled by a cron job. Thanks to David Johnson.
+ See /usr/share/doc/mysql-server-5.0/NEWS.Debian
+ * Ran aspell over some files in debian/ and learned a lot :)
+ * debian/rules: Added check if versioned symbols are really there.
+ * Updated SSL-MINI-HOWTO.
+ * Updated copyright (removed the parts regarding the now removed
+ BerkeleyDB table handler and mysql-doc package).
+ * Relocated a variable in preinst (thanks to Michael Heldebrant).
+ Closes: #349258, #352587, #351216
+ * Updated Danish debconf translation (thanks to Claus Hindsgaul).
+ Closes: #349013
+ * Updated Swedish debconf translation (thanks to Daniel Nylander).
+ Closes: #349522
+ * Updated French debconf translation (thanks to Christian Perrier).
+ Closes: #349592
+ * Fixed typo in README.Debian (thanks to Vincent Ricard).
+ * Prolonged waiting time for mysqld in the init script. Closes: #352070
+
+ -- Christian Hammers <ch@debian.org> Mon, 23 Jan 2006 23:13:46 +0100
+
+mysql-dfsg-5.0 (5.0.18-7) unstable; urgency=low
+
+ * Made mailx in debian-start.inc.sh optional and changed the dependency
+ on it to a mere recommendation. Closes: #316297
+ * the previous FTBFS patches for GNU/Hurd inadvertently led to configure
+ being regenerated, losing a couple of trivial things like our versioned
+ symbols patch, causing many nasty problems (closes: #348854).
+
+ -- sean finney <seanius@debian.org> Fri, 20 Jan 2006 20:59:27 +0100
+
+mysql-dfsg-5.0 (5.0.18-6) unstable; urgency=low
+
+ * Added version comment (thanks to Daniel van Eeden).
+ * Added two patches to build on GNU/Hurd (thanks to Michael Bank).
+ Closes: #348182
+ * Abort upgrade if old and now unsupported ISAM tables are present
+ (thanks to David Coe). Closes: #345895
+
+ -- Christian Hammers <ch@debian.org> Tue, 17 Jan 2006 19:25:59 +0100
+
+mysql-dfsg-5.0 (5.0.18-5) unstable; urgency=low
+
+ * Bump shlibdeps for libmysqlclient15 to (>= 5.0.15-1), which was
+ the first non-beta release from upstream, as well as being shortly
+ after we broke the ABI in Debian by introducing versioned symbols.
+
+ -- Adam Conrad <adconrad@0c3.net> Fri, 13 Jan 2006 13:18:03 +1100
+
+mysql-dfsg-5.0 (5.0.18-4) unstable; urgency=low
+
+ * Munge our dependencies further to smooth upgrades even more, noting
+ that we really need 5.0 to conflict with 4.1, and stealing a page from
+ the book of mysql-common, it doesn't hurt to hint package managers in
+ the direction of "hey, this stuff is a complete replacement for 4.1"
+ * Change the description of mysql-server and mysql-client to remove the
+ references to it being "transition", and instead point out that it's
+ the way to get the "current best version" of each package installed.
+
+ -- Adam Conrad <adconrad@0c3.net> Wed, 11 Jan 2006 11:39:45 +1100
+
+mysql-dfsg-5.0 (5.0.18-3) unstable; urgency=low
+
+ * Make the mysql-{client,server}-5.0 conflict against mysql-{client,server}
+ versioned, so they can be installed side-by-side and upgrade properly.
+ * Add myself to Uploaders, since I have access to the alioth repository.
+
+ -- Adam Conrad <adconrad@0c3.net> Tue, 10 Jan 2006 19:15:48 +1100
+
+mysql-dfsg-5.0 (5.0.18-2) unstable; urgency=low
+
+ * Removed the transitional package that forced an upgrade from
+ mysql-server-4.1 to mysql-server-5.0 as I was convinced that
+ having a general "mysql-server" package with adjusted dependencies
+ is enough (thanks to Adam Conrad).
+ * Updated logcheck.ignore files (thanks to Jamie McCarthy). Closes: #340193
+
+ -- Christian Hammers <ch@debian.org> Mon, 9 Jan 2006 21:54:53 +0100
+
+mysql-dfsg-5.0 (5.0.18-1) unstable; urgency=low
+
+ * New upstream version.
+ * Added empty transitional packages that force an upgrade from the
+ server and client packages that have been present in Sarge.
+ * Fixed SSL-MINI-HOWTO (thanks to Jonas Smedegaard). Closes: #340589
+
+ -- Christian Hammers <ch@debian.org> Mon, 2 Jan 2006 21:17:51 +0100
+
+mysql-dfsg-5.0 (5.0.17-1) unstable; urgency=low
+
+ * Never released as a Debian package.
+
+ -- Christian Hammers <ch@debian.org> Thu, 22 Dec 2005 07:49:52 +0100
+
+mysql-dfsg-5.0 (5.0.16-1) unstable; urgency=low
+
+ * New upstream version.
+ * Removed the error logs from the logrotate script as Debian does
+ not use them anymore. Closes: #339628
+
+ -- Christian Hammers <ch@debian.org> Tue, 22 Nov 2005 01:19:11 +0100
+
+mysql-dfsg-5.0 (5.0.15-2) unstable; urgency=medium
+
+ * Added 14_configure__gcc-atomic.h.diff to fix FTBFS on m68k
+ (thanks to Stephen R Marenka). Closes: #337082
+ * Removed dynamic linking against libstdc++ as it was not really
+ needed (thanks to Adam Conrad). Closes: #328613
+ * Fixed the "/var/lib/mysql is a symlink" workaround that accidently
+ left a stalled symlink (thanks to Thomas Lamy). Closes: #336759
+ * As the init script cannot distinguish between a broken startup and
+ one that just takes very long, the "failed" message now says
+ "or took more than 6s" (thanks to Olaf van der Spek). Closes: #335547
+
+ -- Christian Hammers <ch@debian.org> Thu, 3 Nov 2005 22:00:15 +0100
+
+mysql-dfsg-5.0 (5.0.15-1) unstable; urgency=low
+
+ * New upstream version. 5.0 has finally been declared STABLE!
+ * Added small patch to debian/rules that fixed sporadic build errors
+ where stdout and stderr were piped together, got mixed up and broke the build.
+ * Added --with-big-tables to ./configure (thanks to tj.trevelyan).
+ Closes: #333090
+ * Added capability to parse "-rc" to debian/watch.
+ * Fixed cronscript (thanks to Andrew Deason). Closes: #335244
+ * Added Swedish debconf translation (thanks to Daniel Nylander).
+ Closes: #333670
+ * Added comment to README.Debian regarding applications that manually
+ set new-style passwords... Closes: #334444
+ * Sean Finney:
+ - Fix duplicate reference to [-e|--extended-insert]. Closes: #334957
+ - Fix default behavior for mysqldumpslow. Closes: #334517
+ - Reference documentation issue in mysql manpage. Closes: #335219
+
+ -- Christian Hammers <ch@debian.org> Fri, 30 Sep 2005 00:10:39 +0200
+
+mysql-dfsg-5.0 (5.0.13rc-1) unstable; urgency=low
+
+ * New upstream release. Now "release-candidate"!
+ * Removed any dynamic link dependencies to libndbclient.so.0 which,
+ due to its version, is only distributed as a static library.
+ * Sean Finney:
+ - FTBFS fix related to stripping rpath in debian/rules
+
+ -- Christian Hammers <ch@debian.org> Mon, 26 Sep 2005 22:09:26 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-5) unstable; urgency=low
+
+ * The recent FTBFS were probably the result of a timing bug in the
+ debian/patches/75_*.dpatch file where Makefile.in got patched just
+ before the Makefile.shared which it depended on. For that reason
+ only some of the autobuilders failed. Closes: #330149
+ * Fixed chrpath removal (option -k had to be added).
+ * Corrected debconf dependency as requested by Joey Hess.
+
+ -- Christian Hammers <ch@debian.org> Mon, 26 Sep 2005 18:37:07 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-4) unstable; urgency=low
+
+ * Removed experimental shared library libndbclient.so.0.0.0 as it
+ is doomed to cause trouble as long as it is present in both MySQL 4.1
+ and 5.0 without a real soname and its own package. We still have
+ libndbclient.a for developers. (thanks to Adam Conrad and
+ mediaforest.net). Closes: #329772
+
+ -- Christian Hammers <ch@debian.org> Fri, 23 Sep 2005 12:36:48 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-3) unstable; urgency=medium
+
+ * Symbol versioning support! wooooohoooooo!
+ (thanks to Steve Langasek) Closes: #236288
+ * Moved libndbclient.so.0 to the -dev package as it is provided by
+ libmysqlclient14 and -15 which must be installable simultaneously.
+ * Removed mysql-*-doc suggestions.
+
+ -- Christian Hammers <ch@debian.org> Tue, 20 Sep 2005 00:07:03 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-2) unstable; urgency=low
+
+ * Added patch to build on GNU/kFreeBSD (thanks to Aurelien Jarno).
+ Closes: #327702
+ * Added patch that had already been present on the 4.1 branch which
+ makes the "status" command of the init script more sensible
+ (thanks to Stephen Gildea). Closes: #311836
+ * Added Vietnamese Debconf translation (thanks to Clytie Siddal).
+ Closes: #313006
+ * Updated German Debconf translation (thanks to Jens Seidel).
+ Closes: #313957
+ * Corrected comments in the example debian-log-rotate.conf. Unlike in the
+ mysql-server-4.1 package, which needed to stay backwards compatible,
+ the default is now 2 to avoid filling up the disk endlessly.
+ * Fixed watch file to be "-beta" aware.
+
+ -- Christian Hammers <ch@debian.org> Thu, 15 Sep 2005 20:50:19 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-1) unstable; urgency=medium
+
+ * Christian Hammers:
+ - New upstream release.
+ - Changed build-dep to libreadline5-dev as requested by Matthias Klose.
+ Closes: #326316
+ - Applied fix for changed output format of SHOW MASTER LOGS for
+ binary log rotation (thanks to Martin Krueger). Closes: #326427
+ - Removed explicit setting of $PATH as I saw no sense in it and
+ it introduced a bug (thanks to Quim Calpe). Closes: #326769
+ - Removed PID file creation from /etc/init.d/mysql-ndb as it does
+ not work with this daemon (thanks to Quim Calpe).
+ - Updated French Debconf translation (thanks to Christian Perrier).
+ Closes: #324805
+ - Moved conflicts line in debian/control from libmysqlclient15 to
+ libmysqlclient15-dev and removed some pre-sarge conflicts as
+ suggested by Adam Majer. Closes: #324623
+ * Sean Finney:
+ - For posterity, CAN-2005-2558 has been fixed since 5.0.7beta.
+
+ -- Christian Hammers <ch@debian.org> Thu, 15 Sep 2005 19:58:22 +0200
+
+mysql-dfsg-5.0 (5.0.11beta-3) unstable; urgency=low
+
+ * Temporarily build only with -O2 to circumvent gcc internal errors
+ (thanks to Matthias Klose). Related to: #321165
+
+ -- Christian Hammers <ch@debian.org> Thu, 18 Aug 2005 15:44:04 +0200
+
+mysql-dfsg-5.0 (5.0.11beta-2) unstable; urgency=low
+
+ * Fixed README.Debian regarding the status of mysql-doc.
+ * Added "set +e" around chgrp in mysql-server-5.0.preinst to
+ not fail on .journal files (thanks to Christophe Nowicki).
+ Closes: #318435
+
+ -- Christian Hammers <ch@debian.org> Sun, 14 Aug 2005 18:02:08 +0200
+
+mysql-dfsg-5.0 (5.0.11beta-1) unstable; urgency=low
+
+ * New upstream version.
+ * Added Danish Debconf translations (thanks to Claus Hindsgaul).
+ Closes: #322384
+ * Updated Czech Debconf translations (thanks to Miroslav Kure).
+ Closes: #321765
+
+ -- Christian Hammers <ch@debian.org> Sat, 13 Aug 2005 11:56:15 +0000
+
+mysql-dfsg-5.0 (5.0.10beta-1) unstable; urgency=low
+
+ * New upstream release.
+ * Christian Hammers:
+ - Added check for mounted /proc to debian/rules.
+ * Sean Finney:
+ - fix for fix_mysql_privilege_tables/mysql_fix_privilege_tables typo
+ in mysql-server-5.0's README.Debian (see #319838).
+
+ -- Christian Hammers <ch@debian.org> Sun, 31 Jul 2005 00:30:45 +0200
+
+mysql-dfsg-5.0 (5.0.7beta-1) unstable; urgency=low
+
+ * Second try for new upstream release.
+ * Renamed mysql-common-5.0 to mysql-common as a future libmysqlclient16
+ from e.g. MySQL-5.1 would otherwise introduce mysql-common-5.1 which makes
+ a simultaneous installation of libmysqlclient14 impossible as that
+ depends on either mysql-common or mysql-common-5.0 but not on future
+ versions. Thus we decided to always let the newest MySQL version
+ provide mysql-common.
+ * Added ${misc:Depends} as suggested by the debhelper manpage.
+ * Raised standards version in control file to 3.6.2.
+ * Removed DH_COMPAT from rules in favour of debian/compat.
+ * Checks for presence of the init script before executing it in preinst.
+ Refers: #315959
+ * Added 60_includes_mysys.h__gcc40.dpatch for GCC-4.0 compatibility.
+
+ -- Christian Hammers <ch@debian.org> Wed, 29 Jun 2005 00:39:05 +0200
+
+mysql-dfsg-5.0 (5.0.5beta-1) unstable; urgency=low
+
+ * New major release! Still beta so be careful...
+ * Added federated storage engine.
+
+ -- Christian Hammers <ch@debian.org> Wed, 8 Jun 2005 19:29:45 +0200
+
+mysql-dfsg-4.1 (4.1.12-1) unstable; urgency=low
+
+ * Christian Hammers:
+ - New upstream release.
+ - Disabled BerkeleyDB finally. It has been obsoleted by InnoDB.
+ * Sean Finney:
+ - Updated French translation from Christian Perrier (Closes: #310526).
+ - Updated Japanese translation from Hideki Yamane (Closes: #310263).
+ - Updated Russian translation from Yuriy Talakan (Closes: #310197).
+
+ -- Christian Hammers <ch@debian.org> Sat, 4 Jun 2005 05:49:11 +0200
+
+mysql-dfsg-4.1 (4.1.11a-4) unstable; urgency=high
+
+ * Fixed FTBFS problem which was caused by the fact that the last upload's
+ BerkeleyDB patch was applied on all architectures and not only
+ on those where BerkeleyDB is actually being built. Closes: #310296
+
+ -- Christian Hammers <ch@debian.org> Mon, 23 May 2005 00:54:51 +0200
+
+mysql-dfsg-4.1 (4.1.11a-3) unstable; urgency=high
+
+ * Added patch from Piotr Roszatycki to compile the bundled db3 library
+ that is needed for the BerkeleyDB support with versioned symbols so
+ that mysqld no longer crashes when it gets linked together with the
+ Debian db3 version which happens when e.g. using libnss-db.
+ Closes: #308966
+
+ -- Christian Hammers <ch@debian.org> Thu, 19 May 2005 01:41:14 +0200
+
+mysql-dfsg-4.1 (4.1.11a-2) unstable; urgency=high
+
+ * Okay, the hackery with /var/lib/dpkg/info/mysql-server.list will not
+ stand and is removed from the preinst of mysql-server.
+ * The new workaround for the symlink problem that does not involve mucking
+ with dpkg's file lists is storing the symlinks in a temporary location
+ across upgrades.
+ As this sometimes fails since apt-get does not always call new.preinst
+ before old.postrm, some remarks were added to README.Debian and the
+ Debconf installation notes to minimize the inconvenience this causes.
+
+ -- sean finney <seanius@debian.org> Sun, 15 May 2005 10:25:31 -0400
+
+mysql-dfsg-4.1 (4.1.11a-1) unstable; urgency=high
+
+ * Added the "a" to the version number to be able to upload a new
+ .orig.tar.gz file which now has the non-free Docs/ directory removed
+ as this has been forgotten in the 4.1.11 release (thanks to Goeran
+ Weinholt). Closes: #308691
+ * The Woody package listed /var/lib/mysql and /var/log/mysql in its
+ /var/lib/dpkg/info/mysql-server.list. These directories are often
+ replaced by symlinks to data partitions which triggers a dpkg bug
+ that causes these symlinks to be removed on upgrades. The new preinst
+ prevents this by removing the two lines from the .list file
+ (thanks to Andreas Barth and Jamin W. Collins). See dpkg bug #287978.
+ * Updated French Debconf translation (thanks to Christian Perrier).
+ Closes: #308353
+
+ -- Christian Hammers <ch@debian.org> Thu, 12 May 2005 21:52:46 +0200
+
+mysql-dfsg-4.1 (4.1.11-3) unstable; urgency=high
+
+ * The "do you want to remove /var/lib/mysql when purging the package" flag
+ from old versions is removed once this package is beeing installed so
+ that purging an old Woody mysql-server package while having a
+ mysql-server-4.1 package installed can no longer lead to the removal of
+ all databases. Additionaly clarified the wording of this versions Debconf
+ template and added a check that skips this purge in the postrm script
+ if another mysql-server* package has /usr/sbin/mysqld installed.
+ (thanks to Adrian Bunk for spotting that problem) Closes: #307473
+ * Cronfile was not beeing installed as the filename was not in the
+ correct format for "dh_installcron --name" (thanks to Tomislav
+ Gountchev). Closes: #302712
+
+ -- Christian Hammers <ch@debian.org> Sat, 23 Apr 2005 22:55:15 +0200
+
+mysql-dfsg-4.1 (4.1.11-2) unstable; urgency=low
+
+ * Sean Finney:
+ - don't freak out if we can't remove /etc/mysql during purge.
+ - debian/rules clean works again.
+ * Christian Hammers:
+ - Fixed typo in README.Debian (thanks to Joerg Rieger). Closes: #304897
+ - Completely removed the passwordless test user as it was not only
+ insecure but also led to irritations as MySQL checks first the
+ permissions of this user and then those of one having a password.
+ See bug report from Hilko Bengen for details. Closes: #301741
+
+ -- Christian Hammers <ch@debian.org> Sat, 16 Apr 2005 15:55:00 +0200
+
+mysql-dfsg-4.1 (4.1.11-1) unstable; urgency=low
+
+ * New upstream version.
+ * Upstream fix for charset/collation problem. Closes: #282256
+ * Upstream fix for subselect crash. Closes: #297687
+ * Corrected minor issue in Debconf template regarding skip-networking
+ (thanks to Isaac Clerencia). Closes: #303417
+ * Made the dependency on gawk unnecessary (thanks to Zoran Dzelajlija).
+ Closes: #302284
+ * Removed obsolete 50_innodb_mixlen.dpatch.
+ * Removed obsolete 51_CAN-2004-0957_db_grant_underscore.dpatch.
+
+ -- Christian Hammers <ch@debian.org> Fri, 8 Apr 2005 00:23:53 +0200
+
+mysql-dfsg-4.1 (4.1.10a-7) unstable; urgency=low
+
+ * Sean Finney:
+ - fix for the mysteriously disappeared cronjob. thanks to
+ Peter Palfrader <weasel@debian.org> for pointing out this omission.
+ (closes: #302712).
+
+ -- sean finney <seanius@debian.org> Sat, 02 Apr 2005 16:54:13 -0500
+
+mysql-dfsg-4.1 (4.1.10a-6) unstable; urgency=high
+
+ * Sean Finney:
+ - the previous upload did not completely address the issue. this one
+ should do so. d'oh.
+
+ -- sean finney <seanius@debian.org> Thu, 31 Mar 2005 03:35:50 +0000
+
+mysql-dfsg-4.1 (4.1.10a-5) unstable; urgency=high
+
+ * Sean Finney:
+ - the following security issue is addressed in this upload:
+ CAN-2004-0957 (grant privilege escalation on tables with underscores)
+ thanks to sergei at mysql for all his help with this.
+
+ -- sean finney <seanius@debian.org> Wed, 30 Mar 2005 21:19:26 -0500
+
+mysql-dfsg-4.1 (4.1.10a-4) unstable; urgency=low
+
+ * Sean Finney:
+ - FTBFS fix for amd64/gcc-4.0. Thanks to Andreas Jochens <aj@andaco.de>
+ for reporting this (closes: #301807).
+ - ANSI-compatible quoting fix in daily cron job. thanks to
+ Karl Hammar <karl@aspodata.se> for pointing out the problem in
+ the 4.0 branch.
+ - Added myself as a co-maintainer in the control file (closes: #295312).
+
+ -- sean finney <seanius@debian.org> Tue, 29 Mar 2005 18:54:42 -0500
+
+mysql-dfsg-4.1 (4.1.10a-3) unstable; urgency=low
+
+ * BerkeleyDB is now disabled by default as its use is discouraged by MySQL.
+ * Added embedded server libraries as they finally do compile.
+ They are currently in libmysqlclient-dev as they are still
+ experimental and only available as .a library (thanks to Keith Packard).
+ Closes: #297062
+ * Fixed obsolete "tail" syntax (thanks to Sven Mueller). Closes: #301413
+ * Added CAN numbers for the latest security bugfix upload.
+ * Updated manpage of mysqlmanager (thanks to Justin Pryzby). Closes: #299844
+ * Added comments to default configuration.
+
+ -- Christian Hammers <ch@debian.org> Sun, 20 Mar 2005 17:40:18 +0100
+
+mysql-dfsg-4.1 (4.1.10a-2) unstable; urgency=low
+
+ * Disabled "--with-mysqld-ldflags=-all-static" as it causes sig11 crashes
+ if LDAP is used for groups in /etc/nsswitch.conf. Confirmed by Sean Finney
+ and Daniel Dehennin. Closes: #299382
+
+ -- Christian Hammers <ch@debian.org> Mon, 14 Mar 2005 03:01:03 +0100
+
+mysql-dfsg-4.1 (4.1.10a-1) unstable; urgency=high
+
+ * SECURITY:
+ - The following security related updates are addressed:
+ CAN-2005-0711 (temporary file creation with "CREATE TEMPORARY TABLE")
+ CAN-2005-0709 (arbitrary library injection in udf_init())
+ CAN-2005-0710 (arbitrary code execution via "CREATE FUNCTION")
+ Closes: #299029, #299031, #299065
+ * New Upstream Release.
+ - Fixes some server crash conditions.
+ - Upstream includes fix for TMPDIR overriding my.cnf tmpdir setting
+ Closes: #294347
+ - Fixes InnoDB error message. Closes: #298875
+ - Fixes resource limiting. Closes: #285044
+ * Improved checking whether or not the server is alive in the init script
+ which should make it possible to run several mysqld instances in
+ different chroot environments. Closes: #297772
+ * Fixed cron script name as dots are not allowed (thanks to Michel
+ v/d Ven). Closes: #298447
+ * Added -O3 and --with-mysqld-ldflags=-all-static as MySQL recommends to
+ build the server binary statically in order to gain about 13% more
+ performance (thanks to Marcin Kowalski).
+ * Added patch to let mysqld_safe react to signals (thanks to Erich
+ Schubert). Closes: #208364
+ * (Thanks to Sean Finney for doing a great share of work for this release!)
+
+ -- Christian Hammers <ch@debian.org> Thu, 3 Mar 2005 02:36:39 +0100
+
+mysql-dfsg-4.1 (4.1.10-4) unstable; urgency=medium
+
+ * Fixed bug that prevented MySQL from starting after upgrades.
+ Closes: #297198, #296403
+ * Added comment about logging to syslog to the default my.cnf
+ and the logrotate script (thanks to Ryszard Lach). Closes: #295507
+
+ -- Christian Hammers <ch@debian.org> Thu, 3 Mar 2005 00:28:02 +0100
+
+mysql-dfsg-4.1 (4.1.10-3) unstable; urgency=low
+
+ * Sean Finney: Cronjobs now exit silently when the server package
+ has been removed but not purged (thanks to Vineet Kumar).
+ Closes: #297404
+ * Fixed comments of /etc/mysql/debian-log-rotate.conf (thanks to
+ Philip Ross). Closes: #297467
+ * Made mysqld_safe react sanely to signals (thanks to Erich Schubert).
+ Closes: #208364
+
+ -- Christian Hammers <ch@debian.org> Tue, 1 Mar 2005 19:44:34 +0100
+
+mysql-dfsg-4.1 (4.1.10-2) unstable; urgency=low
+
+ * Converted to dpatch.
+ * debian/ is now maintained via Subversion on svn.debian.org.
+
+ -- Christian Hammers <ch@debian.org> Tue, 1 Mar 2005 02:16:36 +0100
+
+mysql-dfsg-4.1 (4.1.10-1) unstable; urgency=low
+
+ * New upstream version.
+ * Upstream fixed memleak bug. Closes: #205587
+ * Added debian/copyright.more for personal reference.
+ * Lowered default query cache size as suggested by Arjen from MySQL.
+ * Switched from log to log-bin as suggested by Arjen from MySQL.
+ * Fixed typo in my.cnf (thanks to Sebastian Feltel). Closes: #295247
+ * Replaced --defaults-extra-file by --defaults-file in Debian scripts
+ as the former lets password/host etc. be overridden by /root/.my.cnf.
+ Added socket to /etc/mysql/debian.cnf to let it work. (thanks to
+ SATOH Fumiyasu). Closes: #295170
+
+ -- Christian Hammers <ch@debian.org> Tue, 15 Feb 2005 23:47:02 +0100
+
+mysql-dfsg-4.1 (4.1.9-4) unstable; urgency=low
+
+ * Improved the way mysqld is started and registered with update-rc.d
+ in cases where the admin modifies the runlevel configuration.
+ Most notably removed the debconf question whether or not mysql should
+ start when booting. Closes: #274264
+ * Renamed configuration option old-passwords to the more preferred
+ naming convention old_passwords. Same for some others (thanks to
+ Patrice Pawlak). Closes: #293983
+
+ -- Christian Hammers <ch@debian.org> Tue, 8 Feb 2005 02:21:18 +0100
+
+mysql-dfsg-4.1 (4.1.9-3) unstable; urgency=low
+
+ * Renamed ca_ES.po to ca.po to reach a broader audience (thanks to
+ Christian Perrier). Closes: #293786
+ * Explicitly disabled mysqlfs support as it has never been enabled by
+ configure during the autodetection but fails due to broken upstream
+ code when users try to build the package themselves while having
+ liborbit-dev installed which triggers the mysqlfs autodetection
+ (thanks to Max Kellermann). Closes: #293431
+ * Added a dependency on gawk as one script does not work with original-awk
+ (thanks to Petr Ferschmann). Closes: #291634
+
+ -- Christian Hammers <ch@debian.org> Sun, 6 Feb 2005 23:33:11 +0100
+
+mysql-dfsg-4.1 (4.1.9-2) unstable; urgency=high
+
+ * SECURITY:
+ For historical reasons /usr/share/mysql/ was owned and writable by
+ the user "mysql". This is a security problem as some scripts that
+ are run by root are in this directory and could be modified and used
+ by a malicious user who already has mysql privileges to gain full root
+ rights (thanks to Matt Brubeck). Closes: #293345
+ * Changed "skip-networking" to "bind-address 127.0.0.1" which is more
+ compatible and not less secure but maybe even more, as less people enable
+ networking for all interfaces (thanks to Arjen Lentz).
+ * Enabled InnoDB by default as recommended by Arjen Lentz from MySQL.
+ * Added remarks about hosts.allow to README.Debian (thanks to David
+ Chappell). Closes: #291300
+ * mysql-server-4.1 now provides mysql-server (thanks to Paul van den Berg).
+ Closes: #287735
+
+ -- Christian Hammers <ch@debian.org> Wed, 2 Feb 2005 23:31:55 +0100
+
+mysql-dfsg-4.1 (4.1.9-1) unstable; urgency=low
+
+ * New upstream version.
+ * mysql-client-4.1 now provides "mysql-client" so that packages depending
+ on mysql-client (ca. 40) can now be used with MySQL-4.1, too.
+
+ -- Christian Hammers <ch@debian.org> Sun, 23 Jan 2005 22:52:48 +0100
+
+mysql-dfsg-4.1 (4.1.8a-6) unstable; urgency=high
+
+ * SECURITY:
+ Javier Fernandez-Sanguino Pena from the Debian Security Audit Project
+ discovered a temporary file vulnerability in the mysqlaccess script of
+ MySQL that could allow an unprivileged user to let root overwrite
+ arbitrary files via a symlink attack and could also unveil the
+ contents of a temporary file which might contain sensitive information.
+ (CAN-2005-0004, http://lists.mysql.com/internals/20600) Closes: #291122
+
+ -- Christian Hammers <ch@debian.org> Tue, 18 Jan 2005 23:11:48 +0100
+
+mysql-dfsg-4.1 (4.1.8a-5) unstable; urgency=medium
+
+ * Fixed important upstream bug that causes from_unixtime(0) to return
+ NULL instead of "1970-01-01 00:00:00" which fails on NOT NULL columns.
+ Closes: #287792
+ * Fixes upstream bug in mysql_list_fields(). Closes: #282486
+ * Fixes bug that led to doubly rotated logfiles when mysql-server 4.0
+ was previously installed (thanks to Olaf van der Spek). Closes: #289851
+ * Fixed typo in README.Debian (thanks to Mark Nipper). Closes: #289131
+ * Changed max_allowed_packet in my.cnf to 16M as in 4.0.x (thanks to
+ Olaf van der Spek). Closes: #289840
+ * Updated French debconf translation (thanks to Christian Perrier).
+ Closes: #287955
+
+ -- Christian Hammers <ch@debian.org> Thu, 13 Jan 2005 01:29:05 +0100
+
+mysql-dfsg-4.1 (4.1.8a-4) unstable; urgency=low
+
+ * Broken patch again :-(
+
+ -- Christian Hammers <ch@debian.org> Sun, 9 Jan 2005 23:47:55 +0100
+
+mysql-dfsg-4.1 (4.1.8a-3) unstable; urgency=low
+
+ * The mutex patch was a bit too x86 centric. This broke the alpha build.
+
+ -- Christian Hammers <ch@debian.org> Sun, 9 Jan 2005 14:18:49 +0100
+
+mysql-dfsg-4.1 (4.1.8a-2) unstable; urgency=medium
+
+ * Some Makefiles that were patched by me got overwritten by the GNU
+ autotools, probably because I also patched ./configure. Fixed now,
+ the critical mutex patch is now back in again. Closes: #286961
+ * Added patch to make MySQL compile on ARM (thanks to Adam Majer).
+ Closes: #285071
+
+ -- Christian Hammers <ch@debian.org> Thu, 6 Jan 2005 09:30:13 +0100
+
+mysql-dfsg-4.1 (4.1.8a-1) unstable; urgency=medium
+
+ * Upstream 4.1.8 had some problems in their GNU Autotools files so they
+ released 4.1.8a. Debian's 4.1.8 was fixed by running autoreconf but this
+ again overwrote MySQL changes to ltmain.sh which are supposed to fix some
+ problems on uncommon architectures (maybe the FTBFS on alpha, arm, m68k
+ and sparc?).
+ * libmysqlclient_r.so.14 from 4.1.8-3 also missed a link dependency to
+ libz which led to unresolved symbols visible with "ldd -r" (thanks
+ to Laurent Bonnaud). Closes: #287573
+
+ -- Christian Hammers <ch@debian.org> Wed, 29 Dec 2004 14:26:33 +0100
+
+mysql-dfsg-4.1 (4.1.8-3) unstable; urgency=low
+
+ * Fixed checking for error messages by forcing English language
+ output by adding LC_ALL=C to debian-start (thanks to Rene
+ Konasz) Closes: #285709
+ * Fixed bashisms in Debian scripts. Closes: #286863
+ * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+ Closes: #287003
+ * Improved 4.0 to 4.1 upgrade if /var/lib/mysql is a symlink
+ (thanks to Thomas Lamy). Closes: #286560
+ * Added patch for FTBFS problem where no LinuxThreads can be found.
+ I don't know if this still applies but it should not hurt.
+ The patch is debian/patches/configure__AMD64-LinuxThreads-vs-NPTL.diff
+
+ -- Christian Hammers <ch@debian.org> Sun, 26 Dec 2004 14:04:20 +0100
+
+mysql-dfsg-4.1 (4.1.8-2) unstable; urgency=low
+
+ * If /var/lib/mysql is a symlink then it is kept as such.
+ * Added the old-passwords option to the default my.cnf to stay
+ compatible with clients that are still compiled against libmysqlclient10
+ and libmysqlclient12 for licence reasons.
+ * Adjusted tetex build-deps to ease backporting (thanks to Norbert
+ Tretkowski from backports.org).
+
+ -- Christian Hammers <ch@debian.org> Tue, 21 Dec 2004 01:00:27 +0100
+
+mysql-dfsg-4.1 (4.1.8-1) unstable; urgency=medium
+
+ * New upstream version. Closes: #286175
+ * Added conflict to libmysqlclient-dev (thanks to Adam Majer).
+ Closes: #286538
+ * Added debconf-updatepo to debian/rules:clean.
+ * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+ Closes: #285107
+ * Updated French Debconf translation (thanks to Christian Perrier).
+ Closes: #285977
+ * Renamed cz.po to cs.po (thanks to Miroslav Kure). Closes: #285438
+ * Applied patch for changed server notice to debian-start (thanks to
+ Adam Majer). Closes: #286035
+ * Changed nice value in default my.cnf as nohup changed its behaviour
+ (thanks to Dariush Pietrzak). Closes: #285446
+ * Increased verbosity of preinst script in cases where it cannot stop
+ a running server (thanks to Jan Minar). Closes: #285982
+ * Split the code parts of /etc/mysql/debian-start into
+ /usr/share/mysql/debian-start.inc.sh (thanks to Jan Minar).
+ Closes: #285988
+
+ -- Christian Hammers <ch@debian.org> Mon, 20 Dec 2004 00:33:21 +0100
+
+mysql-dfsg-4.1 (4.1.7-4) unstable; urgency=medium
+
+ * Removed OpenSSL support.
+ After a short discussion with MySQL, I decided to drop OpenSSL support as
+ 1. MySQL started shipping their binaries without it, too, and do not
+ seem to support it, in favour of using a different library at some point.
+ 2. MySQL did not adjust their licence to grant permission to link
+ against OpenSSL.
+ 3. Even if they did, third parties who use libmysqlclient.so often
+ do not realise the licensing problems or simply do not want OpenSSL.
+ (thanks to Jordi Mallach and the responders to MySQL bug #6924)
+ Closes: #283786
+ * debian/control: Improved depends and conflicts to mysql-4.0.
+
+ -- Christian Hammers <ch@debian.org> Thu, 2 Dec 2004 22:02:28 +0100
+
+mysql-dfsg-4.1 (4.1.7-3) unstable; urgency=low
+
+ * Raised version to make it higher than the one in experimental.
+
+ -- Christian Hammers <ch@debian.org> Wed, 1 Dec 2004 21:09:20 +0100
+
+mysql-dfsg-4.1 (4.1.7-2) unstable; urgency=low
+
+ * Patched scripts/mysql_install_db so that it no longer creates a
+ passwordless test database during installation (thanks to Patrick
+ Schnorbus). Closes: #281158
+ * Added Czech debconf translation (thanks to Miroslav Kure).
+ Closes: #283222
+
+ -- Christian Hammers <ch@debian.org> Wed, 1 Dec 2004 01:29:31 +0100
+
+mysql-dfsg-4.1 (4.1.7-1) unstable; urgency=low
+
+ * New upstream branch!
+ * Adjusted debian/control to make this package suitable to go into
+ unstable and sarge in parallel with version 4.0.x. The package names are
+ different so that "mysql-server" still defaults to the rock-stable
+ 4.0 instead of to this announced-to-be-stable 4.1.
+ * Added --with-mutex=i86/gcc-assemler to the Berkeley-DB configure
+ to prevent the use of NPTL threads when compiling under kernel 2.6
+ because the binaries are otherwise not runnable on kernel 2.4 hosts.
+ Closes: #278638, #274598
+
+ -- Christian Hammers <ch@debian.org> Sun, 31 Oct 2004 20:15:03 +0100
+
+mysql-dfsg (4.1.6-1) experimental; urgency=low
+
+ * New upstream version.
+ * Fixed symlinks in libmysqlclient-dev package. Closes: #277028
+ * This time I did not update the libtool files as they were pretty
+ up to date and I want to have a shorter diff file.
+
+ -- Christian Hammers <ch@debian.org> Wed, 20 Oct 2004 00:07:58 +0200
+
+mysql-dfsg (4.1.5-3) experimental; urgency=low
+
+ * debian/postinst: mysql_install_db changed parameter from --IN-RPM
+ to --rpm which caused problems during installs. Closes: #276320
+
+ -- Christian Hammers <ch@debian.org> Sat, 16 Oct 2004 20:36:46 +0200
+
+mysql-dfsg (4.1.5-2) experimental; urgency=low
+
+ * Activated support for ndb clustering (thanks to Kevin M. Rosenberg).
+ Closes: #275109
+
+ -- Christian Hammers <ch@debian.org> Wed, 6 Oct 2004 01:58:00 +0200
+
+mysql-dfsg (4.1.5-1) experimental; urgency=low
+
+ * WARNING:
+ The upstream branch 4.1 is still considered BETA.
+ The Debian packages for 4.1 were done without much testing. If you miss
+ a new functionality or binary, contact me and I will check and add the
+ relevant configure option or include the program.
+ * New MAJOR upstream version.
+ Thanks to the great demand here's now the first MySQL 4.1 experimental
+ release. FEEDBACK IS WELCOME.
+ * 4.0->4.1 notes:
+ - debian/patches/alpha.diff could not be applied, I will fix that later
+ - debian/patches/scripts__mysql_install_db.sh.diff was obsolete
+ - debian/patches/scripts__Makefile.in was necessary due to a dependency
+ on the removed non-free Docs/ directory. Upstream has been contacted.
+ - Build-Deps: += automake1.7
+ - debian/rules: embedded servers examples did not compile, removed
+
+ -- Christian Hammers <ch@debian.org> Sun, 26 Sep 2004 19:46:47 +0200
+
+mysql-dfsg (4.0.21-3) unstable; urgency=low
+
+ * Upstream tried to fix a security bug in mysqlhotcopy and broke it :-)
+ Applied a patch (see debian/patches) from Martin Pitt. Closes: #271632
+ * Between 4.0.20 and 4.0.21 the Debian specific changes in
+ /usr/bin/mysqld_safe that piped the error log to syslog got lost
+ and are now back again.
+ * Fixed capitalization in debconf headings.
+ * Changed wording of the initscript status message to make heartbeat
+ happier. Closes: #271591
+
+ -- Christian Hammers <ch@debian.org> Fri, 17 Sep 2004 18:42:25 +0200
+
+mysql-dfsg (4.0.21-2) unstable; urgency=medium
+
+ * The dependencies between mysql-client and libmysqlclient12 were
+ too loose; when upgrading only the client this can lead to non-working
+ binaries due to relocation errors (thanks to Dominic Cleal).
+ Closes: #271803
+ * Fixed typo in mysqldump.1 manpage (thanks to Nicolas Francois).
+ Closes: #271334
+
+ -- Christian Hammers <ch@debian.org> Wed, 15 Sep 2004 15:38:11 +0200
+
+mysql-dfsg (4.0.21-1) unstable; urgency=high
+
+ * SECURITY:
+ This upstream version fixes some security problems that might at least
+ allow a DoS attack on the server.
+ * Fixed an old bug in concurrent accesses to `MERGE' tables (even
+ one `MERGE' table and `MyISAM' tables), that could've resulted in
+ a crash or hang of the server. (Bug #2408)
+ * Fixed bug in privilege checking where, under some conditions, one
+ was able to grant privileges on a database he had no privileges
+ on. (Bug #3933)
+ * Fixed crash in `MATCH ... AGAINST()' on a phrase search operator
+ with a missing closing double quote. (Bug #3870)
+ * Fixed potential memory overrun in `mysql_real_connect()' (which
+ required a compromised DNS server and certain operating systems).
+ (Bug #4017)
+ * New upstream version.
+ * Fixes bug that made x="foo" in WHERE sometimes the same as x="foo ".
+ Closes: #211618
+ * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+ Closes: #271097
+
+ -- Christian Hammers <ch@debian.org> Sat, 11 Sep 2004 23:15:44 +0200
+
+mysql-dfsg (4.0.20-14) unstable; urgency=low
+
+ * Dave Rolsky spotted that -DBIG_JOINS was not properly enabled.
+ It allows joining 64 instead of 32 tables.
+
+ -- Christian Hammers <ch@debian.org> Thu, 9 Sep 2004 20:24:02 +0200
+
+mysql-dfsg (4.0.20-13) unstable; urgency=medium
+
+ * Fixed a bug in the initscript which caused the check for not properly
+ closed, i.e. corrupt, tables that is executed when the server starts
+ not to run in the background as supposed.
+ Although the check does not repair anything, on servers with several
+ thousand tables the script was reported to take some minutes, which
+ is quite annoying. (Thanks to Jakob Goldbach). Closes: #270800
+
+ -- Christian Hammers <ch@debian.org> Thu, 9 Sep 2004 17:11:05 +0200
+
+mysql-dfsg (4.0.20-12) unstable; urgency=medium
+
+ * Filter messages regarding table handlers that do not support CHECK TABLE
+ in the script that checks for corrupted tables on every start which lead
+ to unnecessary mails (thanks to David Everly). Closes: #269811
+ * Added a note to the corrupt-table-check mail which notes that a
+ false-positive is reported in the case that immediately after starting
+ the server a client starts using a table (thanks to Uwe Kappe).
+ Closes: #269985
+ * Added "quote-names" as default to the [mysqldump] section in
+ /etc/mysql/my.cnf as too many users stumble over dump files that
+ could not be read in again due to the valid use of reserved words
+ as table names. This has also been done by upstream in 4.1.1 and has
+ no known drawbacks. Closes: #269865
+ * Binary logs can now be rotated as well. Defaults to off, though, for
+ compatibility reasons (thanks to Mark Ferlatte). Closes: #94230, #269110
+ * The mysql user "debian-sys-maint" now gets all possible rights which
+ makes binary logging possible and helps other package maintainers who
+ want to use it to create package specific databases and users.
+ * Added example how to change daemon nice level via /etc/mysql/my.cnf
+ * Updated French debconf translations (thanks to Christian Perrier).
+ Closes: #265811
+ * Renamed options in the default config file that still had old names
+ (thanks to Yves Kreis). Closes: #266445
+ * Fixed spelling in debconf note.
+ * Added -l and -L to dh_shlibdeps.
+
+ -- Christian Hammers <ch@debian.org> Fri, 3 Sep 2004 20:10:46 +0200
+
+mysql-dfsg (4.0.20-11) unstable; urgency=high
+
+ * SECURITY
+ This version fixes a security flaw in mysqlhotcopy which created
+ temporary files in /tmp which had predictable filenames and as such
+ could be used for a tempfile run attack.
+ The issue has been recorded as CAN-2004-0457.
+
+ -- Christian Hammers <ch@debian.org> Sat, 14 Aug 2004 18:27:19 +0200
+
+mysql-dfsg (4.0.20-10) unstable; urgency=low
+
+ * MySQL finally updated their copyright page and installed v1.5 of
+ the "Free/Libre and Open Source Software License (FLOSS) - Exception"
+ which will hopefully end the license hell they created by putting the
+ client libraries under GPL instead of LGPL which conflicts with PHP and
+ other software that used to link against MySQL.
+ The license text is not yet in any release MySQL version but visible
+ on their web site and copied into the debian/copyright file.
+ Special thanks to Zak Greant <zak@mysql.com> and the debian-legal list
+ for helping to solve this release critical problem.
+ Closes: #242449
+ * Updated Brazil debconf translation (thanks to Andre Luis Lopes).
+ Closes: #264233
+ * Updated Japanese debconf translation (thanks to Hideki Yamane).
+ Closes: #264620
+ * Fixed minor typo in debconf description (thanks to TROJETTE Mohammed
+ Adnene). Closes: #264840
+ * Improved init and preinst script which now detects stalled servers which
+ no longer communicate but are present in the process list (thanks to
+ Henrik Johansson). Closes: #263215
+
+ -- Christian Hammers <ch@debian.org> Mon, 9 Aug 2004 19:44:28 +0200
+
+mysql-dfsg (4.0.20-9) unstable; urgency=medium
+
+ * Partly reverted the last patch which gave the mysql-user
+ "debian-sys-maint" more rights as there are old versions of MySQL which
+ have fewer privilege columns. Now only those are set (thanks to Alan Tam).
+ Closes: #263111
+
+ -- Christian Hammers <ch@debian.org> Tue, 3 Aug 2004 13:03:02 +0200
+
+mysql-dfsg (4.0.20-8) unstable; urgency=low
+
+ * The mysqlcheck that is started from the initscript will now be
+ backgrounded because it might otherwise prevent the boot process from continuing.
+ It also now notifies root by mail and syslog if a table is corrupt.
+ * The "debian-sys-maint" MySQL user now has almost full rights so that other
+ packages might use this account to create databases and users (thanks to
+ Andreas Barth). Closes: #262541
+ * Added paranoid rules for logcheck.
+
+ -- Christian Hammers <ch@debian.org> Sun, 1 Aug 2004 21:00:55 +0200
+
+mysql-dfsg (4.0.20-8) unstable; urgency=low
+
+ * Upload stalled. Not released.
+
+ -- Christian Hammers <ch@debian.org> Sun, 1 Aug 2004 20:27:55 +0200
+
+mysql-dfsg (4.0.20-7) unstable; urgency=medium
+
+ * Solved the upstream bug that error messages of the server are written
+ to a file that is then rotated away, leaving mysqld logging effectively
+ to /dev/null. It now logs to a /usr/bin/logger process which puts the
+ messages into the syslog.
+ Modified files: /etc/init.d/mysql, /usr/bin/mysqld_safe and the
+ logchecker files. Closes: #254070
+ * The initscript does no longer call mysqlcheck directly but via
+ /etc/mysql/debian-start which is a user customizable config script.
+ * Splitted the debconf "install and update notes" and only show them
+ when it is appropriate (thanks to Steve Langasek). Closes: #240515
+ * Added NEWS.Debian.
+ * Added hint to -DBIG_ROWS, which is currently not used, to README.Debian.
+ * Corrected typo in myisampack manpage (thanks to Marc Lehmann).
+ Closes: #207090
+ * Added Catalan debconf translation (thanks to Aleix Badia i Bosch).
+ Closes: #236651
+
+ -- Christian Hammers <ch@debian.org> Wed, 28 Jul 2004 01:41:51 +0200
+
+mysql-dfsg (4.0.20-6) unstable; urgency=low
+
+ * The build arch detected by configure was "pc-linux-gnu (i686)"
+ instead of "pc-linux-gnu (i386)". Was no problem AFAIK but
+ Adam Majer asked me to explicitly change it to i386. Closes: #261382
+ * Removed some unused shell scripts from /usr/share/mysql.
+ * Added lintian overrides.
+ * Removed rpath by using chrpath.
+
+ -- Christian Hammers <ch@debian.org> Mon, 26 Jul 2004 00:17:12 +0200
+
+mysql-dfsg (4.0.20-5) unstable; urgency=medium
+
+ * The mysqlcheck in the init script is only called when the server
+ is really alive. Also, the mysql-user 'debian-sys-maint' now has
+ global select rights (thanks to Nathan Poznick). Closes: #261130
+ * Moved the debconf question whether to remove the databases or not
+ from mysql-server.config to mysql-server.postrm so that it shows
+ up at purge time and not months earlier (thanks to Wouter Verhelst).
+ Closes: #251838
+
+ -- Christian Hammers <ch@debian.org> Fri, 23 Jul 2004 22:41:13 +0200
+
+mysql-dfsg (4.0.20-4) unstable; urgency=low
+
+ * Added a "mysqlcheck -A --fast" to the 'start' section of the
+ init script to help admins detect corrupt tables after a server crash.
+ Currently it exits with an error message but leaves the server
+ running. Feedback appreciated!
+ * Made postinst script more robust by calling db_stop earlier and
+ so prevent pipe-deadlocks.
+ * Fixed minor typos in initscript (thanks to "C.Y.M."). Closes: #259518
+ * Added the undocumented "-DBIG_JOINS" that MySQL apparently uses in
+ their MAX binaries. It enables 62 instead of 30 tables in a "join".
+ (thanks to Dave Rolsky). Closes: #260843
+ * Added a "df --portability /var/lib/mysql/." check to the preinst
+ script as users experienced hard to kill hanging mysqlds in such
+ a situation (thanks to Vaidas Pilkauskas). Closes: #260306
+
+ -- Christian Hammers <ch@debian.org> Fri, 23 Jul 2004 00:51:32 +0200
+
+mysql-dfsg (4.0.20-3) unstable; urgency=low
+
+ * Improved tolerance if the init script has been deleted (thanks to
+ Leonid Shulov for spotting the problem).
+ * Minor wording changes to README.Debian generalizing /root/ by $HOME
+ (thanks to Santiago Vila). Closes: #257725
+ * Added Japanese debconf translation (thanks to Hideki Yamane).
+ Closes: #256485
+ * Fixed comment in my.cnf regarding logfile directory (thanks to Jayen
+ Ashar). Closes: #253434
+ * Correted "ease to" by "ease of" in package description (thanks to
+ Johannes Berg). Closes: #253510
+
+ -- Christian Hammers <ch@debian.org> Fri, 9 Jul 2004 00:57:42 +0200
+
+mysql-dfsg (4.0.20-2) unstable; urgency=low
+
+ * Removed RPM .spec file from the included documentation as it is pretty
+ useless (thanks to Loic Minier).
+ * Added Turkish debconf translation (thanks to Recai Oktas). Closes: #252802
+
+ -- Christian Hammers <ch@debian.org> Sun, 6 Jun 2004 14:48:26 +0200
+
+mysql-dfsg (4.0.20-1) unstable; urgency=low
+
+ * New upstream version.
+
+ -- Christian Hammers <ch@debian.org> Mon, 31 May 2004 23:36:39 +0200
+
+mysql-dfsg (4.0.18-8) unstable; urgency=low
+
+ * Updated French translation (thanks to Christian Perrier). Closes: #246789
+
+ -- Christian Hammers <ch@debian.org> Tue, 4 May 2004 23:26:54 +0200
+
+mysql-dfsg (4.0.18-7) unstable; urgency=low
+
+ * Added CVE ids for the recent security fixes.
+ 4.0.18-4 is CAN-2004-0381 (mysqlbug) and
+ 4.0.18-6 is CAN-2004-0388 (mysql_multi)
+
+ -- Christian Hammers <ch@debian.org> Mon, 19 Apr 2004 18:32:03 +0200
+
+mysql-dfsg (4.0.18-6) unstable; urgency=medium
+
+ * SECURITY:
+ Fixed minor tempfile-run security problem in mysqld_multi.
+ Unprivileged users could create symlinks to files which were then
+ unknowingly overwritten by root when this script gets executed.
+ Upstream informed. Thanks to Martin Schulze for finding this.
+
+ -- Christian Hammers <ch@debian.org> Wed, 7 Apr 2004 01:28:22 +0200
+
+mysql-dfsg (4.0.18-5) unstable; urgency=low
+
+ * Little improvements in debian scripts for last upload.
+ * Added check to logrotate script for the case that a mysql
+ server is running but not accessible with the username and
+ password from /etc/mysql/debian.conf (thanks to Jeffrey W. Baker).
+ Closes: #239421
+
+ -- Christian Hammers <ch@debian.org> Sun, 4 Apr 2004 15:27:40 +0200
+
+mysql-dfsg (4.0.18-4) unstable; urgency=medium
+
+ * SECURITY:
+ Applied fix for an improbable tempfile-symlink security problem in
+ mysqlbug reported by Shaun Colley on bugtraq on 2004-03-24.
+ * Updated French debconf translation (thanks to Christian Perrier).
+ Closes: #236878
+ * Updated Portuguese debconf translation (thanks to Nuno Senica).
+ Closes: #239168
+ * Updated German debconf translation (thanks to Alwin Meschede).
+ Closes: #241749
+ * Improved debconf template regarding fix_privileges_tables (thanks
+ to Matt Zimmermann for suggestions). Closes: #219400
+ * Improved README.Debian regarding to password settings (thanks to
+ Yann Dirson). Closes: #241328
+
+ -- Christian Hammers <ch@debian.org> Sat, 3 Apr 2004 19:52:15 +0200
+
+mysql-dfsg (4.0.18-3) unstable; urgency=medium
+
+ * Added Build-Depend to po-debconf to let it build everywhere.
+
+ -- Christian Hammers <ch@debian.org> Wed, 31 Mar 2004 23:43:33 +0200
+
+mysql-dfsg (4.0.18-2) unstable; urgency=low
+
+ * Added a "2>/dev/null" to a "which" command as there are two
+ "which" versions in Debian of which one needs it. Closes: #235363
+
+ -- Christian Hammers <ch@debian.org> Tue, 2 Mar 2004 23:31:28 +0100
+
+mysql-dfsg (4.0.18-1) unstable; urgency=low
+
+ * New upstream version.
+ * Should now compile and run on ia64 (thanks to Thorsten Werner and
+ David Mosberger-Tang). Closes: #226863, #228834
+ * Converted init scripts to invoke-rc.d (thanks to Erich Schubert).
+ Closes: #232118
+ * The second-to-last upload changed the logfile location. Closes: #182655
+ * Updated Brazilian translation (thanks to Andre Luis Lopes). Closes:
+ #219847
+
+ -- Christian Hammers <ch@debian.org> Tue, 17 Feb 2004 23:44:58 +0100
+
+mysql-dfsg (4.0.17-2) unstable; urgency=low
+
+ * Improved manpage for mysqldumpslow.1 (thanks to Anthony DeRobertis).
+ Closes: #231039
+ * Improved stopping of crashed daemons in init script (thanks to
+ Matthias Urlichs). Closes: #230327
+
+ -- Christian Hammers <ch@debian.org> Mon, 9 Feb 2004 21:54:29 +0100
+
+mysql-dfsg (4.0.17-1) unstable; urgency=low
+
+ * Made logging into /var/log/mysql/ the default. Closes: #225206
+
+ * New upstream version. Closes: #225028
+ * Turned on a 25MB query cache by default (thanks to Cyril Bouthors).
+ Closes: #226789
+ * Updated Russian translation (thanks to Ilgiz Kalmetev). Closes: #219263
+ * Upstream fixes the problem that AND was not commutative (thanks to
+ Iain D Broadfoot for mentioning). Closes: #227927
+ * Fixed minor typo in my.cnf comments (thanks to James Renken).
+ Closes: #221496
+ * Better documented regex. Closes: #214952
+ * Fixed minor Germanism in debconf template (thanks to Marc Haber).
+ Closes: #224148
+ * Added explaining comment to my.cnf regarding quoted passwords
+ (Thanks to Patrick von der Hagen). Closes: #224906
+ * Changed "find -exec" to "find -print0 | xargs -0" in preinst to
+ speed it up. Thanks to Cyril Bouthors. Closes: #220229
+
+ -- Christian Hammers <ch@debian.org> Sun, 18 Jan 2004 16:16:25 +0100
+
+mysql-dfsg (4.0.16-2) unstable; urgency=low
+
+ * Tried to repair undefined weak symbols by adding a little Makefile
+ patch. Closes: #215973
+
+ -- Christian Hammers <ch@debian.org> Mon, 27 Oct 2003 22:52:10 +0100
+
+mysql-dfsg (4.0.16-1) unstable; urgency=low
+
+ * New upstream release.
+ (Mostly little memory problems and other bugfixes it seems)
+ * Replaced "." by ":" in chown calls to comply with the env setting
+ "_POSIX2_VERSION=2000112" (thanks to Robert Luberda). Closes: #217399
+ * Adjusted syntax in my.cnf to 4.x standard (thanks to Guillaume Plessis).
+ Closes: #217273
+ * Improved README.Debian password instructions (thanks to Levi Waldron).
+ Closes: #215046
+ * Improved NIS warning debconf-template (thanks to Jeff Breidenbach).
+ Closes: #215791
+ * Explicitly added libssl-dev to the libmysqlclient-dev package as it
+ is needed for mysql_config and the libmysqlclient package only depends
+ on libssl which has no unnumbered .so version (thanks to Simon Peter
+ and Davor Ocelic). Closes: #214436, #216162
+ * Added "-lwrap" to "mysql_config --libmysqld-libs" and filed it as
+ upstream bug #1650 (thanks to Noah Levitt). Closes: #214636
+
+ -- Christian Hammers <ch@debian.org> Sat, 25 Oct 2003 01:09:27 +0200
+
+mysql-dfsg (4.0.15a-1) unstable; urgency=low
+
+ * Same package as 4.0.15-2 but I could not convince the Debian
+ installer to move the packages out of incoming.
+
+ -- Christian Hammers <ch@debian.org> Tue, 7 Oct 2003 15:10:26 +0200
+
+mysql-dfsg (4.0.15-2) unstable; urgency=low
+
+ * Updated package description (thanks to Adrian Bunk). Closes: #210988
+ * Fixed small typos in manpages (thanks to Nicolas Francois).
+ Closes: #211983
+ * More updates to package description (thanks to Matthias Lutz/ddtp).
+ Closes: #213456
+ * Updated standards to 3.6.1.
+ * Closes "new 4.0.15 available" bug. Closes: #213349
+ * Updated README.Debian with notes regarding the MySQL manual section
+ "2.4 Post-installation Setup and Testing" (thanks to Daniel B.).
+ Closes: #210841
+
+ -- Christian Hammers <ch@debian.org> Fri, 3 Oct 2003 15:59:39 +0200
+
+mysql-dfsg (4.0.15-1) unstable; urgency=high
+
+ * SECURITY:
+ Users who are able to use the "ALTER TABLE" command on the "mysql"
+ database may be able to exploit this vulnerability to gain a shell with
+ the privileges of the mysql server (usually running as the 'mysql' user).
+ Closes: #210403
+ * Fixes small description typos (thanks to Oscar Jarkvik).
+ * Updated Brazilian Portuguese debconf translation (thanks to Andre Luis
+ Lopes). Closes: #208030
+ * Replaced deprecated '.' by ':' in chown (thanks to Matt Zimmerman).
+ * Fixed manpage typo (thanks to Marc Lehmann). Closes: #207090
+
+ -- Christian Hammers <ch@debian.org> Fri, 3 Oct 2003 15:59:35 +0200
+
+mysql-dfsg (4.0.14-1) unstable; urgency=low
+
+ * New upstream version.
+
+ -- Christian Hammers <ch@debian.org> Sun, 24 Aug 2003 16:40:36 +0200
+
+mysql-dfsg (4.0.13-3) unstable; urgency=low
+
+ * Now start mysqld by default unless you choose not to when configuring
+ with debconf priority low, so packages depending on the server can
+ access it when installing. Thanks to Matt Zimmermann (Closes: #200277)
+ * Made mysql-server de-installable if the config and database files were
+ removed by hand before. Thanks to Ard van Breemen (Closes: #200304)
+
+ -- Christian Hammers <ch@debian.org> Tue, 8 Jul 2003 22:30:40 +0200
+
+mysql-dfsg (4.0.13-2) unstable; urgency=low
+
+ * Added "nice" option for mysqld_safe to give mysqld a different priority.
+ Submitted to upstream as MySQL Bug #627. Closes: #192087
+ * Fixed possible unbound variable in init script. Closes: #194621
+ * Fixed french debconf translation (thx Christian Perrier) Closes: #194739
+ * Get rid of automake1.5 (for Eric Dorland).
+
+ -- Christian Hammers <ch@debian.org> Wed, 11 Jun 2003 18:58:32 +0200
+
+mysql-dfsg (4.0.13-1) unstable; urgency=medium
+
+ * New upstream version.
+ !!! Fixes a very bad natural join bug which justifies the urgency=medium.
+ !!! http://bugs.mysql.com/bug.php?id=291
+ * Fixed mysql_fix_privileges manpage (Frederic Briere) Closes: #191776
+  * preinst: "which" is more chatty as a normal executable than as a
+    builtin. (Thanks to David B Harris). Closes: #188659
+
+ -- Christian Hammers <ch@debian.org> Tue, 6 May 2003 22:03:45 +0200
+
+mysql-dfsg (4.0.12-3) unstable; urgency=medium
+
+  * Reincluded the new way of creating the debian-sys-maint user from
+    an old release in experimental. Now works again with old
+ and new privilege table format. (Thanks to Vincent Danjean
+ for spotting the problem) Closes: #188201
+ * Reincluded hurd build dependency fix from 3.23 branch.
+ (Thanks to Robert Millan). Closes: #185929
+ * Fixed soname in libmysqlclient-dev. Closes: #188160
+ * Remove /var/log/mysql/ when purging the package. Closes: #188064
+ * Removed /usr/share/doc/mysql/ from mysql-server. Closes: #188066
+ * Let group "adm" be able to read logfiles. Closes: #188067
+ * Do not call usermod on every upgrade. Closes: #188248
+ (Thanks to Philippe Troin for the last three)
+ * Fixed mysql-server.preinst so that it works on shells where
+ which is a builtin, too. (Thanks to Erich Schubert) Closes: #181525
+
+ -- Christian Hammers <ch@debian.org> Fri, 11 Apr 2003 11:32:45 +0200
+
+mysql-dfsg (4.0.12-2) unstable; urgency=low
+
+ *
+ * NEW MAJOR UPSTREAM RELEASE:
+ *
+ MySQL 4 has finally been declared as 'stable'. Hurray! Read changelogs.
+ Thanks to all testers, esp. Jose Luis Tallon, of the versions
+ that were in the "experimental" section before.
+ * Modified postinst script to run mysql_fix_privileges on every update.
+ IMPORTANT: Please report if this breaks anything, it is not supposed to.
+ * Wrote a SSL-MINI-HOWTO.txt!
+  * Added zlib1g-dev to libmysqlclient12-dev. Closes: #186656
+ * Changed section of libmysqlclient12-dev to libdevel.
+  * Added even more self-written manpages.
+ * Fixed typos.
+
+ -- Christian Hammers <ch@debian.org> Sun, 6 Apr 2003 13:47:32 +0200
+
+mysql-dfsg (4.0.10.gamma-1) experimental; urgency=low
+
+ * New upstream version.
+ * They merged some of my patches from debian/patches. Whoa!
+ * This release should fix the error-logfile problem where mysqld
+ keeps the error.log open while logrotate removes it.
+
+ -- Christian Hammers <ch@debian.org> Wed, 12 Feb 2003 22:39:48 +0100
+
+mysql-dfsg (4.0.9.gamma-1) experimental; urgency=low
+
+ * New upstream version.
+ * Updated the GNU autoconf files to make building on MIPS work.
+ See bug #176829.
+
+ -- Christian Hammers <ch@debian.org> Wed, 29 Jan 2003 22:07:44 +0100
+
+mysql-dfsg (4.0.8.gamma-1) experimental; urgency=low
+
+ * New upstream release.
+ * Improved logging of init script. Closes: #174790
+ * We have now libmysqlclient.so.12 instead of .11.
+
+ -- Christian Hammers <ch@debian.org> Thu, 9 Jan 2003 20:14:11 +0100
+
+mysql-dfsg (4.0.7.gamma-1) experimental; urgency=high
+
+  * SECURITY: This version fixes an upstream security issue that is only
+    present in the 4.x branch, which is currently only in the
+    experimental distribution and therefore will not get a DSA.
+ * New upstream release.
+
+ -- Christian Hammers <ch@debian.org> Sat, 28 Dec 2002 15:51:39 +0100
+
+mysql-dfsg (4.0.6.gamma-2) experimental; urgency=low
+
+ * Added --system to addgroup. Closes: #173866
+
+ -- Christian Hammers <ch@debian.org> Sat, 21 Dec 2002 15:28:26 +0100
+
+mysql-dfsg (4.0.6.gamma-1) experimental; urgency=low
+
+ * New upstream version. Now Gamma!
+  * There are no longer any changes to the .orig.tar.gz necessary to make diff
+    happy. docs/ still has to be deleted, though, as it is non-free.
+ * Incorporated patches from unstable.
+ * Added mysqlmanager and a couple of other new scripts.
+ * Enabled libmysqld embedded server library.
+ * Enabled SSL and Virtual-IO support.
+    (The CORBA-based MySQL-FS does not seem to exist.)
+
+ -- Christian Hammers <ch@debian.org> Fri, 20 Dec 2002 22:30:51 +0100
+
+mysql-dfsg (4.0.5a.beta-3) experimental; urgency=low
+
+ * Modified postinst to work with old and new mysql.user table format
+ and fixed spelling typo in postinst. Thanks to Roger Aich.
+ * Updated config.{guess,sub} to make the mipsel porters happy.
+ Thanks to Ryan Murray. Closes: #173553
+
+ -- Christian Hammers <ch@debian.org> Wed, 18 Dec 2002 15:56:34 +0100
+
+mysql-dfsg (4.0.5a.beta-2) experimental; urgency=low
+
+  * Upstream removed option "--skip-gemini". So did I. Closes: #173142
+
+ -- Christian Hammers <ch@debian.org> Tue, 17 Dec 2002 10:35:49 +0100
+
+mysql-dfsg (4.0.5a.beta-1) experimental; urgency=low
+
+ * First 4.x experimental package due to continuous user requests :-)
+ Please test and report!
+ * upstream: safe_mysqld has been renamed to mysqld_safe
+ * upstream: new library soname version libmysqlclient.so.11
+ * Renamed libmysqlclientXX-dev to libmysqlclient-dev as I don't plan to
+ support more than one development environment and this makes the
+ dependencies easier.
+  * FIXME: Skipped parts of the debian/patches/alpha patch as global.h
+    does not exist.
+  * FIXME: How to get rid of this? Old ltconfig patch already applied.
+ "lintian: binary-or-shlib-defines-rpath ./usr/bin/mysql /usr/lib/mysql"
+
+ -- Christian Hammers <ch@debian.org> Sun, 1 Dec 2002 18:32:32 +0100
+
+mysql-dfsg (3.23.53-4) unstable; urgency=medium
+
+ * Fixed errno.h problem. Closes: #168533, #168535
+
+ -- Christian Hammers <ch@debian.org> Sun, 10 Nov 2002 18:32:08 +0100
+
+mysql-dfsg (3.23.53-3) unstable; urgency=medium
+
+ * Changed automake build-dep to unversioned automake1.4. Closes: #166391
+ * Fixed description. Closes: #167270
+ (Thanks to Soren Boll Overgaard)
+
+ -- Christian Hammers <ch@debian.org> Tue, 5 Nov 2002 01:25:01 +0100
+
+mysql-dfsg (3.23.53-2) unstable; urgency=low
+
+ * Reverted user creation in init scripts. Closes: #166432
+ (Thanks to Birzan George Cristian)
+
+ -- Christian Hammers <ch@debian.org> Thu, 31 Oct 2002 15:36:25 +0100
+
+mysql-dfsg (3.23.53-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- Christian Hammers <ch@debian.org> Thu, 24 Oct 2002 23:04:16 +0200
+
+mysql-dfsg (3.23.52-3) unstable; urgency=low
+
+ * Substituted the first-install 'debian-sys-maint' user creation by
+ something ANSI SQL compliant. Closes: #163497
+ (Thanks to Karl Hammar)
+  * Tightened the dependency on debhelper (>= 4.0.12) to be sure that
+ debconf-utils gets installed, too, as I use dh_installdebconf.
+ * Fixed upstream manpage bug in mysqldump.1. Closes: #159779
+ (Thanks to Colin Watson)
+ * Added comment about MIN_WORD_LEN to mysql-server.README.Debian
+ (Thanks to Philipp Dreimann)
+ * Added a dependency for zlib1g-dev to libmysqlclient10-dev.
+ (Thanks to Jordi Mallach)
+
+ -- Christian Hammers <ch@debian.org> Sun, 15 Sep 2002 17:14:44 +0200
+
+mysql-dfsg (3.23.52-2) unstable; urgency=low
+
+ * Fixed typo in preinst scripts.
+ * Removed bashism in init script.
+ * Fixed ambiguous debconf example. Closes: #158884
+
+ -- Christian Hammers <ch@debian.org> Fri, 30 Aug 2002 00:51:29 +0200
+
+mysql-dfsg (3.23.52-1) unstable; urgency=low
+
+ * New upstream version. Closes: #157731
+  * Clarified the meaning of the debian-sys-maint special user in the
+ README.Debian file. Closes: #153702
+  * Wrote some words regarding the skip-networking option in README.Debian.
+ Closes: #157038
+ * Added dependency to passwd.
+  * Fixed a typo and an unnecessary complication in is_mysql_alive().
+ * Added check for /etc/mysql/my.cnf in init script.
+
+ -- Christian Hammers <ch@debian.org> Tue, 27 Aug 2002 01:53:32 +0200
+
+mysql-dfsg (3.23.51-4) unstable; urgency=low
+
+ * Added a compressed "nm mysqld" output to allow people to trace
+ core dumps with /usr/bin/resolve_stack_dump as suggested in the
+ INSTALL-SOURCE file. Thanks to atudor@labs.agilent.com for the hint.
+
+ -- Christian Hammers <ch@debian.org> Wed, 24 Jul 2002 20:44:55 +0200
+
+mysql-dfsg (3.23.51-3) unstable; urgency=low
+
+  * Corrected copyright file: the MySQL client library is licensed under
+    the LGPL-2, not the GPL. From version 4.x on it will actually be GPL,
+    which is why parts of http://www.mysql.com/ already say so. Closes: #153591
+ * Corrected german translation.
+ Thanks to Roland Rosenfeld <roland@spinnaker.de>. Closes: #151903
+
+ -- Christian Hammers <ch@debian.org> Thu, 11 Jul 2002 20:32:28 +0200
+
+mysql-dfsg (3.23.51-2) unstable; urgency=low
+
+ * Improved NIS tolerance in preinst script.
+
+ -- Christian Hammers <ch@debian.org> Sun, 7 Jul 2002 04:43:28 +0200
+
+mysql-dfsg (3.23.51-1) unstable; urgency=medium
+
+ * New upstream version.
+  * I applied a patch that fixes a binary incompatibility in
+    the shared library libmysqlclient.so.10 between 3.23.50 and
+    some earlier versions. Upstream has been contacted and asked
+ for clarification. Closes: #149952
+  * Added support for NIS, i.e. it shows a warning and fails if the
+    needed 'mysql' user does not exist, but works if it does.
+ Closes: #143282, #147869
+ * Substituted $0 in init scripts by something really weird so that
+ "./S20mysql restart" works now, too. (BTW: S20? install file-rc!!!)
+ Closes: #148658
+ * Now postinst works even if /etc/init.d/mysql is removed. Closes: #151021
+ * Decided to leave "set +x" in postinst but wrote comment. Closes: #151022
+
+ -- Christian Hammers <ch@debian.org> Sun, 7 Jul 2002 04:43:25 +0200
+
+mysql-dfsg (3.23.50-1) unstable; urgency=medium
+
+ * New upstream version.
+    Fixes a very annoying and important bug that lets all mysql programs,
+    including perl scripts etc., segfault when using the read_default_group()
+ function. 3.23.50 is currently a pre-release and expected to be released
+ next week. I plan to propose it for woody as soon as its stability has
+ been proven. The following bug reports are all regarding this issue.
+    Closes: #144960, #145322, #136798, #138143
+
+ -- Christian Hammers <ch@debian.org> Sat, 18 May 2002 21:14:01 +0200
+
+mysql-dfsg (3.23.49x-1) unstable; urgency=low
+
+  * I had to split the package to separate the manual as it is not GPL
+ like the rest of the software and docs but under a license that
+ e.g. forbids selling printed versions.
+ .
+    The upstream authors were contacted a while ago but did not want to
+ change the situation.
+ .
+ The names of the resulting packages have not changed as the manual
+    already was in a separate mysql-doc package due to its size.
+    The source package is now split from one "mysql" into
+ "mysql-dfsg" in main and "mysql-nonfree" in non-free.
+ * No code change!
+    The "x" at the end of the version number is just to be able to
+ upload a new source package. ("a" was already taken by upstream
+ for their binary upload correction)
+
+ -- Christian Hammers <ch@debian.org> Wed, 8 May 2002 02:01:41 +0200
+
+mysql (3.23.49-8) unstable; urgency=low
+
+ * Substituted $0 in init script to let e.g. "/etc# ./init.d/mysql restart"
+    work, too. Closes: #141555
+
+ -- Christian Hammers <ch@debian.org> Sun, 7 Apr 2002 15:00:44 +0200
+
+mysql (3.23.49-7) unstable; urgency=low
+
+ * The Makefiles are totally broken for the --enable-local-infile
+ option. I now patched libmysql/libmysql.c#mysql_init() manually.
+ Closes: #138347
+
+ -- Christian Hammers <ch@debian.org> Fri, 29 Mar 2002 23:55:15 +0100
+
+mysql (3.23.49-6) unstable; urgency=low
+
+ * Moved mysqlcheck from server to client package. Closes: #139799
+ * Added manpage for mysqlhotcopy. Regarding: #87097
+ * Added 'sharedscripts' directive to the logrotate script.
+ * Replaced grep by /usr/bin/getent to let the group/user checking work
+ on NIS/LDAP systems, too. Closes: #115677, #101529
+
+ -- Christian Hammers <ch@debian.org> Fri, 22 Mar 2002 22:40:51 +0100
+
+mysql (3.23.49-5) unstable; urgency=low
+
+ * Added skip-innodb to default my.cnf.
+  * Enabled --enable-local-infile; it seems to be a new option that
+    defaults to disabling a formerly enabled feature. Closes: #137115
+
+ -- Christian Hammers <ch@debian.org> Sat, 16 Mar 2002 00:29:10 +0100
+
+mysql (3.23.49-4) unstable; urgency=medium
+
+ * Recompiled against fixed libz.
+
+  * Enabled --enable-local-infile; it seems to be a new option that
+    defaults to disabling a formerly enabled feature. Closes: #137115
+ * Fixed README.compile_on_potato. Closes: #136529
+  * Now an ext3 .journal file in /var/lib/mysql does not prevent the
+    installation (happens when creating a journal on an already mounted
+ partition). Closes: #137146
+
+ -- Christian Hammers <ch@debian.org> Wed, 13 Mar 2002 13:34:24 +0100
+
+mysql (3.23.49-3) unstable; urgency=low
+
+ * Added Russian translation. Closes: #135846
+ * Fixed installation of .info documents. Closes: #135030
+
+ -- Christian Hammers <ch@debian.org> Wed, 27 Feb 2002 23:36:35 +0100
+
+mysql (3.23.49-2) unstable; urgency=low
+
+ * Updated french translation and split template files. Closes: #134754
+ * Fixed a small debian.cnf related bug in mysql-server.postinst.
+
+ -- Christian Hammers <ch@debian.org> Tue, 19 Feb 2002 23:13:58 +0100
+
+mysql (3.23.49-1) unstable; urgency=low
+
+ * New upstream release.
+ (Mainly InnoDB related fixes)
+ * Exported a $HOME variable in the scripts so that /root/.my.cnf
+ is not read anymore. This will avoid problems when admins put
+ only passwords but no usernames in this file. Closes: #132048
+ * New debian-sys-maint password algorithm (now ~96bit :-)) Closes: #133863
+  * Recreating the debian-sys-maint password on every install to help people
+    who accidentally delete user or password files...
+  * Added /var/log/mysql so that the user can put the binary logs in there as
+ mysql cannot write the .001 etc files itself in /var/log which is
+ owned by root.
+
+ -- Christian Hammers <ch@debian.org> Thu, 14 Feb 2002 22:17:45 +0100
+
+mysql (3.23.47-6) unstable; urgency=low
+
+ * Dropped a sentence about the new debian-sys-maint user in the
+ debconf note and updated the README.Debian. Related: #132048
+ * Added more french translation. Closes: #132390
+
+ -- Christian Hammers <ch@debian.org> Wed, 6 Feb 2002 09:41:29 +0100
+
+mysql (3.23.47-5) unstable; urgency=low
+
+ * Fixed grammar error in template. Closes: #132238
+ * Really fixed typo in logrotate script. Closes: #131711
+
+ -- Christian Hammers <ch@debian.org> Tue, 5 Feb 2002 14:20:08 +0100
+
+mysql (3.23.47-4) unstable; urgency=medium
+
+  * Fixes typo in postinst that made the init script fail. Closes: #131743
+ * Fixed bashism bug that failed on ash. Closes: #131697
+ * Fixed typo in logrotate script. Closes: #131711
+
+ -- Christian Hammers <ch@debian.org> Thu, 31 Jan 2002 23:58:46 +0100
+
+mysql (3.23.47-3) unstable; urgency=low
+
+ * Added new Debian specific mysql user called 'debian-sys-maint' which
+ is used for pinging the server status, flushing the logs or shutting
+ down the server in maintenance scripts. The credentials of this user
+ are stored in the UID0-only readable file /etc/mysql/debian.cnf.
+ Closes: #129887, #130326, #99274
+ * Fixed unintended server startup at boottime. Closes: #122676, #130105
+ * New upstream fixes command line parsing bug: Closes: #128473
+ * Fixed manpage headers to let apropos work: Closes: #119122
+ * Added "status" options for /etc/init.d/mysql. Closes: #129020
+
+ -- Christian Hammers <ch@debian.org> Sun, 27 Jan 2002 19:46:11 +0100
+
+mysql (3.23.47-2) unstable; urgency=low
+
+ * Enhanced init scripts by using mysqladmin instead of kill $pid.
+ Thanks to Aaron Brick.
+
+ -- Christian Hammers <ch@debian.org> Fri, 18 Jan 2002 01:42:23 +0100
+
+mysql (3.23.47-1) unstable; urgency=low
+
+ * New upstream release.
+ * Updated brazilian translation of debconf descriptions. Closes: #123332
+
+ -- Christian Hammers <ch@debian.org> Sun, 6 Jan 2002 21:11:17 +0100
+
+mysql (3.23.46-3) unstable; urgency=low
+
+  * Fixed bug in postinst where a script was accidentally called with
+    "bash -c <script> -IN_RPM", preventing the first argument from taking
+    effect and then leading to failures on hosts with unresolvable hostnames.
+ Closes: #126147
+ * Small changes and comments in postinst.
+
+ -- Christian Hammers <ch@debian.org> Sat, 22 Dec 2001 14:03:02 +0100
+
+mysql (3.23.46-2) unstable; urgency=low
+
+ * Start/stop behaviour now configurable via debconf. Closes: #112174
+
+ -- Christian Hammers <ch@debian.org> Sun, 9 Dec 2001 21:38:54 +0100
+
+mysql (3.23.46-1) unstable; urgency=low
+
+ * New upstream release.
+    Only a few fixes, mainly innodb related.
+
+ -- Christian Hammers <ch@debian.org> Sun, 2 Dec 2001 03:08:48 +0100
+
+mysql (3.23.45-1) unstable; urgency=low
+
+ * New upstream version.
+    Only a few fixes, mainly innodb related.
+ * Added debconf note regarding the skip-networking option.
+
+ -- Christian Hammers <ch@debian.org> Sun, 25 Nov 2001 16:50:37 +0100
+
+mysql (3.23.44-2) unstable; urgency=low
+
+ * Finally removed debconf toggled "skip-networking" line add/remove
+ code for /etc/mysql/my.cnf. I don't like editing a file that's tagged
+    as a configuration file.
+ I disabled networking by default for security reasons. Better ideas?
+
+ -- Christian Hammers <ch@debian.org> Fri, 16 Nov 2001 02:11:02 +0100
+
+mysql (3.23.44-1) unstable; urgency=low
+
+ * New upstream release.
+ - fixes replication bug (core dump)
+  * Improved the English of the description :) Thanks to D. Welton.
+
+ -- Christian Hammers <ch@debian.org> Sun, 11 Nov 2001 15:44:07 +0100
+
+mysql (3.23.43-4) unstable; urgency=low
+
+  * Disabled static linking.
+
+ -- Christian Hammers <ch@debian.org> Sat, 10 Nov 2001 03:15:56 +0100
+
+mysql (3.23.43-3) unstable; urgency=low
+
+ * Changed compiler settings after one user reported instabilities.
+ See #116631 for more information.
+
+ -- Christian Hammers <ch@debian.org> Tue, 30 Oct 2001 21:39:17 +0100
+
+mysql (3.23.43-2) unstable; urgency=low
+
+ * Patched sparc mutexes again. Closes: #113430
+
+ -- Christian Hammers <ch@debian.org> Sun, 7 Oct 2001 15:09:00 +0200
+
+mysql (3.23.43-1) unstable; urgency=low
+
+ * New upstream version.
+ - Fixed some unlikely(sic!) bugs and core dumps.
+ - Fixed a bug with BDB tables and UNIQUE columns that are NULL.
+ - [more minor bugs were fixed; see changelog]
+ * Adjusted build depends on libwrap0 for IA-64. Closes: #114582
+ * Added the mysqlcheck binary. Closes: #114490
+ * Fixed rules for arm architecture. Closes: #88186
+ * Renamed mysql_print_defaults to the original name my_print_defaults.
+    It isn't as descriptive but otherwise I'd have to patch too much. Closes: #114492
+
+ -- Christian Hammers <ch@debian.org> Fri, 5 Oct 2001 22:24:40 +0200
+
+mysql (3.23.42-2) unstable; urgency=low
+
+ * Applied patch for m68k compile. Closes: #112904
+
+ -- Christian Hammers <ch@debian.org> Sun, 23 Sep 2001 21:32:57 +0200
+
+mysql (3.23.42-1) unstable; urgency=low
+
+  * New upstream release.
+ Fixes critical bug with InnoDB and large BLOBs.
+
+ -- Christian Hammers <ch@debian.org> Tue, 18 Sep 2001 22:25:47 +0200
+
+mysql (3.23.41-2) unstable; urgency=low
+
+ * Fixed shlibs.local problem. Closes: #111573
+ * Replaced emacs by sensible-editor in mysqlbug.sh. Thanks Hans Ginzel.
+
+ -- Christian Hammers <ch@debian.org> Sun, 9 Sep 2001 17:16:42 +0200
+
+mysql (3.23.41-1) unstable; urgency=low
+
+ * New upstream release
+ * Fixed build problem on ia64. Closes: #110624
+
+ -- Christian Hammers <ch@debian.org> Tue, 14 Aug 2001 23:20:35 +0200
+
+mysql (3.23.40-1) unstable; urgency=low
+
+ * New upstream release
+
+ -- Christian Hammers <ch@debian.org> Sun, 5 Aug 2001 19:46:18 +0200
+
+mysql (3.23.39-5) unstable; urgency=low
+
+  * Added Brazilian Portuguese debconf template. Closes: #106934, #106752
+ * Tightened dependencies on debconf.
+ * Adjusted mysql.err permissions in logrotate script to 0600. Closes: #105672
+
+ -- Christian Hammers <ch@debian.org> Mon, 30 Jul 2001 00:10:12 +0200
+
+mysql (3.23.39-4.1) unstable; urgency=low
+
+ * Maintainer-requested NMU.
+ * Fixing thread mutexes on Sparc and Alpha
+ (closes: Bug#101783)
+ * Added --enable-assembler for sparc. This should
+ allow mysql on sparc to use assembler versions of
+ some string functions (read: should speed up a bit).
+
+ -- Christopher C. Chimelis <chris@debian.org> Fri, 13 Jul 2001 15:09:30 -0400
+
+mysql (3.23.39-4) unstable; urgency=low
+
+ * Porting fixes.
+
+ -- Christian Hammers <ch@debian.org> Mon, 9 Jul 2001 17:56:54 +0200
+
+mysql (3.23.39-3.1) unstable; urgency=low
+
+ * NMU (for porting)
+ * Update config.sub and config.guess for hppa, sh & s390.
+ * Add --with-client-ldflags=-lstdc++ to configure line. Closes: #100884
+
+ -- Matthew Wilcox <willy@debian.org> Sun, 8 Jul 2001 19:26:59 -0600
+
+mysql (3.23.39-3) unstable; urgency=low
+
+ * Disabled berkeley-db on sparc again. Mutexes aren't working again :-(
+
+ -- Christian Hammers <ch@debian.org> Sat, 7 Jul 2001 18:30:08 +0200
+
+mysql (3.23.39-2) unstable; urgency=low
+
+ * Bugfixed the m68k mutex patch. Thanks to Michael Fedrowitz. Closes: #103145
+ * Removed config.cache files in bdb/ and innobase/. Closes: #103143
+
+ -- Christian Hammers <ch@debian.org> Wed, 4 Jul 2001 22:06:58 +0200
+
+mysql (3.23.39-1) unstable; urgency=low
+
+ * New upstream release. Minor bugfixes only.
+
+ -- Christian Hammers <ch@debian.org> Thu, 14 Jun 2001 13:53:03 +0200
+
+mysql (3.23.38-4) unstable; urgency=low
+
+ * Added logcheck files. Closes: #99131
+    (I can't leave the usermod out since I don't know of an easy way to
+    retrieve "passwd" information in a shell script considering that
+ people use different storage methods like LDAP/NIS instead of passwd.)
+
+ -- Christian Hammers <ch@debian.org> Fri, 8 Jun 2001 21:04:25 +0200
+
+mysql (3.23.38-3) unstable; urgency=low
+
+  * Explicitly pointed to /root/.my.cnf to let "/etc/init.d/mysql stop"
+    work in sudo environments with $HOME != /root, too. Closes: #98324
+ * Removes empty /etc/mysql on purge. Closes: #98164
+
+ -- Christian Hammers <ch@debian.org> Tue, 22 May 2001 10:13:06 +0200
+
+mysql (3.23.38-2) unstable; urgency=low
+
+ * Added depends to libdbd-mysql-perl for mysql-server. Closes: #94306
+
+ -- Christian Hammers <ch@debian.org> Sat, 19 May 2001 19:43:26 +0200
+
+mysql (3.23.38-1) unstable; urgency=low
+
+ * New upstream release.
+ * Added Build-Depends to procps. Closes: #96768
+
+ -- Christian Hammers <ch@debian.org> Sun, 13 May 2001 17:30:15 +0200
+
+mysql (3.23.37-5) unstable; urgency=low
+
+ * Applied mutex patch for bdb support on m68k.
+ Thanks to Michael Fedrowitz for the patch.
+
+ -- Christian Hammers <ch@debian.org> Mon, 7 May 2001 12:30:40 +0200
+
+mysql (3.23.37-4) unstable; urgency=low
+
+ * Enable bdb support for m68k architecture.
+
+ -- Christian Hammers <ch@debian.org> Sat, 5 May 2001 16:47:36 +0200
+
+mysql (3.23.37-3) unstable; urgency=low
+
+ * Added thread-safe client library. Thanks to Shane Wegner. Closes: #95441
+
+ -- Christian Hammers <ch@debian.org> Sat, 28 Apr 2001 09:45:00 -0400
+
+mysql (3.23.37-2) unstable; urgency=low
+
+ * Added sparc to the list of BDB supporting architectures after some
+    tests on vore.debian.org and mails with Ben Collins.
+
+ -- Christian Hammers <ch@debian.org> Fri, 27 Apr 2001 09:30:09 -0400
+
+mysql (3.23.37-1) unstable; urgency=low
+
+ * New upstream version.
+ * Added gemini table support.
+ * Does anybody know how to enable SSL?
+ * Fixed ARM compilation problem. Closes: #88186
+
+ -- Christian Hammers <ch@debian.org> Sat, 21 Apr 2001 11:48:46 -0400
+
+mysql (3.23.36-2) unstable; urgency=low
+
+ * Added patch by Christopher C. Chimelis <chris@debian.org> to make
+ Berkeley db3 work again on Alpha architecture. Closes: #92787
+
+ -- Christian Hammers <ch@debian.org> Tue, 3 Apr 2001 23:41:46 +0200
+
+mysql (3.23.36-1) unstable; urgency=high
+
+ * New upstream version.
+ * SECURITY FIX: One could place database tables outside the database
+ directory by using '..' in one of the mysql helper programs where the
+ table name was not checked correctly. This could lead to root compromise
+    if the server was running as root; otherwise one could at least do bad
+    things as the mysql user.
+ * upstream: Fixed bug when thread creation failed.
+ * upstream: Fixed problem in Innobase with non-latin1 charsets
+  * upstream: Fixed a core-dump bug when using a very complex query with DISTINCT
+  * upstream: many other so-called minor bugs...
+  * Fixes a bug in the init script. Closes: #90257
+    (this report was against some older problem that has also been fixed in .33)
+
+ -- Christian Hammers <ch@debian.org> Fri, 30 Mar 2001 02:55:12 +0200
+
+mysql (3.23.35-1) unstable; urgency=medium
+
+  * New upstream release.
+  * Fixes a problem in the ORDER BY clause. People using 3.23.34 should upgrade!
+  * Includes innobase support.
+    (Hope this is not such a catastrophe as berkeley db was...)
+
+ -- Christian Hammers <ch@debian.org> Fri, 16 Mar 2001 23:30:30 +0100
+
+mysql (3.23.33-3) unstable; urgency=low
+
+ * Forgot #!/bin/sh at top of mysql-doc.postinst. Closes: #89801
+
+ -- Christian Hammers <ch@vore.debian.org> Thu, 15 Mar 2001 20:38:35 -0500
+
+mysql (3.23.33-2) unstable; urgency=low
+
+ * Added some missing scripts and manpages. Closes: #84068
+ * Added dependency to perl-5.6. Closes: #81942
+  * Added french templates a while ago. Closes: #83790
+ * Added patch to get db3 working on Alpha. Closes: #86033
+ Thanks to Christopher C. Chimelis <chris@debian.org>. The patch
+ itself is included as debian/patch.alpha, too.
+
+ -- Christian Hammers <ch@debian.org> Sun, 18 Feb 2001 06:40:40 +0100
+
+mysql (3.23.33-1) unstable; urgency=high
+
+  * Fixes two security bugs that allow crashing the server and maybe
+ gaining the UID of the process that is linked against libmysqlclient!
+
+ -- Christian Hammers <ch@debian.org> Tue, 13 Feb 2001 23:01:18 +0100
+
+mysql (3.23.32-1) unstable; urgency=low
+
+  * New upstream release.
+ (just minor fixes)
+ * Added french and german debconf templates.
+
+ -- Christian Hammers <ch@debian.org> Sun, 4 Feb 2001 17:27:07 +0100
+
+mysql (3.23.31-1) unstable; urgency=high
+
+ * New upstream release.
+  * Fixes a security bug that was announced on the BUGTRAQ mailing list
+    (disappointingly not by mysql.com!). It allows a buffer overflow
+    and therefore access to the mysql UID and all databases when already
+ having a valid account. Closes: #82881
+
+ -- Christian Hammers <ch@debian.org> Sat, 20 Jan 2001 11:14:36 +0100
+
+mysql (3.23.30-2) unstable; urgency=low
+
+ * Recompiled with new dpkg-dev.
+
+ -- Christian Hammers <ch@debian.org> Sun, 14 Jan 2001 22:20:55 +0100
+
+mysql (3.23.30-1) unstable; urgency=low
+
+ * New upstream release.
+
+ -- Christian Hammers <ch@debian.org> Sun, 7 Jan 2001 22:10:18 +0100
+
+mysql (3.23.28-10) testing unstable; urgency=low
+
+ * I must upload to "testing" to get it into woody, right?!
+
+ -- Christian Hammers <ch@debian.org> Fri, 29 Dec 2000 14:43:57 +0100
+
+mysql (3.23.28-9) unstable; urgency=low
+
+ * Made it a replacement for libmysqlclient9.
+
+ -- Christian Hammers <ch@westend.com> Mon, 25 Dec 2000 19:15:04 +0100
+
+mysql (3.23.28-8) unstable; urgency=low
+
+ * Applied patch from a user to get the skip-networking option working!
+    Approved by a mysql employee, but please test anyway.
+ This finally: Closes: #79672, #78634, #79660, #79658
+
+ -- Christian Hammers <ch@debian.org> Sat, 16 Dec 2000 14:01:36 +0100
+
+mysql (3.23.28-6) unstable; urgency=medium
+
+ * Fixed error in postinst. Closes: #79392, #79400, #79451, #79550
+ * Added .info files again on user request. Closes: #78988, #75737
+
+ -- Christian Hammers <ch@debian.org> Wed, 13 Dec 2000 21:18:24 +0100
+
+mysql (3.23.28-5) unstable; urgency=low
+
+ * Fixed a stupid bug in mysql-server.postinst regarding the
+    configuration of skip-networking. Closes: #78639, #78634
+ * Used patched bdb which hopefully enables mutexes on Alpha. Closes: #78197
+ * Added dependency to adduser. Closes: #76798
+
+ -- Christian Hammers <ch@debian.org> Sun, 10 Dec 2000 16:55:48 +0100
+
+mysql (3.23.28-4) unstable; urgency=low
+
+ [never uploaded]
+ * Fixed a stupid bug in mysql-server.postinst regarding the
+    configuration of skip-networking. Closes: #78639, #78634
+ * Used patched bdb which hopefully enables mutexes on Alpha. Closes: #78197
+
+ -- Christian Hammers <ch@debian.org> Sun, 3 Dec 2000 17:49:44 +0100
+
+mysql (3.23.28-3) unstable; urgency=low
+
+ * This time really fixed m68k build error. Closes: #78235
+
+ -- Christian Hammers <ch@debian.org> Sun, 3 Dec 2000 15:02:55 +0100
+
+mysql (3.23.28-2) unstable; urgency=low
+
+  * Adjusted rules file to make it buildable on m68k. Closes: #78235
+
+ -- Christian Hammers <ch@debian.org> Fri, 1 Dec 2000 20:07:26 +0100
+
+mysql (3.23.28-1) unstable; urgency=low
+
+  * New upstream version. Now gamma!
+ * Changed umask of mysql.log making it o-rw
+ * Disabled listening on network reachable TCP ports by default due to
+ security considerations.
+
+ -- Christian Hammers <ch@debian.org> Thu, 23 Nov 2000 20:12:50 +0100
+
+mysql (3.23.27-1) unstable; urgency=low
+
+ * New upstream version.
+ * Closes: #75711
+
+ -- Christian Hammers <ch@debian.org> Sun, 29 Oct 2000 14:29:51 +0100
+
+mysql (3.23.25-4) unstable; urgency=low
+
+  * Recompiled to get rid of the dependency on zlib1 (libc5).
+ Closes: #74952, #74939
+
+ -- Christian Hammers <ch@debian.org> Tue, 17 Oct 2000 14:34:52 +0200
+
+mysql (3.23.25-3.1) unstable; urgency=low
+
+ * Maintainer-approved NMU.
+ * Includes patch to fix and enable db3 support on Alpha.
+ * Enable support for thread mutexes in db3 on sparc
+ (it works after all, according to Ben Collins)
+ * Removed atomic_ functions for Alpha since they are no
+ longer supported in the current glibc in woody.
+ * Cleaned up rules file a bit.
+
+ -- Christopher C. Chimelis <chris@debian.org> Sat, 14 Oct 2000 04:22:02 -0400
+
+mysql (3.23.25-3) unstable; urgency=low
+
+  * Upstream decided not to include my_config.h and my_dir.h in the installed
+    header files. As these files contain at least informative material
+    and, more importantly, are checked by several autoconf scripts, I
+    included them by hand again.
+  * Made building of berkeley db conditional on the architecture until
+    I get a response on whether it works on sparc/alpha now.
+
+ -- Christian Hammers <ch@debian.org> Wed, 11 Oct 2000 23:58:38 +0200
+
+mysql (3.23.25-2) unstable; urgency=medium
+
+  * Last build went terribly wrong... Here's the changelog again:
+ * New upstream release.
+ * Shared library version was raised from 9 to 10.
+    Maintainers of packages using libmysqlclient9 must recompile!
+
+ -- Christian Hammers <ch@debian.org> Wed, 11 Oct 2000 01:16:34 +0200
+
+mysql (3.23.25-1) unstable; urgency=low
+
+ * New upstream release.
+ * Shared library version was raised from 9 to 10.
+    Maintainers of packages using libmysqlclient9 must recompile!
+
+ -- Christian Hammers <ch@debian.org> Sat, 7 Oct 2000 18:21:51 +0200
+
+mysql (3.23.24-2) unstable; urgency=low
+
+ * Applied upstream patch regarding quoting of mysqldump.
+ * Updated to db-3.1.17-patched (from www.mysql.com)
+
+ -- Christian Hammers <ch@debian.org> Fri, 15 Sep 2000 18:58:14 +0200
+
+mysql (3.23.24-1) unstable; urgency=medium
+
+ * New upstream version with some important fixes.
+ * upstream: Last version corrupted CHAR/VARCHAR/BLOB columns with
+    characters above ASCII 128! Check and repair all these tables.
+ * upstream: fixed small memory leak
+ * upstream: fixed problem with BDB tables and reading on unique
+ (not primary) key.
+ * Disabled BDB tables on all architectures except i386 due to many
+ bug reports (see #71206). -> HELP APPRECIATED <-
+
+ -- Christian Hammers <ch@debian.org> Tue, 12 Sep 2000 06:18:54 +0200
+
+mysql (3.23.23-2) unstable; urgency=low
+
+  * Strange... "nohup nice" gives different results and therefore made
+    safe_mysqld crash when starting up. It seems to be
+    kernel dependent. Now fixed by another conditional. This
+ more or less Closes: #71057
+  * This bug was reported (accidentally) in the following identical reports:
+ Closes: #71253, #71254, #71257, #71258, #71259, #71262, #71266, #71267
+ Closes: #71268, #71271, #71275, #71277, #71278, #71283, #71291
+
+ -- Christian Hammers <ch@debian.org> Sat, 9 Sep 2000 20:13:50 +0200
+
+mysql (3.23.23-1) unstable; urgency=low
+
+ * New upstream version. Feature freeze!
+ * Fixed source build problem. Closes: #70707
+
+ -- Christian Hammers <ch@debian.org> Thu, 31 Aug 2000 10:03:35 +0200
+
+mysql (3.23.22b-1) unstable; urgency=low
+
+ * Reorganised docs. Now we have several small html files instead of
+    one with almost 2M. Closes: #70431
+  * Removed pdf, ps and html from the source package, shrinking it by about 3M
+ (therefore the .orig.tar.gz is called 3.23.22b!)
+ * -> Last upload failed due to problems at the FTP site so here the
+ -> changelog again:
+ * Fixes memory leak, commit/rollback, reserved word "MASTER" ...
+ * Added Berkeley DB3 source code to the Debian diff to be able to
+ compile with bdb transaction support! (Great feature!!!)
+ * Upstream correction of error message. Closes: #68939
+ * Upstream correction of reserved word "source".
+
+ -- Christian Hammers <ch@debian.org> Fri, 25 Aug 2000 19:21:24 +0200
+
+mysql (3.23.22-1) unstable; urgency=low
+
+ * New upstream version.
+ * Fixes memory leak, commit/rollback, reserved word "MASTER" ...
+ * Added Berkeley DB3 source code to the Debian diff to be able to
+ compile with bdb transaction support! (Great feature!!!)
+ * Upstream correction of error message. Closes: #68939
+ * Upstream correction of reserved word "source".
+
+ -- Christian Hammers <ch@debian.org> Sun, 20 Aug 2000 09:05:48 +0200
+
+mysql (3.23.21-4) unstable; urgency=low
+
+ * Added libmysqlclient9.shlibs and shlibs.local file. Closes: #68669
+
+ -- Christian Hammers <ch@debian.org> Wed, 9 Aug 2000 14:22:49 +0200
+
+mysql (3.23.21-3) unstable; urgency=low
+
+  * Let "/etc/init.d/mysql restart" wait until the pid has been
+    removed (but at most 6 seconds) before restarting. Closes: #65070
+ * Added build dependencies.
+
+ -- Christian Hammers <ch@debian.org> Sun, 30 Jul 2000 16:16:48 +0200
+
+mysql (3.23.21-2) unstable; urgency=low
+
+  * Fixed a typo in safe_mysqld that prevented startup.
+
+ -- Christian Hammers <ch@debian.org> Sat, 29 Jul 2000 13:40:50 +0200
+
+mysql (3.23.21-1) unstable; urgency=low
+
+ * New upstream version.
+
+ -- Christian Hammers <ch@debian.org> Mon, 10 Jul 2000 22:54:17 +0200
+
+mysql (3.23.20-1) unstable; urgency=low
+
+ * MySQL finally got fully GPL'ed! This means that there is only one
+    source package and only main/* binary packages from now on.
+  * Fixed symlink in libmysqlclient9-dev. Closes: #66452
+ * Apart from that the usual bug fixes for BETA software.
+
+ -- Christian Hammers <ch@debian.org> Mon, 3 Jul 2000 20:05:38 +0200
+
+mysql-pd (3.23.16-1) unstable; urgency=low
+
+ * New upstream release. (Actually a brand new upstream branch!)
+ * Added mysql-common package as the configuration file can be used
+ by all versions of the mysql client library.
+ Did some more package reorganisations, too. See README.Debian file!
+ * libmysqlclient.so raised major version from 6 to 9.
+ * Minor beautifications in the debian/ directory.
+
+ -- Christian Hammers <ch@debian.org> Sat, 27 May 2000 20:30:01 +0200
+
+mysql-gpl (3.22.30-2) frozen unstable; urgency=low
+
+ * Fixed path in libmysqlclient.la. Closes: #58875
+
+ -- Christian Hammers <ch@debian.org> Sat, 25 Jan 2000 20:27:29 -0700
+
+mysql-gpl (3.22.30-1) frozen unstable; urgency=low
+
+  * A small change in libmysqlclient6 causes mysqladmin to print a
+    shared library error when displaying the defaults. Everything else
+    works fine, so this error wasn't detected until now. Closes: #58033
+ * TcX released a new MySQL version that includes another security patch,
+ this time against mysqlaccess. The author told me that it would be
+ fine if I just included the new .c in this source since I don't want
+    to go to 3.22.32 in frozen.
+ * ->Release Manager: Although the version number increased there is
+    no new code except for the shared library. The rest is the same
+ as in mysql-server and mysql-client.
+
+ -- Christian Hammers <ch@debian.org> Tue, 15 Feb 2000 23:26:54 +0100
+
+mysql-gpl (3.22.29-1) unstable; urgency=low
+
+ * New upstream version.
+
+ -- Christian Hammers <ch@debian.org> Thu, 6 Jan 2000 20:37:23 +0100
+
+mysql-gpl (3.22.27a-3) unstable; urgency=low
+
+ * Use system readline instead of bundled version. Closes: #50069
+    Any objections?
+
+ -- Christian Hammers <ch@debian.org> Sun, 14 Nov 1999 18:09:48 +0100
+
+mysql-gpl (3.22.27a-2) unstable; urgency=low
+
+ * Now building mysql-gpl-doc in binary-indep.
+
+ -- Christian Hammers <ch@debian.org> Sat, 23 Oct 1999 04:22:36 +0200
+
+mysql-gpl (3.22.27a-1) unstable; urgency=low
+
+ * Adjusted version number to allow new orig.tar.gz.
+    The old one seems broken :-( People reported compilation problems.
+ * Changed mysql-gpl-doc to "Architecture: all".
+
+ -- Christian Hammers <ch@debian.org> Sun, 17 Oct 1999 13:01:35 +0200
+
+mysql-gpl (3.22.27-1) unstable; urgency=low
+
+ * New upstream release. Fixes charset problem.
+
+ -- Christian Hammers <ch@debian.org> Mon, 11 Oct 1999 18:01:40 +0200
+
+mysql-gpl (3.22.26a-1) unstable; urgency=low
+
+ * New upstream version. Just some small bug fixes.
+ * FHS compliance.
+
+ -- Christian Hammers <ch@debian.org> Sun, 3 Oct 1999 10:16:14 +0200
+
+mysql-gpl (3.22.25-2) unstable; urgency=low
+
+ * Added conflict to all old mysql-dev packages. (fixes: #42966)
+
+ -- Christian Hammers <ch@debian.org> Sun, 15 Aug 1999 11:35:46 +0200
+
+mysql-gpl (3.22.25-1) unstable; urgency=low
+
+ * New upstream version. (We are waiting for 3.23.x !)
+  * Fixes some small upstream bugs.
+
+ -- Christian Hammers <ch@debian.org> Sun, 18 Jul 1999 22:02:06 +0200
+
+mysql-gpl (3.22.23b-4) unstable; urgency=low
+
+ * Rebuild for new perl.
+
+ -- Christian Hammers <ch@debian.org> Thu, 8 Jul 1999 01:09:57 +0200
+
+mysql-gpl (3.22.23b-3) unstable; urgency=low
+
+ * libmysqlclient had the wrong socket path.
+
+ -- Christian Hammers <ch@debian.org> Sun, 03 Jul 1999 23:13:30 +0200
+
+mysql-gpl (3.22.23b-2) unstable; urgency=low
+
+  * Missed one replace tag to a very old version of mysql-devel.
+
+ -- Christian Hammers <ch@debian.org> Sun, 27 Jun 1999 19:13:30 +0200
+
+mysql-gpl (3.22.23b-1) unstable; urgency=low
+
+ * New upstream minor version.
+ * Cleaned up the dependencies a bit.
+
+ -- Christian Hammers <ch@debian.org> Sun, 27 Jun 1999 19:13:30 +0200
+
+mysql-gpl (3.22.22-1) unstable; urgency=low
+
+ * New upstream version. (closes Bug#36493,37340)
+ * New maintainer upload.
+ * Package reorganisation: We prepare for the GPL'ed server which will
+    be released soon and make the structure clearer to the user.
+
+ -- Christian Hammers <ch@debian.org> Mon, 3 May 1999 20:43:41 +0200
+
+mysql (3.22.21-1) unstable; urgency=low
+
+ * Never released. TcX was too fast :-)
+
+ -- Christian Hammers <ch@debian.org> Tue, 20 Apr 1999 17:22:04 +0200
+
+mysql-freebits (3.21.33b-3) unstable; urgency=low
+
+ * Recompile with libncurses
+
+ -- Scott Hanson <shanson@debian.org> Sat, 31 Oct 1998 15:04:39 +0100
+
+mysql-freebits (3.21.33b-2) unstable; urgency=low
+
+ * Recompile with libstdc++2.9 (fixes #27792)
+
+ -- Scott Hanson <shanson@debian.org> Mon, 12 Oct 1998 18:47:25 +0200
+
+mysql-freebits (3.21.33b-1) unstable; urgency=low
+
+ * New upstream version (probably the last for 3.21)
+
+ -- Scott Hanson <shanson@debian.org> Tue, 8 Sep 1998 18:59:37 +0200
+
+mysql-freebits (3.21.33-4) unstable; urgency=low
+
+ * Separate out non-free source files, move mysql-base, mysql-dev, and
+    mysql-doc to the main distribution
+  * Locale files in /usr/share/mysql/ are now in server, not base; therefore...
+ * Add conflict to mysql-server <=3.21.33-3
+
+ -- Scott Hanson <shanson@debian.org> Fri, 31 Jul 1998 19:16:08 +0200
+
+mysql (3.21.33-3) unstable; urgency=low
+
+ * Release to unstable with moved socket (fixes #24574)
+ * Add conflict to old libdbd-mysql-perl package
+
+ -- Scott Hanson <shanson@debian.org> Wed, 22 Jul 1998 22:17:43 +0200
+
+mysql (3.21.33-2) experimental; urgency=low
+
+ * Move socket from /tmp to /var/run (see #24574)
+ * Release to experimental, since this breaks everything statically
+    linked to libmysqlclient!
+
+ -- Scott Hanson <shanson@debian.org> Wed, 15 Jul 1998 19:37:01 +0200
+
+mysql (3.21.33-1) unstable; urgency=low
+
+ * New upstream release
+
+ -- Scott Hanson <shanson@debian.org> Sun, 12 Jul 1998 08:18:18 +0200
+
+mysql (3.21.32a-1) unstable; urgency=low
+
+ * New upstream release
+ * Lintian bugs: ldconfig, missing manpage, call to perl5
+ * Lintian bug shlib-with-non-pic-code _not_ yet fixed
+
+ -- Scott Hanson <shanson@debian.org> Sat, 4 Jul 1998 07:57:13 +0200
+
+mysql (3.21.31-1) unstable frozen; urgency=low
+
+ * New upstream release for hamm and slink (bug fixes only)
+  * Fix insecure use of temp file in mysqlbug (fixes #23606)
+ * Added brief licensing information to control file
+
+ -- Scott Hanson <shanson@debian.org> Tue, 16 Jun 1998 10:52:44 +0200
+
+mysql (3.21.30-3) unstable; urgency=low
+
+ * Restore missing shared library dependencies for mysql-server
+
+ -- Scott Hanson <shanson@debian.org> Mon, 15 Jun 1998 07:51:58 +0200
+
+mysql (3.21.30-2) unstable; urgency=low
+
+ * Simplify debian/rules (fixes #17662)
+ * Edit manual.texi to add "Debian notes" to documentation
+ * Add note about passwords on command line (fixes #16471)
+  * Add note about getting privileges for users (fixes #22891)
+ * Correct "Possible license changes" heading (fixes #22711)
+ * Add uninstalled header files to /usr/doc/mysql-dev/examples (fixes #22627)
+ * Add udf_example.cc to /usr/doc/mysql-dev/examples (fixes #22710)
+
+ -- Scott Hanson <shanson@debian.org> Sun, 7 Jun 1998 13:05:37 +0200
+
+mysql (3.21.30-1) unstable; urgency=low
+
+ * Stable upstream release
+
+ -- Scott Hanson <shanson@debian.org> Tue, 12 May 1998 22:13:25 +0200
+
+mysql (3.21.29gamma-1) unstable; urgency=low
+
+ * New upstream release
+ * Do not create 'mysql' subdirectory for libs and headers (fixes #19020)
+ * Remove 'CXX=gcc' flag from configure (g++ now standard)
+
+ -- Scott Hanson <shanson@debian.org> Sun, 12 Apr 1998 18:38:03 +0200
+
+mysql (3.21.28gamma-1) unstable; urgency=low
+
+ * New upstream release
+ * Unstable-only release; hamm stays at 3.21.25 for now
+
+ -- Scott Hanson <shanson@debian.org> Thu, 2 Apr 1998 21:33:51 +0200
+
+mysql (3.21.25gamma-3) unstable frozen; urgency=low
+
+ * Have mysql-base suggest perl >= 5.004 for mysqlaccess (fixes #19593)
+  * Fix shlibs to refer to mysql-base rather than the no-longer-existent mysql
+
+ -- Scott Hanson <shanson@debian.org> Thu, 26 Mar 1998 18:22:59 +0100
+
+mysql (3.21.25gamma-2) unstable; urgency=low
+
+ * Restore libmysqlclient.so symlink to mysql-dev (fixes #19036)
+
+ -- Scott Hanson <shanson@debian.org> Sun, 8 Mar 1998 10:46:43 +0100
+
+mysql (3.21.25gamma-1) unstable; urgency=low
+
+ * Check if running as root in init.d script (fixes #18577)
+ * New upstream release
+
+ -- Scott Hanson <shanson@debian.org> Fri, 27 Feb 1998 20:01:30 +0100
+
+mysql (3.21.24gamma-1) unstable; urgency=low
+
+ * New upstream release
+
+ -- Scott Hanson <shanson@debian.org> Mon, 23 Feb 1998 08:14:17 +0100
+
+mysql (3.21.23beta-3) unstable; urgency=low
+
+ * Squashed errors found by lintian
+
+ -- Scott Hanson <shanson@debian.org> Tue, 17 Feb 1998 20:19:01 +0100
+
+mysql (3.21.23beta-2) unstable; urgency=low
+
+ * Fixed overlaps with old mysql package (fixes #17843)
+
+ -- Scott Hanson <shanson@debian.org> Thu, 5 Feb 1998 22:55:00 +0100
+
+mysql (3.21.23beta-1) unstable; urgency=low
+
+ * New upstream release
+ * Fix include lines in mysql.h (fixes #17827)
+ * Move /usr/include/mysql to mysql-dev
+
+ -- Scott Hanson <shanson@debian.org> Wed, 4 Feb 1998 19:59:14 +0100
+
+mysql (3.21.22beta-3) unstable; urgency=low
+
+ * Correct descriptions in control file (fixes #17698)
+ * Clean up output of shutdown script
+
+ -- Scott Hanson <shanson@debian.org> Sat, 31 Jan 1998 19:04:29 +0100
+
+mysql (3.21.22beta-2) unstable; urgency=low
+
+ * Split out mysql-dev and mysql-bench subpackages
+
+ -- Scott Hanson <shanson@debian.org> Wed, 28 Jan 1998 19:52:27 +0100
+
+mysql (3.21.22beta-1) unstable; urgency=low
+
+ * New upstream release
+
+ -- Scott Hanson <shanson@debian.org> Wed, 28 Jan 1998 18:59:09 +0100
+
+mysql (3.21.21a.beta-2) unstable; urgency=low
+
+ * Compile with libpthreads from libc6-dev_2.0.6-3 rather than statically
+ linking to patched libpthreads (see changes to 3.20.29-2)
+
+ -- Scott Hanson <shanson@debian.org> Sun, 25 Jan 1998 13:17:15 +0100
+
+mysql (3.21.21a.beta-1) unstable; urgency=low
+
+ * Put initial database, mysql_install_db, safe_mysqld, isamlog and
+ isamchk in mysql-server
+ * Correct upstream release number so source packages are correctly built
+
+ -- Scott Hanson <shanson@debian.org> Mon, 19 Jan 1998 07:52:48 +0100
+
+mysql (3.21.21.beta-1) unstable; urgency=low
+
+ * Use debhelper where possible in rules
+ * Split binary packages into mysql-base, mysql-client, mysql-doc
+ * New upstream release
+
+ -- Scott Hanson <shanson@debian.org> Thu, 15 Jan 1998 08:12:17 +0100
+
+mysql (3.21.19.beta-1) unstable; urgency=low
+
+ * Offer to set root password in mysql_install_db
+ * Kill `pidof mysqld` on shutdown rather than use mysqladmin
+ * New upstream version
+
+ -- Scott Hanson <shanson@debian.org> Fri, 9 Jan 1998 20:06:35 +0100
+
+mysql (3.21.17a.beta-2) unstable; urgency=low
+
+ * Remove perl stuff (it's going back into libdbd-mysql-perl)
+ * Remove conflict with libdbd-mysql-perl
+ * Do not compress *html files (fixes #16314)
+
+ -- Scott Hanson <shanson@debian.org> Tue, 30 Dec 1997 07:34:20 +0100
+
+mysql (3.21.17a.beta-1) unstable; urgency=low
+
+ * Add conflict to libdbd-mysql-perl
+ * Use --pid-file option to place pid file in /var/run rather than patching
+ * Add install-info to postinst and postrm
+ * Add filename to message shown by mysql_install_db (fixes #16621)
+ * New upstream version
+
+ -- Scott Hanson <shanson@debian.org> Sun, 21 Dec 1997 19:41:45 +0100
+
+mysql (3.20.32a-5) unstable; urgency=low
+
+ * Move mysqld to /usr/lib/mysql, per policy discussion
+ * Adjust makefiles so perl libs get installed
+
+ -- Scott Hanson <shanson@debian.org> Wed, 3 Dec 1997 22:37:45 +0100
+
+mysql (3.20.32a-4) unstable; urgency=low
+
+ * Move mysqld to /usr/sbin to comply with FSSTND
+
+ -- Scott Hanson <shanson@debian.org> Mon, 3 Nov 1997 20:12:29 +0100
+
+mysql (3.20.32a-3) unstable; urgency=low
+
+ * Comment out tests in mysql_install_db... for real this time!
+
+ -- Scott Hanson <shanson@debian.org> Mon, 3 Nov 1997 07:32:53 +0100
+
+mysql (3.20.32a-2) unstable; urgency=low
+
+ * Comment out tests in mysql_install_db (fixes #14304)
+
+ -- Scott Hanson <shanson@debian.org> Sat, 1 Nov 1997 18:45:25 +0100
+
+mysql (3.20.32a-1) unstable; urgency=low
+
+ * New upstream version
+
+ -- Scott Hanson <shanson@debian.org> Wed, 29 Oct 1997 07:11:42 +0100
+
+mysql (3.20.29-2) unstable; urgency=low
+
+ * New maintainer
+ * Statically link mysqld to patched glibc-2.0.5 libpthread
+ (works around #13586; see README.debian.glibc-2.0.5)
+ * Conflict with libpthread0 (fixes #13448)
+ * Don't link libg++, avoiding problems with glibc libpthread
+
+ -- Scott Hanson <shanson@debian.org> Thu, 16 Oct 1997 19:25:23 +0200
+
+mysql (3.20.29-1) unstable; urgency=low
+
+ * New upstream version
+ * Recompiled with libc6
+ * Include mysql-faq_toc.html (fixes #10885)
+ * Reworked /etc/init.d/mysql script (thanks to Heiko)
+ * Remove file /usr/lib/libmysqlclient.so.4 when package is removed.
+ * Use absolute path specification for conffile
+ * Use /usr/bin/perl instead of /bin/perl (fixes #10654)
+ * Do not depend on mysql (fixes #12427)
+ * Installed missing manpage for Mysql perl module
+ * Don't use debstd anymore
+ * Pristine source
+ * Set section to `non-free/devel'
+ * Upgraded to standards version 2.3.0.0
+
+ -- Christian Schwarz <schwarz@debian.org> Fri, 12 Sep 1997 02:12:58 +0200
+
+mysql (3.20.16beta-2) unstable; urgency=low
+
+ * Uses /usr/bin/perl instead of /bin/perl (fixes bug #9731)
+ * Don't run mysqld with --log option
+ * Don't install regex manual pages
+ * Suggest package mysql-manual
+ * Fixed typo in changelog
+ * Upgrade to policy 2.1.3.2
+
+ -- Christian Schwarz <schwarz@debian.org> Sun, 11 May 1997 14:19:26 +0200
+
+mysql (3.20.16beta-1) unstable; urgency=low
+
+ * Initial Release.
+
+ -- Christian Schwarz <schwarz@debian.org> Sat, 12 Apr 1997 13:51:28 +0200
diff --git a/storage/xtradb/build/debian/compat b/storage/xtradb/build/debian/compat
new file mode 100644
index 00000000000..b8626c4cff2
--- /dev/null
+++ b/storage/xtradb/build/debian/compat
@@ -0,0 +1 @@
+4
diff --git a/storage/xtradb/build/debian/control b/storage/xtradb/build/debian/control
new file mode 100644
index 00000000000..4d229f20a2d
--- /dev/null
+++ b/storage/xtradb/build/debian/control
@@ -0,0 +1,118 @@
+Source: percona-xtradb-dfsg-5.1
+Section: misc
+Priority: optional
+Maintainer: Percona SQL Development Team <mysql-dev@percona.com>
+Uploaders: Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+Build-Depends: libtool (>= 1.4.2-7), procps | hurd, debhelper (>= 4.1.16), file (>= 3.28-1), libncurses5-dev (>= 5.0-6), perl (>= 5.6.0), libwrap0-dev (>= 7.6-8.3), zlib1g-dev (>= 1:1.1.3-5), libreadline5-dev | libreadline-dev, psmisc, po-debconf, chrpath, automake1.9, doxygen, gs, dpatch, gawk, bison, lsb-release, fakeroot
+Standards-Version: 3.8.0
+Homepage: http://www.percona.com/
+Vcs-Browser: http://bazaar.launchpad.net/~percona-dev/percona-xtradb/release-1.0/files
+Vcs-Bzr: bzr+ssh://bazaar.launchpad.net/~percona-dev/percona-xtradb/release-1.0/
+
+Package: libpercona-xtradb-client16
+Section: libs
+Architecture: any
+Depends: percona-xtradb-common (>= ${source:Version}), ${shlibs:Depends}
+Description: Percona SQL database client library
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes the client library.
+
+Package: libpercona-xtradb-client15-dev
+Architecture: all
+Section: libdevel
+Depends: libpercona-xtradb-client-dev (>= ${source:Version})
+Description: Percona SQL database development files - empty transitional package
+ This is an empty package that depends on libpercona-xtradb-client-dev to ease the
+ transition for packages with versioned build-deps on libpercona-xtradb-client15-dev.
+
+Package: libpercona-xtradb-client-dev
+Architecture: any
+Section: libdevel
+Depends: libpercona-xtradb-client16 (>= ${source:Version}), zlib1g-dev, ${shlibs:Depends}
+Conflicts: libmysqlclient14-dev, libmysqlclient12-dev, libmysqlclient10-dev, libmysqlclient15-dev, libmysqlclient16-dev
+Replaces: libmysqlclient14-dev, libmysqlclient12-dev, libmysqlclient10-dev, libmysqlclient15-dev, libmysqlclient16-dev
+Description: Percona SQL database development files
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes development libraries and header files.
+
+Package: percona-xtradb-common
+Section: database
+Architecture: all
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Conflicts: mysql-common-4.1, mysql-common-5.0, mysql-common-5.1, mysql-common
+Provides: mysql-common
+Replaces: mysql-common-4.1, mysql-common-5.0, mysql-common-5.1, mysql-common
+Description: Percona SQL database common files (e.g. /etc/mysql/my.cnf)
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes files needed by all versions of the client library
+ (e.g. /etc/mysql/my.cnf).
+
+Package: percona-xtradb-client-5.1
+Architecture: any
+Depends: debianutils (>=1.6), libdbi-perl, percona-xtradb-common (>= ${source:Version}), libpercona-xtradb-client16 (>= ${source:Version}), ${perl:Depends}, ${shlibs:Depends}, ${misc:Depends}
+Provides: virtual-mysql-client, mysql-client, mysql-client-4.1, percona-xtradb-client, percona-xtradb-client-5.1
+Conflicts: mysql-client (<< ${source:Version}), mysql-client-5.0, mysql-client-5.1, percona-xtradb-client-5.0
+Replaces: mysql-client (<< ${source:Version}), mysql-client-5.0, mysql-client-5.1, percona-xtradb-client-5.0
+Description: Percona SQL database client binaries
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes the client binaries and the additional tools
+ innotop and mysqlreport.
+
+Package: percona-xtradb-server-5.1
+Architecture: any
+Suggests: tinyca
+Recommends: mailx, libhtml-template-perl
+Pre-Depends: percona-xtradb-common (>= ${source:Version}), adduser (>= 3.40), debconf
+Depends: percona-xtradb-client-5.1 (>= ${source:Version}), libdbi-perl, perl (>= 5.6), ${shlibs:Depends}, ${misc:Depends}, psmisc, passwd, lsb-base (>= 3.0-10)
+Conflicts: mysql-server (<< ${source:Version}), mysql-server-4.1, percona-xtradb-server-5.0
+Provides: mysql-server, virtual-mysql-server, mysql-server-5.0, percona-xtradb-server-5.1
+Replaces: mysql-server (<< ${source:Version}), mysql-server-5.0, percona-xtradb-server-5.0
+Description: Percona SQL database server binaries
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes the server binaries.
+
+Package: percona-xtradb-server
+Section: database
+Architecture: all
+Depends: percona-xtradb-server-5.1
+Description: Percona SQL database server (metapackage depending on the latest version)
+ This is an empty package that depends on the current "best" version of
+ percona-xtradb-server (currently percona-xtradb-server-5.1), as determined by the Percona SQL
+ maintainers. Install this package if in doubt about which Percona SQL
+ version you need. That will install the version recommended by the
+ package maintainers.
+ .
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+
+Package: percona-xtradb-client
+Section: database
+Architecture: all
+Depends: percona-xtradb-client-5.1
+Description: Percona SQL database client (metapackage depending on the latest version)
+ This is an empty package that depends on the current "best" version of
+ percona-xtradb-client (currently percona-xtradb-client-5.1), as determined by the Percona SQL
+ maintainers. Install this package if in doubt about which Percona SQL version
+ you want, as this is the one we consider to be in the best shape.
diff --git a/storage/xtradb/build/debian/copyright b/storage/xtradb/build/debian/copyright
new file mode 100644
index 00000000000..47fab54fc59
--- /dev/null
+++ b/storage/xtradb/build/debian/copyright
@@ -0,0 +1,169 @@
+
+== MySQL ==
+
+The Debian package of MySQL was first debianized on 1997-04-12 by Christian
+Schwarz <schwarz@debian.org> and has been maintained since 1999-04-20 by
+Christian Hammers <ch@debian.org>.
+
+It can be downloaded from http://www.mysql.com/
+
+Copyright:
+
+According to the file "COPYING" all parts of this package are licensed
+under the terms of the GNU GPL Version 2, of which a copy is available
+in /usr/share/common-licenses.
+
+To allow free software with other licences than the GPL to link against the
+shared library, special terms for "derived works" are defined in the file
+"EXCEPTIONS-CLIENT" which is quoted below.
+
+More information can be found on http://www.mysql.com/company/legal/licensing/
+
+The manual had to be removed as it is not free in the sense of the
+Debian Free Software Guidelines (DFSG).
+
+> Appendix I MySQL FLOSS License Exception
+> ****************************************
+>
+> Version 0.3, 10 February 2005
+>
+> The MySQL AB Exception for Free/Libre and Open Source Software-only
+> Applications Using MySQL Client Libraries (the "FLOSS Exception").
+>
+> Exception Intent
+> ================
+>
+> We want specified Free/Libre and Open Source Software ("FLOSS")
+> applications to be able to use specified GPL-licensed MySQL client
+> libraries (the "Program") despite the fact that not all FLOSS licenses
+> are compatible with version 2 of the GNU General Public License (the
+> "GPL").
+>
+> Legal Terms and Conditions
+> ==========================
+>
+> As a special exception to the terms and conditions of version 2.0 of the
+> GPL:
+>
+> 1. You are free to distribute a Derivative Work that is formed
+> entirely from the Program and one or more works (each, a "FLOSS
+> Work") licensed under one or more of the licenses listed below in
+> section 1, as long as:
+>
+> a. You obey the GPL in all respects for the Program and the
+> Derivative Work, except for identifiable sections of the
+> Derivative Work which are not derived from the Program, and
+> which can reasonably be considered independent and separate
+> works in themselves,
+>
+> b. all identifiable sections of the Derivative Work which are not
+> derived from the Program, and which can reasonably be
+> considered independent and separate works in themselves,
+>
+> i
+> are distributed subject to one of the FLOSS licenses
+> listed below, and
+>
+> ii
+> the object code or executable form of those sections are
+> accompanied by the complete corresponding
+> machine-readable source code for those sections on the
+> same medium and under the same FLOSS license as the
+> corresponding object code or executable forms of those
+> sections, and
+>
+> c. any works which are aggregated with the Program or with a
+> Derivative Work on a volume of a storage or distribution
+> medium in accordance with the GPL, can reasonably be
+> considered independent and separate works in themselves which
+> are not derivatives of either the Program, a Derivative Work
+> or a FLOSS Work.
+>
+> If the above conditions are not met, then the Program may only be
+> copied, modified, distributed or used under the terms and
+> conditions of the GPL or another valid licensing option from MySQL
+> AB.
+>
+> 2. FLOSS License List
+>
+> *License name* *Version(s)/Copyright Date*
+> Academic Free License 2.0
+> Apache Software License 1.0/1.1/2.0
+> Apple Public Source License 2.0
+> Artistic license From Perl 5.8.0
+> BSD license "July 22 1999"
+> Common Public License 1.0
+> GNU Library or "Lesser" General Public 2.0/2.1
+> License (LGPL)
+> Jabber Open Source License 1.0
+> MIT license -
+> Mozilla Public License (MPL) 1.0/1.1
+> Open Software License 2.0
+> OpenSSL license (with original SSLeay "2003" ("1998")
+> license)
+> PHP License 3.0
+> Python license (CNRI Python License) -
+> Python Software Foundation License 2.1.1
+> Sleepycat License "1999"
+> W3C License "2001"
+> X11 License "2001"
+> Zlib/libpng License -
+> Zope Public License 2.0
+>
+> Due to the many variants of some of the above licenses, we require
+> that any version follow the 2003 version of the Free Software
+> Foundation's Free Software Definition
+> (`http://www.gnu.org/philosophy/free-sw.html') or version 1.9 of
+> the Open Source Definition by the Open Source Initiative
+> (`http://www.opensource.org/docs/definition.php').
+>
+> 3. Definitions
+>
+> a. Terms used, but not defined, herein shall have the meaning
+> provided in the GPL.
+>
+> b. Derivative Work means a derivative work under copyright law.
+>
+> 4. Applicability This FLOSS Exception applies to all Programs that
+> contain a notice placed by MySQL AB saying that the Program may be
+> distributed under the terms of this FLOSS Exception. If you
+> create or distribute a work which is a Derivative Work of both the
+> Program and any other work licensed under the GPL, then this FLOSS
+> Exception is not available for that work; thus, you must remove
+> the FLOSS Exception notice from that work and comply with the GPL
+> in all respects, including by retaining all GPL notices. You may
+> choose to redistribute a copy of the Program exclusively under the
+> terms of the GPL by removing the FLOSS Exception notice from that
+> copy of the Program, provided that the copy has never been
+> modified by you or any third party.
+
+
+== innotop ==
+
+Author: Baron Schwartz <baron@xaprb.com>
+URL: http://innotop.sourceforge.net
+
+License:
+> This software is dual licensed, either GPL version 2 or Artistic License.
+>
+> This package is free software; you can redistribute it and/or modify
+> it under the terms of the GNU General Public License as published by
+> the Free Software Foundation; either version 2 of the License, or
+> (at your option) any later version.
+>
+> This package is distributed in the hope that it will be useful,
+> but WITHOUT ANY WARRANTY; without even the implied warranty of
+> MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+> GNU General Public License for more details.
+>
+> You should have received a copy of the GNU General Public License
+> along with this package; if not, write to the Free Software
+> Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+On Debian systems, the complete text of the GNU General Public License and the
+Artistic License can be found in `/usr/share/common-licenses/'.
+
+The upstream author explained at http://bugs.gentoo.org/show_bug.cgi?id=14760
+that these licenses also apply to the following files:
+- innotop.html
+- InnoDBParser.pm
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer
new file mode 100644
index 00000000000..f24cdcd519d
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer
@@ -0,0 +1,4 @@
+The examples directory includes files that might be needed by some
+developers:
+- header files not installed by default
+- the example file udf_example.c
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs
new file mode 100644
index 00000000000..f6ad2870431
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs
@@ -0,0 +1,2 @@
+usr/include/
+usr/lib/
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs
new file mode 100644
index 00000000000..eccf2c9c565
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs
@@ -0,0 +1 @@
+EXCEPTIONS-CLIENT
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples
new file mode 100644
index 00000000000..f1649c311c4
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples
@@ -0,0 +1 @@
+sql/udf_example.c
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.files b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.files
new file mode 100644
index 00000000000..6803365b5ea
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.files
@@ -0,0 +1,7 @@
+usr/bin/mysql_config
+usr/include/mysql/*.h
+usr/lib/libmysqlclient.a
+usr/lib/libmysqlclient.la
+usr/lib/mysql/*.a
+usr/lib/mysql/*.la
+usr/share/man/man1/mysql_config.1
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.links b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.links
new file mode 100644
index 00000000000..0481d1a0020
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.links
@@ -0,0 +1,2 @@
+usr/lib/libmysqlclient.so.16 usr/lib/libmysqlclient.so
+usr/lib/libmysqlclient_r.so.16 usr/lib/libmysqlclient_r.so
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.dirs b/storage/xtradb/build/debian/libpercona-xtradb-client16.dirs
new file mode 100644
index 00000000000..2964de6141b
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.dirs
@@ -0,0 +1 @@
+usr/lib/
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.docs b/storage/xtradb/build/debian/libpercona-xtradb-client16.docs
new file mode 100644
index 00000000000..eccf2c9c565
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.docs
@@ -0,0 +1 @@
+EXCEPTIONS-CLIENT
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.files b/storage/xtradb/build/debian/libpercona-xtradb-client16.files
new file mode 100644
index 00000000000..5162b7b9639
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.files
@@ -0,0 +1 @@
+usr/lib/libmysqlclient*.so.*
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.postinst b/storage/xtradb/build/debian/libpercona-xtradb-client16.postinst
new file mode 100644
index 00000000000..29d3b86f978
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.postinst
@@ -0,0 +1,12 @@
+#!/bin/bash -e
+
+# dh_installdeb will replace this with shell code automatically
+# generated by other debhelper scripts.
+
+#DEBHELPER#
+
+exit 0
+
+# vim: ts=4
+
+
diff --git a/storage/xtradb/build/debian/patches/00list b/storage/xtradb/build/debian/patches/00list
new file mode 100644
index 00000000000..5c50ab1ba53
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/00list
@@ -0,0 +1,6 @@
+33_scripts__mysql_create_system_tables__no_test.dpatch
+38_scripts__mysqld_safe.sh__signals.dpatch
+41_scripts__mysql_install_db.sh__no_test.dpatch
+44_scripts__mysql_config__libs.dpatch
+50_mysql-test__db_test.dpatch
+60_percona_support.dpatch
diff --git a/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch
new file mode 100644
index 00000000000..ca138afa746
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch
@@ -0,0 +1,776 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 01_MAKEFILES__Docs_Images_Makefile.in.dpatch by <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Creates Docs/Images/Makefile.in
+
+@DPATCH@
+
+--- old/Docs/Images/Makefile.in 2005-03-01 02:08:01.877429040 +0100
++++ new/Docs/Images/Makefile.in 2005-02-28 21:21:24.000000000 +0100
+@@ -0,0 +1,765 @@
++# Makefile.in generated by automake 1.7.9 from Makefile.am.
++# @configure_input@
++
++# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
++# Free Software Foundation, Inc.
++# This Makefile.in is free software; the Free Software Foundation
++# gives unlimited permission to copy and/or distribute it,
++# with or without modifications, as long as this notice is preserved.
++
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
++# PARTICULAR PURPOSE.
++
++@SET_MAKE@
++
++# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
++#
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 2 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++# Process this file with automake to create Makefile.in
++
++srcdir = @srcdir@
++top_srcdir = @top_srcdir@
++VPATH = @srcdir@
++pkgdatadir = $(datadir)/@PACKAGE@
++pkglibdir = $(libdir)/@PACKAGE@
++pkgincludedir = $(includedir)/@PACKAGE@
++top_builddir = .
++
++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
++INSTALL = @INSTALL@
++install_sh_DATA = $(install_sh) -c -m 644
++install_sh_PROGRAM = $(install_sh) -c
++install_sh_SCRIPT = $(install_sh) -c
++INSTALL_HEADER = $(INSTALL_DATA)
++transform = $(program_transform_name)
++NORMAL_INSTALL = :
++PRE_INSTALL = :
++POST_INSTALL = :
++NORMAL_UNINSTALL = :
++PRE_UNINSTALL = :
++POST_UNINSTALL = :
++build_triplet = @build@
++host_triplet = @host@
++target_triplet = @target@
++ACLOCAL = @ACLOCAL@
++ALLOCA = @ALLOCA@
++AMDEP_FALSE = @AMDEP_FALSE@
++AMDEP_TRUE = @AMDEP_TRUE@
++AMTAR = @AMTAR@
++AR = @AR@
++AS = @AS@
++ASSEMBLER_FALSE = @ASSEMBLER_FALSE@
++ASSEMBLER_TRUE = @ASSEMBLER_TRUE@
++ASSEMBLER_sparc32_FALSE = @ASSEMBLER_sparc32_FALSE@
++ASSEMBLER_sparc32_TRUE = @ASSEMBLER_sparc32_TRUE@
++ASSEMBLER_sparc64_FALSE = @ASSEMBLER_sparc64_FALSE@
++ASSEMBLER_sparc64_TRUE = @ASSEMBLER_sparc64_TRUE@
++ASSEMBLER_x86_FALSE = @ASSEMBLER_x86_FALSE@
++ASSEMBLER_x86_TRUE = @ASSEMBLER_x86_TRUE@
++AUTOCONF = @AUTOCONF@
++AUTOHEADER = @AUTOHEADER@
++AUTOMAKE = @AUTOMAKE@
++AVAILABLE_LANGUAGES = @AVAILABLE_LANGUAGES@
++AVAILABLE_LANGUAGES_ERRORS = @AVAILABLE_LANGUAGES_ERRORS@
++AWK = @AWK@
++CC = @CC@
++CCAS = @CCAS@
++CCASFLAGS = @CCASFLAGS@
++CCDEPMODE = @CCDEPMODE@
++CC_VERSION = @CC_VERSION@
++CFLAGS = @CFLAGS@
++CHARSETS_NEED_SOURCE = @CHARSETS_NEED_SOURCE@
++CHARSET_OBJS = @CHARSET_OBJS@
++CHARSET_SRCS = @CHARSET_SRCS@
++CHECK_PID = @CHECK_PID@
++CHMOD = @CHMOD@
++CLIENT_EXTRA_LDFLAGS = @CLIENT_EXTRA_LDFLAGS@
++CLIENT_LIBS = @CLIENT_LIBS@
++CMP = @CMP@
++COMPILATION_COMMENT = @COMPILATION_COMMENT@
++COMPILE_PSTACK_FALSE = @COMPILE_PSTACK_FALSE@
++COMPILE_PSTACK_TRUE = @COMPILE_PSTACK_TRUE@
++CONF_COMMAND = @CONF_COMMAND@
++CP = @CP@
++CPP = @CPP@
++CPPFLAGS = @CPPFLAGS@
++CXX = @CXX@
++CXXCPP = @CXXCPP@
++CXXDEPMODE = @CXXDEPMODE@
++CXXFLAGS = @CXXFLAGS@
++CXXLDFLAGS = @CXXLDFLAGS@
++CXX_VERSION = @CXX_VERSION@
++CYGPATH_W = @CYGPATH_W@
++DEFS = @DEFS@
++DEPDIR = @DEPDIR@
++DOT_FRM_VERSION = @DOT_FRM_VERSION@
++DVIS = @DVIS@
++ECHO = @ECHO@
++ECHO_C = @ECHO_C@
++ECHO_N = @ECHO_N@
++ECHO_T = @ECHO_T@
++EGREP = @EGREP@
++EXEEXT = @EXEEXT@
++F77 = @F77@
++FFLAGS = @FFLAGS@
++FIND_PROC = @FIND_PROC@
++GETCONF = @GETCONF@
++GXX = @GXX@
++HAVE_NETWARE_FALSE = @HAVE_NETWARE_FALSE@
++HAVE_NETWARE_TRUE = @HAVE_NETWARE_TRUE@
++HOSTNAME = @HOSTNAME@
++INSTALL_DATA = @INSTALL_DATA@
++INSTALL_PROGRAM = @INSTALL_PROGRAM@
++INSTALL_SCRIPT = @INSTALL_SCRIPT@
++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
++IS_LINUX = @IS_LINUX@
++KILL = @KILL@
++LD = @LD@
++LDFLAGS = @LDFLAGS@
++LIBDL = @LIBDL@
++LIBOBJS = @LIBOBJS@
++LIBS = @LIBS@
++LIBTOOL = @LIBTOOL@
++LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
++LM_CFLAGS = @LM_CFLAGS@
++LN = @LN@
++LN_CP_F = @LN_CP_F@
++LN_S = @LN_S@
++LOCAL_FALSE = @LOCAL_FALSE@
++LOCAL_TRUE = @LOCAL_TRUE@
++LTLIBOBJS = @LTLIBOBJS@
++MACHINE_TYPE = @MACHINE_TYPE@
++MAINT = @MAINT@
++MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@
++MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@
++MAKEINFO = @MAKEINFO@
++MAKE_BINARY_DISTRIBUTION_OPTIONS = @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++MAKE_SHELL = @MAKE_SHELL@
++MT_INCLUDES = @MT_INCLUDES@
++MT_LD_ADD = @MT_LD_ADD@
++MV = @MV@
++MYSQLD_DEFAULT_SWITCHES = @MYSQLD_DEFAULT_SWITCHES@
++MYSQLD_EXTRA_LDFLAGS = @MYSQLD_EXTRA_LDFLAGS@
++MYSQLD_USER = @MYSQLD_USER@
++MYSQL_BASE_VERSION = @MYSQL_BASE_VERSION@
++MYSQL_NO_DASH_VERSION = @MYSQL_NO_DASH_VERSION@
++MYSQL_SERVER_SUFFIX = @MYSQL_SERVER_SUFFIX@
++MYSQL_TCP_PORT = @MYSQL_TCP_PORT@
++MYSQL_TCP_PORT_DEFAULT = @MYSQL_TCP_PORT_DEFAULT@
++MYSQL_UNIX_ADDR = @MYSQL_UNIX_ADDR@
++MYSQL_VERSION_ID = @MYSQL_VERSION_ID@
++NOINST_LDFLAGS = @NOINST_LDFLAGS@
++OBJEXT = @OBJEXT@
++PACKAGE = @PACKAGE@
++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
++PACKAGE_NAME = @PACKAGE_NAME@
++PACKAGE_STRING = @PACKAGE_STRING@
++PACKAGE_TARNAME = @PACKAGE_TARNAME@
++PACKAGE_VERSION = @PACKAGE_VERSION@
++PATH_SEPARATOR = @PATH_SEPARATOR@
++PDFMANUAL = @PDFMANUAL@
++PERL = @PERL@
++PERL5 = @PERL5@
++PROTOCOL_VERSION = @PROTOCOL_VERSION@
++PS = @PS@
++RANLIB = @RANLIB@
++RM = @RM@
++SAVE_ASFLAGS = @SAVE_ASFLAGS@
++SAVE_CFLAGS = @SAVE_CFLAGS@
++SAVE_CXXFLAGS = @SAVE_CXXFLAGS@
++SAVE_CXXLDFLAGS = @SAVE_CXXLDFLAGS@
++SAVE_LDFLAGS = @SAVE_LDFLAGS@
++SED = @SED@
++SET_MAKE = @SET_MAKE@
++SHARED_LIB_VERSION = @SHARED_LIB_VERSION@
++SHELL = @SHELL@
++STRIP = @STRIP@
++SYSTEM_TYPE = @SYSTEM_TYPE@
++TAR = @TAR@
++TERMCAP_LIB = @TERMCAP_LIB@
++THREAD_LOBJECTS = @THREAD_LOBJECTS@
++THREAD_LPROGRAMS = @THREAD_LPROGRAMS@
++VERSION = @VERSION@
++WRAPLIBS = @WRAPLIBS@
++YACC = @YACC@
++ac_ct_AR = @ac_ct_AR@
++ac_ct_CC = @ac_ct_CC@
++ac_ct_CXX = @ac_ct_CXX@
++ac_ct_F77 = @ac_ct_F77@
++ac_ct_GETCONF = @ac_ct_GETCONF@
++ac_ct_RANLIB = @ac_ct_RANLIB@
++ac_ct_STRIP = @ac_ct_STRIP@
++am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
++am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
++am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
++am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
++am__include = @am__include@
++am__leading_dot = @am__leading_dot@
++am__quote = @am__quote@
++bdb_includes = @bdb_includes@
++bdb_libs = @bdb_libs@
++bdb_libs_with_path = @bdb_libs_with_path@
++bench_dirs = @bench_dirs@
++bindir = @bindir@
++build = @build@
++build_alias = @build_alias@
++build_cpu = @build_cpu@
++build_os = @build_os@
++build_vendor = @build_vendor@
++datadir = @datadir@
++default_charset = @default_charset@
++docs_dirs = @docs_dirs@
++exec_prefix = @exec_prefix@
++host = @host@
++host_alias = @host_alias@
++host_cpu = @host_cpu@
++host_os = @host_os@
++host_vendor = @host_vendor@
++includedir = @includedir@
++infodir = @infodir@
++innodb_includes = @innodb_includes@
++innodb_libs = @innodb_libs@
++innodb_system_libs = @innodb_system_libs@
++install_sh = @install_sh@
++isam_libs = @isam_libs@
++libdir = @libdir@
++libexecdir = @libexecdir@
++libmysqld_dirs = @libmysqld_dirs@
++linked_client_targets = @linked_client_targets@
++linked_netware_sources = @linked_netware_sources@
++localstatedir = @localstatedir@
++man_dirs = @man_dirs@
++mandir = @mandir@
++netware_dir = @netware_dir@
++oldincludedir = @oldincludedir@
++openssl_includes = @openssl_includes@
++openssl_libs = @openssl_libs@
++orbit_idl = @orbit_idl@
++orbit_includes = @orbit_includes@
++orbit_libs = @orbit_libs@
++prefix = @prefix@
++program_transform_name = @program_transform_name@
++pstack_dirs = @pstack_dirs@
++pstack_libs = @pstack_libs@
++readline_dir = @readline_dir@
++readline_link = @readline_link@
++sbindir = @sbindir@
++server_scripts = @server_scripts@
++sharedstatedir = @sharedstatedir@
++sql_client_dirs = @sql_client_dirs@
++sql_server_dirs = @sql_server_dirs@
++sysconfdir = @sysconfdir@
++target = @target@
++target_alias = @target_alias@
++target_cpu = @target_cpu@
++target_os = @target_os@
++target_vendor = @target_vendor@
++thread_dirs = @thread_dirs@
++tools_dirs = @tools_dirs@
++uname_prog = @uname_prog@
++vio_dir = @vio_dir@
++vio_libs = @vio_libs@
++
++AUTOMAKE_OPTIONS = foreign
++
++# These are built from source in the Docs directory
++EXTRA_DIST = INSTALL-SOURCE README COPYING EXCEPTIONS-CLIENT
++SUBDIRS = . include @docs_dirs@ @readline_dir@ \
++ @thread_dirs@ pstack @sql_client_dirs@ \
++ @sql_server_dirs@ scripts @man_dirs@ tests \
++ BUILD netware os2 @libmysqld_dirs@ \
++ @bench_dirs@ support-files @tools_dirs@
++
++
++# Relink after clean
++linked_sources = linked_client_sources linked_server_sources \
++ linked_libmysql_sources linked_libmysql_r_sources \
++ linked_libmysqld_sources linked_libmysqldex_sources \
++ linked_include_sources @linked_netware_sources@
++
++
++CLEANFILES = $(linked_sources)
++subdir = .
++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
++mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
++CONFIG_HEADER = config.h
++CONFIG_CLEAN_FILES = bdb/Makefile
++DIST_SOURCES =
++
++RECURSIVE_TARGETS = info-recursive dvi-recursive pdf-recursive \
++ ps-recursive install-info-recursive uninstall-info-recursive \
++ all-recursive install-data-recursive install-exec-recursive \
++ installdirs-recursive install-recursive uninstall-recursive \
++ check-recursive installcheck-recursive
++DIST_COMMON = README $(srcdir)/Makefile.in $(srcdir)/configure COPYING \
++ ChangeLog Makefile.am acconfig.h acinclude.m4 aclocal.m4 \
++ config.guess config.h.in config.sub configure configure.in \
++ depcomp install-sh ltconfig ltmain.sh missing mkinstalldirs
++DIST_SUBDIRS = $(SUBDIRS)
++all: config.h
++ $(MAKE) $(AM_MAKEFLAGS) all-recursive
++
++.SUFFIXES:
++
++am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
++ configure.lineno
++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
++ cd $(top_srcdir) && \
++ $(AUTOMAKE) --foreign Makefile
++Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status
++ cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)
++
++$(top_builddir)/config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
++ $(SHELL) ./config.status --recheck
++$(srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES)
++ cd $(srcdir) && $(AUTOCONF)
++
++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ configure.in acinclude.m4
++ cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
++
++stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
++ @rm -f stamp-h1
++ cd $(top_builddir) && $(SHELL) ./config.status config.h
++
++$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(top_srcdir)/configure.in $(ACLOCAL_M4) $(top_srcdir)/acconfig.h
++ cd $(top_srcdir) && $(AUTOHEADER)
++ touch $(srcdir)/config.h.in
++
++distclean-hdr:
++ -rm -f config.h stamp-h1
++bdb/Makefile: $(top_builddir)/config.status $(top_srcdir)/bdb/Makefile.in
++ cd $(top_builddir) && $(SHELL) ./config.status $@
++
++mostlyclean-libtool:
++ -rm -f *.lo
++
++clean-libtool:
++ -rm -rf .libs _libs
++
++distclean-libtool:
++ -rm -f libtool
++uninstall-info-am:
++
++# This directory's subdirectories are mostly independent; you can cd
++# into them and run `make' without going through this Makefile.
++# To change the values of `make' variables: instead of editing Makefiles,
++# (1) if the variable is set in `config.status', edit `config.status'
++# (which will cause the Makefiles to be regenerated when you run `make');
++# (2) otherwise, pass the desired values on the `make' command line.
++$(RECURSIVE_TARGETS):
++ @set fnord $$MAKEFLAGS; amf=$$2; \
++ dot_seen=no; \
++ target=`echo $@ | sed s/-recursive//`; \
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ echo "Making $$target in $$subdir"; \
++ if test "$$subdir" = "."; then \
++ dot_seen=yes; \
++ local_target="$$target-am"; \
++ else \
++ local_target="$$target"; \
++ fi; \
++ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++ done; \
++ if test "$$dot_seen" = "no"; then \
++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
++ fi; test -z "$$fail"
++
++mostlyclean-recursive clean-recursive distclean-recursive \
++maintainer-clean-recursive:
++ @set fnord $$MAKEFLAGS; amf=$$2; \
++ dot_seen=no; \
++ case "$@" in \
++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
++ *) list='$(SUBDIRS)' ;; \
++ esac; \
++ rev=''; for subdir in $$list; do \
++ if test "$$subdir" = "."; then :; else \
++ rev="$$subdir $$rev"; \
++ fi; \
++ done; \
++ rev="$$rev ."; \
++ target=`echo $@ | sed s/-recursive//`; \
++ for subdir in $$rev; do \
++ echo "Making $$target in $$subdir"; \
++ if test "$$subdir" = "."; then \
++ local_target="$$target-am"; \
++ else \
++ local_target="$$target"; \
++ fi; \
++ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++ done && test -z "$$fail"
++tags-recursive:
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
++ done
++ctags-recursive:
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
++ done
++
++ETAGS = etags
++ETAGSFLAGS =
++
++CTAGS = ctags
++CTAGSFLAGS =
++
++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
++ unique=`for i in $$list; do \
++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++ done | \
++ $(AWK) ' { files[$$0] = 1; } \
++ END { for (i in files) print i; }'`; \
++ mkid -fID $$unique
++
++TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++ $(TAGS_FILES) $(LISP)
++ tags=; \
++ here=`pwd`; \
++ if (etags --etags-include --version) >/dev/null 2>&1; then \
++ include_option=--etags-include; \
++ else \
++ include_option=--include; \
++ fi; \
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ if test "$$subdir" = .; then :; else \
++ test -f $$subdir/TAGS && \
++ tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
++ fi; \
++ done; \
++ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++ unique=`for i in $$list; do \
++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++ done | \
++ $(AWK) ' { files[$$0] = 1; } \
++ END { for (i in files) print i; }'`; \
++ test -z "$(ETAGS_ARGS)$$tags$$unique" \
++ || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
++ $$tags $$unique
++
++ctags: CTAGS
++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++ $(TAGS_FILES) $(LISP)
++ tags=; \
++ here=`pwd`; \
++ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++ unique=`for i in $$list; do \
++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++ done | \
++ $(AWK) ' { files[$$0] = 1; } \
++ END { for (i in files) print i; }'`; \
++ test -z "$(CTAGS_ARGS)$$tags$$unique" \
++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
++ $$tags $$unique
++
++GTAGS:
++ here=`$(am__cd) $(top_builddir) && pwd` \
++ && cd $(top_srcdir) \
++ && gtags -i $(GTAGS_ARGS) $$here
++
++distclean-tags:
++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
++
++top_distdir = .
++distdir = $(PACKAGE)-$(VERSION)
++
++am__remove_distdir = \
++ { test ! -d $(distdir) \
++ || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
++ && rm -fr $(distdir); }; }
++
++GZIP_ENV = --best
++distuninstallcheck_listfiles = find . -type f -print
++distcleancheck_listfiles = find . -type f -print
++
++distdir: $(DISTFILES)
++ $(am__remove_distdir)
++ mkdir $(distdir)
++ $(mkinstalldirs) $(distdir)/bdb $(distdir)/include
++ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
++ list='$(DISTFILES)'; for file in $$list; do \
++ case $$file in \
++ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
++ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
++ esac; \
++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
++ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
++ if test "$$dir" != "$$file" && test "$$dir" != "."; then \
++ dir="/$$dir"; \
++ $(mkinstalldirs) "$(distdir)$$dir"; \
++ else \
++ dir=''; \
++ fi; \
++ if test -d $$d/$$file; then \
++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
++ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
++ fi; \
++ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
++ else \
++ test -f $(distdir)/$$file \
++ || cp -p $$d/$$file $(distdir)/$$file \
++ || exit 1; \
++ fi; \
++ done
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ if test "$$subdir" = .; then :; else \
++ test -d $(distdir)/$$subdir \
++ || mkdir $(distdir)/$$subdir \
++ || exit 1; \
++ (cd $$subdir && \
++ $(MAKE) $(AM_MAKEFLAGS) \
++ top_distdir="$(top_distdir)" \
++ distdir=../$(distdir)/$$subdir \
++ distdir) \
++ || exit 1; \
++ fi; \
++ done
++ $(MAKE) $(AM_MAKEFLAGS) \
++ top_distdir="$(top_distdir)" distdir="$(distdir)" \
++ dist-hook
++ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
++ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
++ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
++ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
++ || chmod -R a+r $(distdir)
++dist-gzip: distdir
++ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++ $(am__remove_distdir)
++
++dist dist-all: distdir
++ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++ $(am__remove_distdir)
++
++# This target untars the dist file and tries a VPATH configuration. Then
++# it guarantees that the distribution is self-contained by making another
++# tarfile.
++distcheck: dist
++ $(am__remove_distdir)
++ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(AMTAR) xf -
++ chmod -R a-w $(distdir); chmod a+w $(distdir)
++ mkdir $(distdir)/_build
++ mkdir $(distdir)/_inst
++ chmod a-w $(distdir)
++ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
++ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
++ && cd $(distdir)/_build \
++ && ../configure --srcdir=.. --prefix="$$dc_install_base" \
++ $(DISTCHECK_CONFIGURE_FLAGS) \
++ && $(MAKE) $(AM_MAKEFLAGS) \
++ && $(MAKE) $(AM_MAKEFLAGS) dvi \
++ && $(MAKE) $(AM_MAKEFLAGS) check \
++ && $(MAKE) $(AM_MAKEFLAGS) install \
++ && $(MAKE) $(AM_MAKEFLAGS) installcheck \
++ && $(MAKE) $(AM_MAKEFLAGS) uninstall \
++ && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
++ distuninstallcheck \
++ && chmod -R a-w "$$dc_install_base" \
++ && ({ \
++ (cd ../.. && $(mkinstalldirs) "$$dc_destdir") \
++ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
++ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
++ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
++ distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
++ } || { rm -rf "$$dc_destdir"; exit 1; }) \
++ && rm -rf "$$dc_destdir" \
++ && $(MAKE) $(AM_MAKEFLAGS) dist-gzip \
++ && rm -f $(distdir).tar.gz \
++ && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
++ $(am__remove_distdir)
++ @echo "$(distdir).tar.gz is ready for distribution" | \
++ sed 'h;s/./=/g;p;x;p;x'
++distuninstallcheck:
++ @cd $(distuninstallcheck_dir) \
++ && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
++ || { echo "ERROR: files left after uninstall:" ; \
++ if test -n "$(DESTDIR)"; then \
++ echo " (check DESTDIR support)"; \
++ fi ; \
++ $(distuninstallcheck_listfiles) ; \
++ exit 1; } >&2
++distcleancheck: distclean
++ @if test '$(srcdir)' = . ; then \
++ echo "ERROR: distcleancheck can only run from a VPATH build" ; \
++ exit 1 ; \
++ fi
++ @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
++ || { echo "ERROR: files left in build directory after distclean:" ; \
++ $(distcleancheck_listfiles) ; \
++ exit 1; } >&2
++check-am: all-am
++check: check-recursive
++all-am: Makefile config.h
++installdirs: installdirs-recursive
++installdirs-am:
++
++install: install-recursive
++install-exec: install-exec-recursive
++install-data: install-data-recursive
++uninstall: uninstall-recursive
++
++install-am: all-am
++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
++
++installcheck: installcheck-recursive
++install-strip:
++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
++ `test -z '$(STRIP)' || \
++ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
++mostlyclean-generic:
++
++clean-generic:
++ -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
++
++distclean-generic:
++ -rm -f $(CONFIG_CLEAN_FILES)
++
++maintainer-clean-generic:
++ @echo "This command is intended for maintainers to use"
++ @echo "it deletes files that may require special tools to rebuild."
++clean: clean-recursive
++
++clean-am: clean-generic clean-libtool mostlyclean-am
++
++distclean: distclean-recursive
++ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
++ -rm -f Makefile
++distclean-am: clean-am distclean-generic distclean-hdr distclean-libtool \
++ distclean-tags
++
++dvi: dvi-recursive
++
++dvi-am:
++
++info: info-recursive
++
++info-am:
++
++install-data-am:
++
++install-exec-am:
++
++install-info: install-info-recursive
++
++install-man:
++
++installcheck-am:
++
++maintainer-clean: maintainer-clean-recursive
++ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
++ -rm -rf $(top_srcdir)/autom4te.cache
++ -rm -f Makefile
++maintainer-clean-am: distclean-am maintainer-clean-generic
++
++mostlyclean: mostlyclean-recursive
++
++mostlyclean-am: mostlyclean-generic mostlyclean-libtool
++
++pdf: pdf-recursive
++
++pdf-am:
++
++ps: ps-recursive
++
++ps-am:
++
++uninstall-am: uninstall-info-am
++
++uninstall-info: uninstall-info-recursive
++
++.PHONY: $(RECURSIVE_TARGETS) CTAGS GTAGS all all-am check check-am clean \
++ clean-generic clean-libtool clean-recursive ctags \
++ ctags-recursive dist dist-all dist-gzip distcheck distclean \
++ distclean-generic distclean-hdr distclean-libtool \
++ distclean-recursive distclean-tags distcleancheck distdir \
++ distuninstallcheck dvi dvi-am dvi-recursive info info-am \
++ info-recursive install install-am install-data install-data-am \
++ install-data-recursive install-exec install-exec-am \
++ install-exec-recursive install-info install-info-am \
++ install-info-recursive install-man install-recursive \
++ install-strip installcheck installcheck-am installdirs \
++ installdirs-am installdirs-recursive maintainer-clean \
++ maintainer-clean-generic maintainer-clean-recursive mostlyclean \
++ mostlyclean-generic mostlyclean-libtool mostlyclean-recursive \
++ pdf pdf-am pdf-recursive ps ps-am ps-recursive tags \
++ tags-recursive uninstall uninstall-am uninstall-info-am \
++ uninstall-info-recursive uninstall-recursive
++
++
++# This is just so that the linking is done early.
++config.h: $(linked_sources)
++
++linked_include_sources:
++ cd include; $(MAKE) link_sources
++ echo timestamp > linked_include_sources
++
++linked_client_sources: @linked_client_targets@
++ cd client; $(MAKE) link_sources
++ echo timestamp > linked_client_sources
++
++linked_libmysql_sources:
++ cd libmysql; $(MAKE) link_sources
++ echo timestamp > linked_libmysql_sources
++
++linked_libmysql_r_sources: linked_libmysql_sources
++ cd libmysql_r; $(MAKE) link_sources
++ echo timestamp > linked_libmysql_r_sources
++
++linked_libmysqld_sources:
++ cd libmysqld; $(MAKE) link_sources
++ echo timestamp > linked_libmysqld_sources
++
++linked_libmysqldex_sources:
++ cd libmysqld/examples; $(MAKE) link_sources
++ echo timestamp > linked_libmysqldex_sources
++
++linked_netware_sources:
++ cd @netware_dir@; $(MAKE) link_sources
++ echo timestamp > linked_netware_sources
++
++#avoid recursive make calls in sql directory
++linked_server_sources:
++ cd sql; rm -f mini_client_errors.c;@LN_CP_F@ ../libmysql/errmsg.c mini_client_errors.c
++ echo timestamp > linked_server_sources
++
++# Create permission databases
++init-db: all
++ $(top_builddir)/scripts/mysql_install_db
++
++bin-dist: all
++ $(top_builddir)/scripts/make_binary_distribution @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++
++# Remove BK's "SCCS" subdirectories from source distribution
++dist-hook:
++ rm -rf `find $(distdir) -type d -name SCCS`
++
++tags:
++ support-files/build-tags
++.PHONY: init-db bin-dist
++
++# Test installation
++
++test:
++ cd mysql-test ; ./mysql-test-run
++# Tell versions [3.59,3.63) of GNU make to not export all variables.
++# Otherwise a system limit (for SysV at least) may be exceeded.
++.NOEXPORT:
diff --git a/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch
new file mode 100644
index 00000000000..6440697bc02
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch
@@ -0,0 +1,776 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 01_MAKEFILES__Docs_Makefile.in.dpatch by <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Creates Docs/Makefile.in
+
+@DPATCH@
+
+--- old/Docs/Makefile.in 2005-03-01 02:08:01.877429040 +0100
++++ new/Docs/Makefile.in 2005-02-28 21:21:24.000000000 +0100
+@@ -0,0 +1,765 @@
++# Makefile.in generated by automake 1.7.9 from Makefile.am.
++# @configure_input@
++
++# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
++# Free Software Foundation, Inc.
++# This Makefile.in is free software; the Free Software Foundation
++# gives unlimited permission to copy and/or distribute it,
++# with or without modifications, as long as this notice is preserved.
++
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
++# PARTICULAR PURPOSE.
++
++@SET_MAKE@
++
++# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
++#
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 2 of the License, or
++# (at your option) any later version.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++# Process this file with automake to create Makefile.in
++
++srcdir = @srcdir@
++top_srcdir = @top_srcdir@
++VPATH = @srcdir@
++pkgdatadir = $(datadir)/@PACKAGE@
++pkglibdir = $(libdir)/@PACKAGE@
++pkgincludedir = $(includedir)/@PACKAGE@
++top_builddir = .
++
++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
++INSTALL = @INSTALL@
++install_sh_DATA = $(install_sh) -c -m 644
++install_sh_PROGRAM = $(install_sh) -c
++install_sh_SCRIPT = $(install_sh) -c
++INSTALL_HEADER = $(INSTALL_DATA)
++transform = $(program_transform_name)
++NORMAL_INSTALL = :
++PRE_INSTALL = :
++POST_INSTALL = :
++NORMAL_UNINSTALL = :
++PRE_UNINSTALL = :
++POST_UNINSTALL = :
++build_triplet = @build@
++host_triplet = @host@
++target_triplet = @target@
++ACLOCAL = @ACLOCAL@
++ALLOCA = @ALLOCA@
++AMDEP_FALSE = @AMDEP_FALSE@
++AMDEP_TRUE = @AMDEP_TRUE@
++AMTAR = @AMTAR@
++AR = @AR@
++AS = @AS@
++ASSEMBLER_FALSE = @ASSEMBLER_FALSE@
++ASSEMBLER_TRUE = @ASSEMBLER_TRUE@
++ASSEMBLER_sparc32_FALSE = @ASSEMBLER_sparc32_FALSE@
++ASSEMBLER_sparc32_TRUE = @ASSEMBLER_sparc32_TRUE@
++ASSEMBLER_sparc64_FALSE = @ASSEMBLER_sparc64_FALSE@
++ASSEMBLER_sparc64_TRUE = @ASSEMBLER_sparc64_TRUE@
++ASSEMBLER_x86_FALSE = @ASSEMBLER_x86_FALSE@
++ASSEMBLER_x86_TRUE = @ASSEMBLER_x86_TRUE@
++AUTOCONF = @AUTOCONF@
++AUTOHEADER = @AUTOHEADER@
++AUTOMAKE = @AUTOMAKE@
++AVAILABLE_LANGUAGES = @AVAILABLE_LANGUAGES@
++AVAILABLE_LANGUAGES_ERRORS = @AVAILABLE_LANGUAGES_ERRORS@
++AWK = @AWK@
++CC = @CC@
++CCAS = @CCAS@
++CCASFLAGS = @CCASFLAGS@
++CCDEPMODE = @CCDEPMODE@
++CC_VERSION = @CC_VERSION@
++CFLAGS = @CFLAGS@
++CHARSETS_NEED_SOURCE = @CHARSETS_NEED_SOURCE@
++CHARSET_OBJS = @CHARSET_OBJS@
++CHARSET_SRCS = @CHARSET_SRCS@
++CHECK_PID = @CHECK_PID@
++CHMOD = @CHMOD@
++CLIENT_EXTRA_LDFLAGS = @CLIENT_EXTRA_LDFLAGS@
++CLIENT_LIBS = @CLIENT_LIBS@
++CMP = @CMP@
++COMPILATION_COMMENT = @COMPILATION_COMMENT@
++COMPILE_PSTACK_FALSE = @COMPILE_PSTACK_FALSE@
++COMPILE_PSTACK_TRUE = @COMPILE_PSTACK_TRUE@
++CONF_COMMAND = @CONF_COMMAND@
++CP = @CP@
++CPP = @CPP@
++CPPFLAGS = @CPPFLAGS@
++CXX = @CXX@
++CXXCPP = @CXXCPP@
++CXXDEPMODE = @CXXDEPMODE@
++CXXFLAGS = @CXXFLAGS@
++CXXLDFLAGS = @CXXLDFLAGS@
++CXX_VERSION = @CXX_VERSION@
++CYGPATH_W = @CYGPATH_W@
++DEFS = @DEFS@
++DEPDIR = @DEPDIR@
++DOT_FRM_VERSION = @DOT_FRM_VERSION@
++DVIS = @DVIS@
++ECHO = @ECHO@
++ECHO_C = @ECHO_C@
++ECHO_N = @ECHO_N@
++ECHO_T = @ECHO_T@
++EGREP = @EGREP@
++EXEEXT = @EXEEXT@
++F77 = @F77@
++FFLAGS = @FFLAGS@
++FIND_PROC = @FIND_PROC@
++GETCONF = @GETCONF@
++GXX = @GXX@
++HAVE_NETWARE_FALSE = @HAVE_NETWARE_FALSE@
++HAVE_NETWARE_TRUE = @HAVE_NETWARE_TRUE@
++HOSTNAME = @HOSTNAME@
++INSTALL_DATA = @INSTALL_DATA@
++INSTALL_PROGRAM = @INSTALL_PROGRAM@
++INSTALL_SCRIPT = @INSTALL_SCRIPT@
++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
++IS_LINUX = @IS_LINUX@
++KILL = @KILL@
++LD = @LD@
++LDFLAGS = @LDFLAGS@
++LIBDL = @LIBDL@
++LIBOBJS = @LIBOBJS@
++LIBS = @LIBS@
++LIBTOOL = @LIBTOOL@
++LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
++LM_CFLAGS = @LM_CFLAGS@
++LN = @LN@
++LN_CP_F = @LN_CP_F@
++LN_S = @LN_S@
++LOCAL_FALSE = @LOCAL_FALSE@
++LOCAL_TRUE = @LOCAL_TRUE@
++LTLIBOBJS = @LTLIBOBJS@
++MACHINE_TYPE = @MACHINE_TYPE@
++MAINT = @MAINT@
++MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@
++MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@
++MAKEINFO = @MAKEINFO@
++MAKE_BINARY_DISTRIBUTION_OPTIONS = @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++MAKE_SHELL = @MAKE_SHELL@
++MT_INCLUDES = @MT_INCLUDES@
++MT_LD_ADD = @MT_LD_ADD@
++MV = @MV@
++MYSQLD_DEFAULT_SWITCHES = @MYSQLD_DEFAULT_SWITCHES@
++MYSQLD_EXTRA_LDFLAGS = @MYSQLD_EXTRA_LDFLAGS@
++MYSQLD_USER = @MYSQLD_USER@
++MYSQL_BASE_VERSION = @MYSQL_BASE_VERSION@
++MYSQL_NO_DASH_VERSION = @MYSQL_NO_DASH_VERSION@
++MYSQL_SERVER_SUFFIX = @MYSQL_SERVER_SUFFIX@
++MYSQL_TCP_PORT = @MYSQL_TCP_PORT@
++MYSQL_TCP_PORT_DEFAULT = @MYSQL_TCP_PORT_DEFAULT@
++MYSQL_UNIX_ADDR = @MYSQL_UNIX_ADDR@
++MYSQL_VERSION_ID = @MYSQL_VERSION_ID@
++NOINST_LDFLAGS = @NOINST_LDFLAGS@
++OBJEXT = @OBJEXT@
++PACKAGE = @PACKAGE@
++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
++PACKAGE_NAME = @PACKAGE_NAME@
++PACKAGE_STRING = @PACKAGE_STRING@
++PACKAGE_TARNAME = @PACKAGE_TARNAME@
++PACKAGE_VERSION = @PACKAGE_VERSION@
++PATH_SEPARATOR = @PATH_SEPARATOR@
++PDFMANUAL = @PDFMANUAL@
++PERL = @PERL@
++PERL5 = @PERL5@
++PROTOCOL_VERSION = @PROTOCOL_VERSION@
++PS = @PS@
++RANLIB = @RANLIB@
++RM = @RM@
++SAVE_ASFLAGS = @SAVE_ASFLAGS@
++SAVE_CFLAGS = @SAVE_CFLAGS@
++SAVE_CXXFLAGS = @SAVE_CXXFLAGS@
++SAVE_CXXLDFLAGS = @SAVE_CXXLDFLAGS@
++SAVE_LDFLAGS = @SAVE_LDFLAGS@
++SED = @SED@
++SET_MAKE = @SET_MAKE@
++SHARED_LIB_VERSION = @SHARED_LIB_VERSION@
++SHELL = @SHELL@
++STRIP = @STRIP@
++SYSTEM_TYPE = @SYSTEM_TYPE@
++TAR = @TAR@
++TERMCAP_LIB = @TERMCAP_LIB@
++THREAD_LOBJECTS = @THREAD_LOBJECTS@
++THREAD_LPROGRAMS = @THREAD_LPROGRAMS@
++VERSION = @VERSION@
++WRAPLIBS = @WRAPLIBS@
++YACC = @YACC@
++ac_ct_AR = @ac_ct_AR@
++ac_ct_CC = @ac_ct_CC@
++ac_ct_CXX = @ac_ct_CXX@
++ac_ct_F77 = @ac_ct_F77@
++ac_ct_GETCONF = @ac_ct_GETCONF@
++ac_ct_RANLIB = @ac_ct_RANLIB@
++ac_ct_STRIP = @ac_ct_STRIP@
++am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
++am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
++am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
++am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
++am__include = @am__include@
++am__leading_dot = @am__leading_dot@
++am__quote = @am__quote@
++bdb_includes = @bdb_includes@
++bdb_libs = @bdb_libs@
++bdb_libs_with_path = @bdb_libs_with_path@
++bench_dirs = @bench_dirs@
++bindir = @bindir@
++build = @build@
++build_alias = @build_alias@
++build_cpu = @build_cpu@
++build_os = @build_os@
++build_vendor = @build_vendor@
++datadir = @datadir@
++default_charset = @default_charset@
++docs_dirs = @docs_dirs@
++exec_prefix = @exec_prefix@
++host = @host@
++host_alias = @host_alias@
++host_cpu = @host_cpu@
++host_os = @host_os@
++host_vendor = @host_vendor@
++includedir = @includedir@
++infodir = @infodir@
++innodb_includes = @innodb_includes@
++innodb_libs = @innodb_libs@
++innodb_system_libs = @innodb_system_libs@
++install_sh = @install_sh@
++isam_libs = @isam_libs@
++libdir = @libdir@
++libexecdir = @libexecdir@
++libmysqld_dirs = @libmysqld_dirs@
++linked_client_targets = @linked_client_targets@
++linked_netware_sources = @linked_netware_sources@
++localstatedir = @localstatedir@
++man_dirs = @man_dirs@
++mandir = @mandir@
++netware_dir = @netware_dir@
++oldincludedir = @oldincludedir@
++openssl_includes = @openssl_includes@
++openssl_libs = @openssl_libs@
++orbit_idl = @orbit_idl@
++orbit_includes = @orbit_includes@
++orbit_libs = @orbit_libs@
++prefix = @prefix@
++program_transform_name = @program_transform_name@
++pstack_dirs = @pstack_dirs@
++pstack_libs = @pstack_libs@
++readline_dir = @readline_dir@
++readline_link = @readline_link@
++sbindir = @sbindir@
++server_scripts = @server_scripts@
++sharedstatedir = @sharedstatedir@
++sql_client_dirs = @sql_client_dirs@
++sql_server_dirs = @sql_server_dirs@
++sysconfdir = @sysconfdir@
++target = @target@
++target_alias = @target_alias@
++target_cpu = @target_cpu@
++target_os = @target_os@
++target_vendor = @target_vendor@
++thread_dirs = @thread_dirs@
++tools_dirs = @tools_dirs@
++uname_prog = @uname_prog@
++vio_dir = @vio_dir@
++vio_libs = @vio_libs@
++
++AUTOMAKE_OPTIONS = foreign
++
++# These are built from source in the Docs directory
++EXTRA_DIST = INSTALL-SOURCE README COPYING EXCEPTIONS-CLIENT
++SUBDIRS = . include @docs_dirs@ @readline_dir@ \
++ @thread_dirs@ pstack @sql_client_dirs@ \
++ @sql_server_dirs@ scripts @man_dirs@ tests \
++ BUILD netware os2 @libmysqld_dirs@ \
++ @bench_dirs@ support-files @tools_dirs@
++
++
++# Relink after clean
++linked_sources = linked_client_sources linked_server_sources \
++ linked_libmysql_sources linked_libmysql_r_sources \
++ linked_libmysqld_sources linked_libmysqldex_sources \
++ linked_include_sources @linked_netware_sources@
++
++
++CLEANFILES = $(linked_sources)
++subdir = .
++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
++mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
++CONFIG_HEADER = config.h
++CONFIG_CLEAN_FILES = bdb/Makefile
++DIST_SOURCES =
++
++RECURSIVE_TARGETS = info-recursive dvi-recursive pdf-recursive \
++ ps-recursive install-info-recursive uninstall-info-recursive \
++ all-recursive install-data-recursive install-exec-recursive \
++ installdirs-recursive install-recursive uninstall-recursive \
++ check-recursive installcheck-recursive
++DIST_COMMON = README $(srcdir)/Makefile.in $(srcdir)/configure COPYING \
++ ChangeLog Makefile.am acconfig.h acinclude.m4 aclocal.m4 \
++ config.guess config.h.in config.sub configure configure.in \
++ depcomp install-sh ltconfig ltmain.sh missing mkinstalldirs
++DIST_SUBDIRS = $(SUBDIRS)
++all: config.h
++ $(MAKE) $(AM_MAKEFLAGS) all-recursive
++
++.SUFFIXES:
++
++am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
++ configure.lineno
++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.in $(ACLOCAL_M4)
++ cd $(top_srcdir) && \
++ $(AUTOMAKE) --foreign Makefile
++Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status
++ cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)
++
++$(top_builddir)/config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
++ $(SHELL) ./config.status --recheck
++$(srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES)
++ cd $(srcdir) && $(AUTOCONF)
++
++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ configure.in acinclude.m4
++ cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
++
++stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
++ @rm -f stamp-h1
++ cd $(top_builddir) && $(SHELL) ./config.status config.h
++
++$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(top_srcdir)/configure.in $(ACLOCAL_M4) $(top_srcdir)/acconfig.h
++ cd $(top_srcdir) && $(AUTOHEADER)
++ touch $(srcdir)/config.h.in
++
++distclean-hdr:
++ -rm -f config.h stamp-h1
++bdb/Makefile: $(top_builddir)/config.status $(top_srcdir)/bdb/Makefile.in
++ cd $(top_builddir) && $(SHELL) ./config.status $@
++
++mostlyclean-libtool:
++ -rm -f *.lo
++
++clean-libtool:
++ -rm -rf .libs _libs
++
++distclean-libtool:
++ -rm -f libtool
++uninstall-info-am:
++
++# This directory's subdirectories are mostly independent; you can cd
++# into them and run `make' without going through this Makefile.
++# To change the values of `make' variables: instead of editing Makefiles,
++# (1) if the variable is set in `config.status', edit `config.status'
++# (which will cause the Makefiles to be regenerated when you run `make');
++# (2) otherwise, pass the desired values on the `make' command line.
++$(RECURSIVE_TARGETS):
++ @set fnord $$MAKEFLAGS; amf=$$2; \
++ dot_seen=no; \
++ target=`echo $@ | sed s/-recursive//`; \
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ echo "Making $$target in $$subdir"; \
++ if test "$$subdir" = "."; then \
++ dot_seen=yes; \
++ local_target="$$target-am"; \
++ else \
++ local_target="$$target"; \
++ fi; \
++ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++ done; \
++ if test "$$dot_seen" = "no"; then \
++ $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
++ fi; test -z "$$fail"
++
++mostlyclean-recursive clean-recursive distclean-recursive \
++maintainer-clean-recursive:
++ @set fnord $$MAKEFLAGS; amf=$$2; \
++ dot_seen=no; \
++ case "$@" in \
++ distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
++ *) list='$(SUBDIRS)' ;; \
++ esac; \
++ rev=''; for subdir in $$list; do \
++ if test "$$subdir" = "."; then :; else \
++ rev="$$subdir $$rev"; \
++ fi; \
++ done; \
++ rev="$$rev ."; \
++ target=`echo $@ | sed s/-recursive//`; \
++ for subdir in $$rev; do \
++ echo "Making $$target in $$subdir"; \
++ if test "$$subdir" = "."; then \
++ local_target="$$target-am"; \
++ else \
++ local_target="$$target"; \
++ fi; \
++ (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++ || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++ done && test -z "$$fail"
++tags-recursive:
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
++ done
++ctags-recursive:
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
++ done
++
++ETAGS = etags
++ETAGSFLAGS =
++
++CTAGS = ctags
++CTAGSFLAGS =
++
++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
++ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
++ unique=`for i in $$list; do \
++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++ done | \
++ $(AWK) ' { files[$$0] = 1; } \
++ END { for (i in files) print i; }'`; \
++ mkid -fID $$unique
++
++TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++ $(TAGS_FILES) $(LISP)
++ tags=; \
++ here=`pwd`; \
++ if (etags --etags-include --version) >/dev/null 2>&1; then \
++ include_option=--etags-include; \
++ else \
++ include_option=--include; \
++ fi; \
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ if test "$$subdir" = .; then :; else \
++ test -f $$subdir/TAGS && \
++ tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
++ fi; \
++ done; \
++ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++ unique=`for i in $$list; do \
++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++ done | \
++ $(AWK) ' { files[$$0] = 1; } \
++ END { for (i in files) print i; }'`; \
++ test -z "$(ETAGS_ARGS)$$tags$$unique" \
++ || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
++ $$tags $$unique
++
++ctags: CTAGS
++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++ $(TAGS_FILES) $(LISP)
++ tags=; \
++ here=`pwd`; \
++ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++ unique=`for i in $$list; do \
++ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++ done | \
++ $(AWK) ' { files[$$0] = 1; } \
++ END { for (i in files) print i; }'`; \
++ test -z "$(CTAGS_ARGS)$$tags$$unique" \
++ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
++ $$tags $$unique
++
++GTAGS:
++ here=`$(am__cd) $(top_builddir) && pwd` \
++ && cd $(top_srcdir) \
++ && gtags -i $(GTAGS_ARGS) $$here
++
++distclean-tags:
++ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
++
++top_distdir = .
++distdir = $(PACKAGE)-$(VERSION)
++
++am__remove_distdir = \
++ { test ! -d $(distdir) \
++ || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
++ && rm -fr $(distdir); }; }
++
++GZIP_ENV = --best
++distuninstallcheck_listfiles = find . -type f -print
++distcleancheck_listfiles = find . -type f -print
++
++distdir: $(DISTFILES)
++ $(am__remove_distdir)
++ mkdir $(distdir)
++ $(mkinstalldirs) $(distdir)/bdb $(distdir)/include
++ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
++ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
++ list='$(DISTFILES)'; for file in $$list; do \
++ case $$file in \
++ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
++ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
++ esac; \
++ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
++ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
++ if test "$$dir" != "$$file" && test "$$dir" != "."; then \
++ dir="/$$dir"; \
++ $(mkinstalldirs) "$(distdir)$$dir"; \
++ else \
++ dir=''; \
++ fi; \
++ if test -d $$d/$$file; then \
++ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
++ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
++ fi; \
++ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
++ else \
++ test -f $(distdir)/$$file \
++ || cp -p $$d/$$file $(distdir)/$$file \
++ || exit 1; \
++ fi; \
++ done
++ list='$(SUBDIRS)'; for subdir in $$list; do \
++ if test "$$subdir" = .; then :; else \
++ test -d $(distdir)/$$subdir \
++ || mkdir $(distdir)/$$subdir \
++ || exit 1; \
++ (cd $$subdir && \
++ $(MAKE) $(AM_MAKEFLAGS) \
++ top_distdir="$(top_distdir)" \
++ distdir=../$(distdir)/$$subdir \
++ distdir) \
++ || exit 1; \
++ fi; \
++ done
++ $(MAKE) $(AM_MAKEFLAGS) \
++ top_distdir="$(top_distdir)" distdir="$(distdir)" \
++ dist-hook
++ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
++ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
++ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
++ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
++ || chmod -R a+r $(distdir)
++dist-gzip: distdir
++ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++ $(am__remove_distdir)
++
++dist dist-all: distdir
++ $(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++ $(am__remove_distdir)
++
++# This target untars the dist file and tries a VPATH configuration. Then
++# it guarantees that the distribution is self-contained by making another
++# tarfile.
++distcheck: dist
++ $(am__remove_distdir)
++ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(AMTAR) xf -
++ chmod -R a-w $(distdir); chmod a+w $(distdir)
++ mkdir $(distdir)/_build
++ mkdir $(distdir)/_inst
++ chmod a-w $(distdir)
++ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
++ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
++ && cd $(distdir)/_build \
++ && ../configure --srcdir=.. --prefix="$$dc_install_base" \
++ $(DISTCHECK_CONFIGURE_FLAGS) \
++ && $(MAKE) $(AM_MAKEFLAGS) \
++ && $(MAKE) $(AM_MAKEFLAGS) dvi \
++ && $(MAKE) $(AM_MAKEFLAGS) check \
++ && $(MAKE) $(AM_MAKEFLAGS) install \
++ && $(MAKE) $(AM_MAKEFLAGS) installcheck \
++ && $(MAKE) $(AM_MAKEFLAGS) uninstall \
++ && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
++ distuninstallcheck \
++ && chmod -R a-w "$$dc_install_base" \
++ && ({ \
++ (cd ../.. && $(mkinstalldirs) "$$dc_destdir") \
++ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
++ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
++ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
++ distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
++ } || { rm -rf "$$dc_destdir"; exit 1; }) \
++ && rm -rf "$$dc_destdir" \
++ && $(MAKE) $(AM_MAKEFLAGS) dist-gzip \
++ && rm -f $(distdir).tar.gz \
++ && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
++ $(am__remove_distdir)
++ @echo "$(distdir).tar.gz is ready for distribution" | \
++ sed 'h;s/./=/g;p;x;p;x'
++distuninstallcheck:
++ @cd $(distuninstallcheck_dir) \
++ && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
++ || { echo "ERROR: files left after uninstall:" ; \
++ if test -n "$(DESTDIR)"; then \
++ echo " (check DESTDIR support)"; \
++ fi ; \
++ $(distuninstallcheck_listfiles) ; \
++ exit 1; } >&2
++distcleancheck: distclean
++ @if test '$(srcdir)' = . ; then \
++ echo "ERROR: distcleancheck can only run from a VPATH build" ; \
++ exit 1 ; \
++ fi
++ @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
++ || { echo "ERROR: files left in build directory after distclean:" ; \
++ $(distcleancheck_listfiles) ; \
++ exit 1; } >&2
++check-am: all-am
++check: check-recursive
++all-am: Makefile config.h
++installdirs: installdirs-recursive
++installdirs-am:
++
++install: install-recursive
++install-exec: install-exec-recursive
++install-data: install-data-recursive
++uninstall: uninstall-recursive
++
++install-am: all-am
++ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
++
++installcheck: installcheck-recursive
++install-strip:
++ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
++ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
++ `test -z '$(STRIP)' || \
++ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
++mostlyclean-generic:
++
++clean-generic:
++ -test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
++
++distclean-generic:
++ -rm -f $(CONFIG_CLEAN_FILES)
++
++maintainer-clean-generic:
++ @echo "This command is intended for maintainers to use"
++ @echo "it deletes files that may require special tools to rebuild."
++clean: clean-recursive
++
++clean-am: clean-generic clean-libtool mostlyclean-am
++
++distclean: distclean-recursive
++ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
++ -rm -f Makefile
++distclean-am: clean-am distclean-generic distclean-hdr distclean-libtool \
++ distclean-tags
++
++dvi: dvi-recursive
++
++dvi-am:
++
++info: info-recursive
++
++info-am:
++
++install-data-am:
++
++install-exec-am:
++
++install-info: install-info-recursive
++
++install-man:
++
++installcheck-am:
++
++maintainer-clean: maintainer-clean-recursive
++ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
++ -rm -rf $(top_srcdir)/autom4te.cache
++ -rm -f Makefile
++maintainer-clean-am: distclean-am maintainer-clean-generic
++
++mostlyclean: mostlyclean-recursive
++
++mostlyclean-am: mostlyclean-generic mostlyclean-libtool
++
++pdf: pdf-recursive
++
++pdf-am:
++
++ps: ps-recursive
++
++ps-am:
++
++uninstall-am: uninstall-info-am
++
++uninstall-info: uninstall-info-recursive
++
++.PHONY: $(RECURSIVE_TARGETS) CTAGS GTAGS all all-am check check-am clean \
++ clean-generic clean-libtool clean-recursive ctags \
++ ctags-recursive dist dist-all dist-gzip distcheck distclean \
++ distclean-generic distclean-hdr distclean-libtool \
++ distclean-recursive distclean-tags distcleancheck distdir \
++ distuninstallcheck dvi dvi-am dvi-recursive info info-am \
++ info-recursive install install-am install-data install-data-am \
++ install-data-recursive install-exec install-exec-am \
++ install-exec-recursive install-info install-info-am \
++ install-info-recursive install-man install-recursive \
++ install-strip installcheck installcheck-am installdirs \
++ installdirs-am installdirs-recursive maintainer-clean \
++ maintainer-clean-generic maintainer-clean-recursive mostlyclean \
++ mostlyclean-generic mostlyclean-libtool mostlyclean-recursive \
++ pdf pdf-am pdf-recursive ps ps-am ps-recursive tags \
++ tags-recursive uninstall uninstall-am uninstall-info-am \
++ uninstall-info-recursive uninstall-recursive
++
++
++# This is just so that the linking is done early.
++config.h: $(linked_sources)
++
++linked_include_sources:
++ cd include; $(MAKE) link_sources
++ echo timestamp > linked_include_sources
++
++linked_client_sources: @linked_client_targets@
++ cd client; $(MAKE) link_sources
++ echo timestamp > linked_client_sources
++
++linked_libmysql_sources:
++ cd libmysql; $(MAKE) link_sources
++ echo timestamp > linked_libmysql_sources
++
++linked_libmysql_r_sources: linked_libmysql_sources
++ cd libmysql_r; $(MAKE) link_sources
++ echo timestamp > linked_libmysql_r_sources
++
++linked_libmysqld_sources:
++ cd libmysqld; $(MAKE) link_sources
++ echo timestamp > linked_libmysqld_sources
++
++linked_libmysqldex_sources:
++ cd libmysqld/examples; $(MAKE) link_sources
++ echo timestamp > linked_libmysqldex_sources
++
++linked_netware_sources:
++ cd @netware_dir@; $(MAKE) link_sources
++ echo timestamp > linked_netware_sources
++
++#avoid recursive make calls in sql directory
++linked_server_sources:
++ cd sql; rm -f mini_client_errors.c;@LN_CP_F@ ../libmysql/errmsg.c mini_client_errors.c
++ echo timestamp > linked_server_sources
++
++# Create permission databases
++init-db: all
++ $(top_builddir)/scripts/mysql_install_db
++
++bin-dist: all
++ $(top_builddir)/scripts/make_binary_distribution @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++
++# Remove BK's "SCCS" subdirectories from source distribution
++dist-hook:
++ rm -rf `find $(distdir) -type d -name SCCS`
++
++tags:
++ support-files/build-tags
++.PHONY: init-db bin-dist
++
++# Test installation
++
++test:
++ cd mysql-test ; ./mysql-test-run
++# Tell versions [3.59,3.63) of GNU make to not export all variables.
++# Otherwise a system limit (for SysV at least) may be exceeded.
++.NOEXPORT:
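
A rough usage sketch of the dist/distcheck targets defined above, run from the
build tree (standard automake workflow; the DISTCHECK_CONFIGURE_FLAGS value is
only an illustrative assumption):

    # Build the release tarball, then unpack it and run a full VPATH
    # configure/build/check cycle to prove the tarball is self-contained.
    make dist
    make distcheck DISTCHECK_CONFIGURE_FLAGS="--without-docs"   # example flag only
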
diff --git a/storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch b/storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch
new file mode 100644
index 00000000000..0fd166d1fc7
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch
@@ -0,0 +1,29 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 33_scripts__mysql_create_system_tables__no_test.dpatch by <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: scripts__mysql_create_system_tables__no_test
+## DP: A user with no password prevents a normal user from logging in under
+## DP: certain circumstances, as it is checked first. See #301741.
+## DP: http://bugs.mysql.com/bug.php?id=6901
+
+@DPATCH@
+--- old/scripts/mysql_system_tables_data.sql 2008-12-04 22:59:44.000000000 +0100
++++ new/scripts/mysql_system_tables_data.sql 2008-12-04 23:00:07.000000000 +0100
+@@ -11,8 +11,6 @@
+ -- Fill "db" table with default grants for anyone to
+ -- access database 'test' and 'test_%' if "db" table didn't exist
+ CREATE TEMPORARY TABLE tmp_db LIKE db;
+-INSERT INTO tmp_db VALUES ('%','test','','Y','Y','Y','Y','Y','Y','N','Y','Y','Y','Y','Y','Y','Y','Y','N','N','Y','Y');
+-INSERT INTO tmp_db VALUES ('%','test\_%','','Y','Y','Y','Y','Y','Y','N','Y','Y','Y','Y','Y','Y','Y','Y','N','N','Y','Y');
+ INSERT INTO db SELECT * FROM tmp_db WHERE @had_db_table=0;
+ DROP TABLE tmp_db;
+
+@@ -24,7 +22,5 @@
+ INSERT INTO tmp_user VALUES ('localhost','root','','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','','','','',0,0,0,0);
+ REPLACE INTO tmp_user SELECT @current_hostname,'root','','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','','','','',0,0,0,0 FROM dual WHERE LOWER( @current_hostname) != 'localhost';
+ REPLACE INTO tmp_user VALUES ('127.0.0.1','root','','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','','','','',0,0,0,0);
+-INSERT INTO tmp_user (host,user) VALUES ('localhost','');
+-INSERT INTO tmp_user (host,user) SELECT @current_hostname,'' FROM dual WHERE LOWER(@current_hostname ) != 'localhost';
+ INSERT INTO user SELECT * FROM tmp_user WHERE @had_user_table=0;
+ DROP TABLE tmp_user;
diff --git a/storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch b/storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch
new file mode 100644
index 00000000000..154bc0ad1c4
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch
@@ -0,0 +1,43 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 38_scripts__mysqld_safe.sh__signals.dpatch by <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Executes /etc/init.d/mysql on signals
+## DP: Reported as http://bugs.mysql.com/bug.php?id=31361
+
+@DPATCH@
+
+--- old/scripts/mysqld_safe.sh 2006-07-29 13:12:34.000000000 +0200
++++ old/scripts/mysqld_safe.sh 2006-07-29 13:14:08.000000000 +0200
+@@ -16,8 +16,6 @@
+ # This command can be used as pipe to syslog. With "-s" it also logs to stderr.
+ ERR_LOGGER="logger -p daemon.err -t mysqld_safe -i"
+
+-trap '' 1 2 3 15 # we shouldn't let anyone kill us
+-
+ umask 007
+
+ defaults=
+@@ -122,7 +122,7 @@
+ # sed buffers output (only GNU sed supports a -u (unbuffered) option)
+ # which means that messages may not get sent to syslog until the
+ # mysqld process quits.
+- cmd="$cmd 2>&1 | logger -t '$syslog_tag_mysqld' -p daemon.error"
++ cmd="$cmd 2>&1 | logger -t '$syslog_tag_mysqld' -p daemon.error & wait"
+ ;;
+ *)
+ echo "Internal program error (non-fatal):" \
+@@ -352,6 +350,13 @@
+ fi
+
+ #
++# From now on, we catch signals to do a proper shutdown of mysqld
++# when signalled to do so.
++#
++trap '/usr/bin/mysqladmin --defaults-extra-file=/etc/mysql/debian.cnf refresh' 1 # HUP
++trap '/usr/bin/mysqladmin --defaults-extra-file=/etc/mysql/debian.cnf shutdown' 2 3 15 # INT QUIT and TERM
++
++#
+ # Uncomment the following lines if you want all tables to be automatically
+ # checked and repaired during startup. You should add sensible key_buffer
+ # and sort_buffer values to my.cnf to improve check performance or require
diff --git a/storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch b/storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch
new file mode 100644
index 00000000000..e79ac71cc7b
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch
@@ -0,0 +1,20 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 41_scripts__mysql_install_db.sh__no_test.dpatch by <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: scripts__mysql_install_db.sh__no_test
+## DP: http://bugs.mysql.com/bug.php?id=6901
+
+@DPATCH@
+
+--- mysql-dfsg-5.1-5.1.23rc.orig/scripts/mysql_install_db.sh 2008-01-29 22:41:20.000000000 +0100
++++ mysql-dfsg-5.1-5.1.23rc/scripts/mysql_install_db.sh 2008-02-28 10:08:11.000000000 +0100
+@@ -306,7 +306,7 @@
+ fi
+
+ # Create database directories
+-for dir in $ldata $ldata/mysql $ldata/test
++for dir in $ldata $ldata/mysql
+ do
+ if test ! -d $dir
+ then
diff --git a/storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch b/storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch
new file mode 100644
index 00000000000..b35ba4912f3
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch
@@ -0,0 +1,24 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 99-unnamed.dpatch by <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Removes unnecessary library dependencies. See #390692
+
+@DPATCH@
+diff -Nur mysql-dfsg-5.1-5.1.31.orig/scripts/mysql_config.sh mysql-dfsg-5.1-5.1.31/scripts/mysql_config.sh
+--- mysql-dfsg-5.1-5.1.31.orig/scripts/mysql_config.sh 2009-01-19 17:30:55.000000000 +0100
++++ mysql-dfsg-5.1-5.1.31/scripts/mysql_config.sh 2009-02-08 17:17:48.000000000 +0100
+@@ -104,10 +104,10 @@
+
+ # Create options
+ # We intentionally add a space to the beginning and end of lib strings, simplifies replace later
+-libs=" $ldflags -L$pkglibdir -lmysqlclient @ZLIB_DEPS@ @NON_THREADED_LIBS@"
++libs=" $ldflags -L$pkglibdir -lmysqlclient"
+ libs="$libs @openssl_libs@ @STATIC_NSS_FLAGS@ "
+-libs_r=" $ldflags -L$pkglibdir -lmysqlclient_r @ZLIB_DEPS@ @LIBS@ @openssl_libs@ "
+-embedded_libs=" $ldflags -L$pkglibdir -lmysqld @LIBDL@ @ZLIB_DEPS@ @LIBS@ @WRAPLIBS@ @innodb_system_libs@ @openssl_libs@ "
++libs_r=" $ldflags -L$pkglibdir -lmysqlclient_r @openssl_libs@ "
++embedded_libs=" $ldflags -L$pkglibdir -lmysqld @LIBDL@ @WRAPLIBS@ @innodb_system_libs@ @openssl_libs@ "
+
+ if [ -r "$pkglibdir/libmygcc.a" ]; then
+ # When linking against the static library with a different version of GCC
diff --git a/storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch b/storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch
new file mode 100644
index 00000000000..6a5cab91c39
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch
@@ -0,0 +1,23 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 50_mysql-test__db_test.dpatch by Christian Hammers <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Patch 33_scripts__mysql_create_system_tables__no_test removes the
+## DP: rights for anybody to connect to the test database but the test
+## DP: suite depends on them.
+
+@DPATCH@
+
+--- old/mysql-test/mysql-test-run.pl 2009-06-16 14:24:09.000000000 +0200
++++ new/mysql-test/mysql-test-run.pl 2009-07-04 00:03:34.000000000 +0200
+@@ -2717,6 +2717,10 @@
+ mtr_appendfile_to_file("$sql_dir/mysql_system_tables_data.sql",
+ $bootstrap_sql_file);
+
++ mtr_tofile($bootstrap_sql_file, "-- Debian removed the default privileges on the 'test' database\n");
++ mtr_tofile($bootstrap_sql_file, "INSERT INTO mysql.db VALUES ('%','test','','Y','Y','Y','Y','Y','Y','N','Y','Y','Y','Y','Y','Y','Y','Y','N','N','Y','Y');\n");
++
++
+ # Add test data for timezone - this is just a subset, on a real
+ # system these tables will be populated either by mysql_tzinfo_to_sql
+ # or by downloading the timezone table package from our website
diff --git a/storage/xtradb/build/debian/patches/60_percona_support.dpatch b/storage/xtradb/build/debian/patches/60_percona_support.dpatch
new file mode 100644
index 00000000000..e69d0dd5f76
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/60_percona_support.dpatch
@@ -0,0 +1,16 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+
+@DPATCH@
+
+--- a/scripts/mysql_install_db.sh 2009-08-08 09:20:07.000000000 +0000
++++ b/scripts/mysql_install_db.sh 2009-08-08 09:29:23.000000000 +0000
+@@ -469,6 +469,9 @@
+ echo
+ echo "Please report any problems with the $scriptdir/mysqlbug script!"
+ echo
++ echo "For commercial support please contact Percona at http://www.percona.com/contacts.html"
++ echo
++
+ fi
+
+ exit 0
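
The files above follow the usual dpatch layout: a shell header handed to
/usr/share/dpatch/dpatch-run, a "## DP:" description, and the patch body after
@DPATCH@. A hedged sketch of applying or reverting one of them by hand during
packaging work (assumes the dpatch tool is installed and the series is listed
in debian/patches/00list):

    # apply / revert a single patch from the unpacked source tree
    sh debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch -patch
    sh debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch -unpatch
    # or drive the whole series
    dpatch apply-all
    dpatch deapply-all
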
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian b/storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian
new file mode 100644
index 00000000000..b245638f9c9
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian
@@ -0,0 +1,4 @@
+FAQ:
+
+Q: My <tab> completion is gone, why?
+A: You have "no-auto-rehash" in the "[mysql]" section of /etc/mysql/my.cnf!
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs b/storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs
new file mode 100644
index 00000000000..ceda5922c5d
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs
@@ -0,0 +1,3 @@
+usr/bin/
+usr/share/man/man1/
+usr/share/perl5/
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.docs b/storage/xtradb/build/debian/percona-xtradb-client-5.1.docs
new file mode 100644
index 00000000000..8b8692f0d90
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.docs
@@ -0,0 +1,3 @@
+debian/additions/innotop/changelog.innotop
+EXCEPTIONS-CLIENT
+README
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.files b/storage/xtradb/build/debian/percona-xtradb-client-5.1.files
new file mode 100644
index 00000000000..9ba5fe35054
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.files
@@ -0,0 +1,39 @@
+usr/bin/innotop
+usr/bin/myisam_ftdump
+usr/bin/mysql
+usr/bin/mysqlaccess
+usr/bin/mysqladmin
+usr/bin/mysqlbug
+usr/bin/mysqlcheck
+usr/bin/mysql_client_test
+usr/bin/mysqldump
+usr/bin/mysqldumpslow
+usr/bin/mysql_find_rows
+usr/bin/mysql_fix_extensions
+usr/bin/mysqlimport
+usr/bin/mysqlreport
+usr/bin/mysqlshow
+usr/bin/mysql_waitpid
+usr/sbin/mysqlmanager
+usr/share/lintian/overrides/percona-xtradb-client-5.1
+usr/share/man/man1/innotop.1
+usr/share/man/man1/myisam_ftdump.1
+usr/share/man/man1/mysql.1
+usr/share/man/man1/mysqlaccess.1
+usr/share/man/man1/mysqladmin.1
+usr/share/man/man1/mysqlbug.1
+usr/share/man/man1/mysqlcheck.1
+usr/share/man/man1/mysqldump.1
+usr/share/man/man1/mysqldumpslow.1
+usr/share/man/man1/mysql_find_rows.1
+usr/share/man/man1/mysql_fix_extensions.1
+usr/share/man/man1/mysqlimport.1
+usr/share/man/man1/mysqlmanager.1
+usr/share/man/man1/mysqlmanagerc.1
+usr/share/man/man1/mysqlmanager-pwgen.1
+usr/share/man/man1/mysqlreport.1
+usr/share/man/man1/mysqlshow.1
+usr/share/man/man1/mysql_tableinfo.1
+usr/share/man/man1/mysql_waitpid.1
+usr/share/man/man1/mysql_client_test.1
+usr/share/perl5/InnoDBParser.pm
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.links b/storage/xtradb/build/debian/percona-xtradb-client-5.1.links
new file mode 100644
index 00000000000..247369fa218
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.links
@@ -0,0 +1,3 @@
+usr/bin/mysqlcheck usr/bin/mysqlrepair
+usr/bin/mysqlcheck usr/bin/mysqlanalyze
+usr/bin/mysqlcheck usr/bin/mysqloptimize
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides b/storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides
new file mode 100644
index 00000000000..d36909f47f2
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides
@@ -0,0 +1,3 @@
+percona-xtradb-client-5.1: package-has-a-duplicate-relation
+percona-xtradb-client-5.1: wrong-name-for-upstream-changelog usr/share/doc/percona-xtradb-client-5.1/changelog.innotop.gz
+percona-xtradb-client-5.1: pkg-not-in-package-test innotop
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.menu b/storage/xtradb/build/debian/percona-xtradb-client-5.1.menu
new file mode 100644
index 00000000000..1378555c423
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.menu
@@ -0,0 +1,3 @@
+# According to /usr/share/menu/ policy 1.4, not /usr/share/doc/debian-policy/
+?package(innotop):needs="text" section="Applications/Data Management"\
+ title="innotop" command="/usr/bin/innotop"
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.dirs b/storage/xtradb/build/debian/percona-xtradb-common.dirs
new file mode 100644
index 00000000000..a5a88ede9c1
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.dirs
@@ -0,0 +1 @@
+etc/mysql/conf.d/
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.files b/storage/xtradb/build/debian/percona-xtradb-common.files
new file mode 100644
index 00000000000..931f37a0237
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.files
@@ -0,0 +1,2 @@
+etc/mysql/my.cnf
+usr/share/percona-xtradb-common/internal-use-only
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides b/storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides
new file mode 100644
index 00000000000..7f58feb498d
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides
@@ -0,0 +1,2 @@
+script-not-executable ./usr/share/percona-xtradb-common/internal-use-only/_etc_init.d_mysql
+script-not-executable ./usr/share/percona-xtradb-common/internal-use-only/_etc_mysql_debian-start
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.postrm b/storage/xtradb/build/debian/percona-xtradb-common.postrm
new file mode 100644
index 00000000000..3ca45870165
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.postrm
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+if [ "$1" = "purge" ]; then
+ rmdir /etc/mysql 2>/dev/null || true
+fi
+
+#DEBHELPER#
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS b/storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS
new file mode 100644
index 00000000000..a3042dc2918
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS
@@ -0,0 +1,34 @@
+mysql-dfsg-5.1 (5.1.38-1) unstable; urgency=low
+
+ * Please read http://dev.mysql.com/doc/refman/5.1/en/upgrading-from-5-0.html
+ * Make sure to do a REPAIR TABLE on all tables that use UTF-8 and have a
+ FULLTEXT index.
+
+ -- Christian Hammers <ch@debian.org> Sat, 4 Jul 2009 02:31:21 +0200
+
+mysql-dfsg-5.0 (5.1.14beta-2) unstable; urgency=low
+
+ * The BerkeleyDB Storage Engine is no longer supported. If the options
+ have-bdb or skip-bdb are found, MySQL will not start. If you have BDB
+ tables, you should change them to use another storage engine before
+ upgrading to 5.1.
+
+ -- Monty Taylor <mordred@inaugust.com> Thu, 18 Jan 2007 12:28:21 -0800
+
+mysql-dfsg-5.0 (5.0.45-2) unstable; urgency=low
+
+ * Binary logging is now disabled by default. If you really need it (e.g. on
+ a replication master), remove the comment from the log_bin line in my.cnf.
+
+ -- Norbert Tretkowski <nobse@debian.org> Sat, 10 Nov 2007 16:26:35 +0100
+
+mysql-dfsg-5.0 (5.0.18-9) unstable; urgency=low
+
+ * Rotation of the binary logs is now configured in /etc/mysql/my.cnf with
+ "expire-logs-days" which defaults to 20 days. The old file
+ /etc/mysql/debian-log-rotate.conf should be removed together with
+ /etc/cron.daily/mysql-server after this value has been adjusted. Note that
+ the old variable defined the number of files whereas the new one defines
+ a time span in days.
+
+ -- Christian Hammers <ch@debian.org> Tue, 24 Jan 2006 22:18:21 +0100
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian b/storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian
new file mode 100644
index 00000000000..741243f1ec3
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian
@@ -0,0 +1,109 @@
+* MYSQL WON'T START OR STOP?:
+=============================
+Never delete the special mysql user "debian-sys-maint". This user, together
+with the credentials in /etc/mysql/debian.cnf, is used by the init scripts to
+stop the server, as they would otherwise require knowledge of the mysql root
+user's password.
+In most cases you can fix the situation by making sure that the debian.cnf
+file contains the right password, e.g. by setting a new one (remember to run
+"flush privileges" afterwards).
+
+* WHAT TO DO AFTER UPGRADES:
+============================
+The privilege tables are updated automatically, so all that is left is to read
+the changelogs on dev.mysql.com and check whether any changes affect custom apps.
+
+* WHAT TO DO AFTER INSTALLATION:
+================================
+The MySQL manual describes certain steps to take at this stage in a separate
+chapter. They are not necessary, as the Debian packages perform them
+automatically.
+
+The only things left for the admin are
+ - setting the passwords
+ - creating new users and databases
+ - reading the rest of this text
+
+* DOWNGRADING TO 4.0 or 4.1:
+============================
+Unsupported. Period.
+But if you do it anyway and run into problems or gain interesting experience,
+mail me; it might help others.
+If you really want to, I would recommend to "mysqldump --opt" all tables,
+then purge 4.1, delete /var/lib/mysql, install 4.0 and insert the dumps. Be
+careful, though, with the "mysql" table: you cannot simply overwrite that
+one, as the password for the mysql "debian-sys-maint" user is stored in
+/etc/mysql/debian.cnf and is needed by /etc/init.d/ to start mysql and check
+whether it is alive.
+
+* SOME APPLICATION CAN NO LONGER CONNECT:
+=========================================
+The application is probably linked against libmysqlclient12 or below and
+somebody has created a mysql user with a new-style password.
+The old_passwords=1 option in /etc/mysql/my.cnf might help. If not, the
+application that created the user has to be changed, or the application that
+tries to connect has to be updated to libmysqlclient14 or -15.
+
+* NETWORKING:
+=============
+For security reasons, the Debian package has enabled networking only on the
+loop-back device using "bind-address" in /etc/mysql/my.cnf. Check with
+"netstat -tlnp" where it is listening. If your connection is aborted
+immediately, see whether "mysqld: all" or similar is in /etc/hosts.allow and
+read hosts_access(5).
+
+* WHERE IS THE DOCUMENTATION?:
+==============================
+Unfortunately, due to licensing restrictions, Debian is currently not able
+to provide the mysql-doc package in any format. For the most up-to-date
+documentation, please go to http://dev.mysql.com/doc.
+
+* PASSWORDS:
+============
+It is strongly recommended to set a password for the mysql root user (which has no password by default), e.g. by executing:
+ /usr/bin/mysql -u root -D mysql -e "update user set password=password('new-password') where user='root'"
+ /usr/bin/mysql -u root -e "flush privileges"
+If you already had a password set, add "-p" before "-u" to the lines above.
+
+
+If you are tired of typing the password every time or want to automate your
+scripts, you can store it in the file $HOME/.my.cnf. It should be chmod 0600
+(-rw------- username username .my.cnf) to ensure that nobody else can read
+it. Every other configuration parameter can be stored there, too. You will
+find an example below and more information in the MySQL manual in
+/usr/share/doc/mysql-doc or on www.mysql.com.
+
+ATTENTION: It is necessary that root's .my.cnf always contains a "user"
+line wherever there is a "password" line; otherwise the Debian maintenance
+scripts, which use /etc/mysql/debian.cnf, will use the username
+"debian-sys-maint" but the password from root's .my.cnf. Also note
+that every change you make in /root/.my.cnf will affect the mysql cron
+script, too.
+
+ # an example of $HOME/.my.cnf
+ [client]
+ user = your-mysql-username
+ password = enter-your-good-new-password-here
+
+* BIG_ROWS FOR EVEN MORE ROWS IN A TABLE:
+=========================================
+If you ever run out of rows in a table, there is the possibility of building
+the package with "-DBIG_ROWS", which, according to a MySQL employee on
+packagers@lists.mysql.com, should lead to a 64-bit row index (presumably > 2^32
+rows) but also to an approx. 5% performance loss.
+
+* BerkeleyDB Storage Engine
+===========================
+Support for BerkeleyDB has been removed in 5.1, and consequently both the
+have-bdb and skip-bdb configuration options will cause the server to fail.
+Removing the options from /etc/mysql/my.cnf will fix this problem.
+
+* FURTHER NOTES ON REPLICATION
+===============================
+If the MySQL server is acting as a replication slave, you should not
+set --tmpdir to point to a directory on a memory-based filesystem or to
+a directory that is cleared when the server host restarts. A replication
+slave needs some of its temporary files to survive a machine restart so
+that it can replicate temporary tables or LOAD DATA INFILE operations. If
+files in the temporary file directory are lost when the server restarts,
+replication fails.
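
A quick way to verify the debian-sys-maint credentials described above is to
reuse the exact invocation the init scripts themselves use:

    # prints "mysqld is alive" when /etc/mysql/debian.cnf holds working credentials
    /usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf ping
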
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.config b/storage/xtradb/build/debian/percona-xtradb-server-5.1.config
new file mode 100644
index 00000000000..75f81c4e4d1
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.config
@@ -0,0 +1,46 @@
+#!/bin/bash -e
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+CNF=/etc/mysql/my.cnf
+
+# Beware that there are two ypwhich variants; one of them needs the 2>/dev/null!
+if test -n "`which ypwhich 2>/dev/null`" && ypwhich >/dev/null 2>&1; then
+ db_input high percona-xtradb-server-5.1/nis_warning || true
+ db_go
+fi
+
+# only ask this question on fresh installs, during "reconfiguration" and when
+# not upgrading from an existing 5.0 installation.
+# there is also an additional check for empty root passwords in the
+# postinst script when the tools are available for us to use.
+if [ "$1" = "configure" ] && ([ -z "$2" ] && [ ! -e "/var/lib/mysql/debian-5.0.flag" ] ) || [ "$1" = "reconfigure" ]; then
+ while :; do
+ RET=""
+ db_input high percona-xtradb-server/root_password || true
+ db_go
+ db_get percona-xtradb-server/root_password
+ # if password isn't empty we ask for password verification
+ if [ -z "$RET" ]; then
+ db_fset percona-xtradb-server/root_password seen false
+ db_fset percona-xtradb-server/root_password_again seen false
+ break
+ fi
+ ROOT_PW="$RET"
+ db_input high percona-xtradb-server/root_password_again || true
+ db_go
+ db_get percona-xtradb-server/root_password_again
+ if [ "$RET" == "$ROOT_PW" ]; then
+ ROOT_PW=''
+ break
+ fi
+ db_fset percona-xtradb-server/password_mismatch seen false
+ db_input critical percona-xtradb-server/password_mismatch
+ db_set percona-xtradb-server/root_password ""
+ db_set percona-xtradb-server/root_password_again ""
+ db_go
+ done
+fi
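
For unattended installs, the debconf questions handled by this config script can
be preseeded before installation. A minimal sketch (the template owner is assumed
to be the package name, and the password values are placeholders):

    echo "percona-xtradb-server-5.1 percona-xtradb-server/root_password password MySecretPw" | debconf-set-selections
    echo "percona-xtradb-server-5.1 percona-xtradb-server/root_password_again password MySecretPw" | debconf-set-selections
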
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs b/storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs
new file mode 100644
index 00000000000..29c2e756a00
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs
@@ -0,0 +1,9 @@
+etc/init.d
+etc/logrotate.d
+etc/mysql/conf.d
+usr/bin
+usr/sbin
+usr/share/man/man8
+usr/share/mysql
+var/run/mysqld
+var/lib/mysql-upgrade
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.docs b/storage/xtradb/build/debian/percona-xtradb-server-5.1.docs
new file mode 100644
index 00000000000..eccf2c9c565
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.docs
@@ -0,0 +1 @@
+EXCEPTIONS-CLIENT
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.files b/storage/xtradb/build/debian/percona-xtradb-server-5.1.files
new file mode 100644
index 00000000000..4741b588136
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.files
@@ -0,0 +1,53 @@
+usr/lib/mysql/*so*
+etc/mysql/debian-start
+etc/mysql/conf.d/mysqld_safe_syslog.cnf
+usr/bin/msql2mysql
+usr/bin/my_print_defaults
+usr/bin/myisamchk
+usr/bin/myisamlog
+usr/bin/myisampack
+usr/bin/mysql_convert_table_format
+usr/bin/mysql_fix_privilege_tables
+usr/bin/mysql_install_db
+usr/bin/mysql_secure_installation
+usr/bin/mysql_setpermission
+usr/bin/mysql_tzinfo_to_sql
+usr/bin/mysql_upgrade
+usr/bin/mysql_zap
+usr/bin/mysqlbinlog
+usr/bin/mysqld_multi
+usr/bin/mysqld_safe
+usr/bin/mysqlhotcopy
+usr/bin/mysqltest
+usr/bin/perror
+usr/bin/replace
+usr/bin/resolve_stack_dump
+usr/bin/resolveip
+usr/sbin/mysqld
+usr/share/doc/percona-xtradb-server-5.1/
+usr/share/lintian/overrides/percona-xtradb-server-5.1
+usr/share/man/man1/msql2mysql.1
+usr/share/man/man1/myisamchk.1
+usr/share/man/man1/myisamlog.1
+usr/share/man/man1/myisampack.1
+usr/share/man/man1/my_print_defaults.1
+usr/share/man/man1/mysqlbinlog.1
+usr/share/man/man1/mysql_convert_table_format.1
+usr/share/man/man1/mysqld_multi.1
+usr/share/man/man1/mysqld_safe.1
+usr/share/man/man1/mysql_fix_privilege_tables.1
+usr/share/man/man1/mysqlhotcopy.1
+usr/share/man/man1/mysql_install_db.1
+usr/share/man/man1/mysql_secure_installation.1
+usr/share/man/man1/mysql_setpermission.1
+usr/share/man/man1/mysql_upgrade.1
+usr/share/man/man1/mysqltest.1
+usr/share/man/man1/mysql_zap.1
+usr/share/man/man1/perror.1
+usr/share/man/man1/replace.1
+usr/share/man/man1/resolveip.1
+usr/share/man/man1/resolve_stack_dump.1
+usr/share/man/man1/innochecksum.1
+usr/share/man/man1/mysql_tzinfo_to_sql.1
+usr/share/man/man8/mysqld.8
+usr/share/mysql/
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.links b/storage/xtradb/build/debian/percona-xtradb-server-5.1.links
new file mode 100644
index 00000000000..082680fe5ed
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.links
@@ -0,0 +1,2 @@
+usr/share/mysql/mysql-test/mysql-test-run.pl usr/share/mysql/mysql-test/mysql-test-run
+usr/share/mysql/mysql-test/mysql-test-run.pl usr/share/mysql/mysql-test/mtr
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides b/storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides
new file mode 100644
index 00000000000..a3ffb465fd5
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides
@@ -0,0 +1,4 @@
+percona-xtradb-server-5.1: possible-bashism-in-maintainer-script postinst:81 'p{("a".."z","A".."Z",0..9)[int(rand(62))]}'
+percona-xtradb-server-5.1: possible-bashism-in-maintainer-script preinst:33 '${cmd/ */}'
+percona-xtradb-server-5.1: statically-linked-binary ./usr/bin/mysql_tzinfo_to_sql
+percona-xtradb-server-5.1: statically-linked-binary ./usr/sbin/mysqld
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid
new file mode 100644
index 00000000000..00cc5c3e29d
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid
@@ -0,0 +1,9 @@
+/etc/init.d/mysql\[[0-9]+\]: Check that mysqld is running and that the socket: '/var/run/mysqld/mysqld.sock' exists\!$
+/etc/init.d/mysql\[[0-9]+\]: '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/mysql/debian-start\[[0-9]+\]: Checking for crashed MySQL tables\.$
+mysqld\[[0-9]+\]: $
+mysqld\[[0-9]+\]: Version: .* socket: '/var/run/mysqld/mysqld.sock' port: 3306$
+mysqld\[[0-9]+\]: Warning: Ignoring user change to 'mysql' because the user was set to 'mysql' earlier on the command line$
+mysqld_safe\[[0-9]+\]: started$
+usermod\[[0-9]+\]: change user `mysql' GID from `([0-9]+)' to `\1'$
+usermod\[[0-9]+\]: change user `mysql' shell from `/bin/false' to `/bin/false'$
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server
new file mode 100644
index 00000000000..37f25cb01ea
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server
@@ -0,0 +1,32 @@
+/etc/init.d/mysql\[[0-9]+\]: [0-9]+ processes alive and '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/init.d/mysql\[[0-9]+\]: Check that mysqld is running and that the socket: '/var/run/mysqld/mysqld.sock' exists\!$
+/etc/init.d/mysql\[[0-9]+\]: '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/mysql/debian-start\[[0-9]+\]: Checking for crashed MySQL tables\.$
+mysqld\[[0-9]+\]: ?$
+mysqld\[[0-9]+\]: .*InnoDB: Shutdown completed
+mysqld\[[0-9]+\]: .*InnoDB: Started;
+mysqld\[[0-9]+\]: .*InnoDB: Starting shutdown\.\.\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Normal shutdown$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Shutdown complete$
+mysqld\[[0-9]+\]: /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*/usr/sbin/mysqld: Shutdown Complete$
+mysqld\[[0-9]+\]: Version: .* socket
+mysqld\[[0-9]+\]: Warning: Ignoring user change to 'mysql' because the user was set to 'mysql' earlier on the command line$
+mysqld_safe\[[0-9]+\]: ?$
+mysqld_safe\[[0-9]+\]: able to use the new GRANT command!$
+mysqld_safe\[[0-9]+\]: ended$
+mysqld_safe\[[0-9]+\]: http://www.mysql.com$
+mysqld_safe\[[0-9]+\]: NOTE: If you are upgrading from a MySQL <= 3.22.10 you should run$
+mysqld_safe\[[0-9]+\]: PLEASE REMEMBER TO SET A PASSWORD FOR THE MySQL root USER !$
+mysqld_safe\[[0-9]+\]: Please report any problems with the /usr/bin/mysqlbug script!$
+mysqld_safe\[[0-9]+\]: See the manual for more instructions.$
+mysqld_safe\[[0-9]+\]: started$
+mysqld_safe\[[0-9]+\]: Support MySQL by buying support/licenses at https://order.mysql.com$
+mysqld_safe\[[0-9]+\]: The latest information about MySQL is available on the web at$
+mysqld_safe\[[0-9]+\]: the /usr/bin/mysql_fix_privilege_tables. Otherwise you will not be$
+mysqld_safe\[[0-9]+\]: To do so, start the server, then issue the following commands:$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root -h app109 password 'new-password'$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root password 'new-password'$
+usermod\[[0-9]+\]: change user `mysql' GID from `([0-9]+)' to `\1'$
+usermod\[[0-9]+\]: change user `mysql' shell from `/bin/false' to `/bin/false'$
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation
new file mode 100644
index 00000000000..37f25cb01ea
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation
@@ -0,0 +1,32 @@
+/etc/init.d/mysql\[[0-9]+\]: [0-9]+ processes alive and '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/init.d/mysql\[[0-9]+\]: Check that mysqld is running and that the socket: '/var/run/mysqld/mysqld.sock' exists\!$
+/etc/init.d/mysql\[[0-9]+\]: '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/mysql/debian-start\[[0-9]+\]: Checking for crashed MySQL tables\.$
+mysqld\[[0-9]+\]: ?$
+mysqld\[[0-9]+\]: .*InnoDB: Shutdown completed
+mysqld\[[0-9]+\]: .*InnoDB: Started;
+mysqld\[[0-9]+\]: .*InnoDB: Starting shutdown\.\.\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Normal shutdown$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Shutdown complete$
+mysqld\[[0-9]+\]: /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*/usr/sbin/mysqld: Shutdown Complete$
+mysqld\[[0-9]+\]: Version: .* socket
+mysqld\[[0-9]+\]: Warning: Ignoring user change to 'mysql' because the user was set to 'mysql' earlier on the command line$
+mysqld_safe\[[0-9]+\]: ?$
+mysqld_safe\[[0-9]+\]: able to use the new GRANT command!$
+mysqld_safe\[[0-9]+\]: ended$
+mysqld_safe\[[0-9]+\]: http://www.mysql.com$
+mysqld_safe\[[0-9]+\]: NOTE: If you are upgrading from a MySQL <= 3.22.10 you should run$
+mysqld_safe\[[0-9]+\]: PLEASE REMEMBER TO SET A PASSWORD FOR THE MySQL root USER !$
+mysqld_safe\[[0-9]+\]: Please report any problems with the /usr/bin/mysqlbug script!$
+mysqld_safe\[[0-9]+\]: See the manual for more instructions.$
+mysqld_safe\[[0-9]+\]: started$
+mysqld_safe\[[0-9]+\]: Support MySQL by buying support/licenses at https://order.mysql.com$
+mysqld_safe\[[0-9]+\]: The latest information about MySQL is available on the web at$
+mysqld_safe\[[0-9]+\]: the /usr/bin/mysql_fix_privilege_tables. Otherwise you will not be$
+mysqld_safe\[[0-9]+\]: To do so, start the server, then issue the following commands:$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root -h app109 password 'new-password'$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root password 'new-password'$
+usermod\[[0-9]+\]: change user `mysql' GID from `([0-9]+)' to `\1'$
+usermod\[[0-9]+\]: change user `mysql' shell from `/bin/false' to `/bin/false'$
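
These logcheck ignore rules are plain extended regular expressions, so a candidate
syslog line can be checked against the installed rule file directly. A small
sketch (the path under /etc/logcheck/ is an assumption about where the file ends
up after installation):

    # if the line matches any rule it will be suppressed from logcheck reports
    echo "mysqld_safe[1234]: started" | \
        grep -E -f /etc/logcheck/ignore.d.server/percona-xtradb-server-5.1 && echo ignored
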
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init b/storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init
new file mode 100644
index 00000000000..0e0d4f9987f
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init
@@ -0,0 +1,182 @@
+#!/bin/bash
+#
+### BEGIN INIT INFO
+# Provides: mysql
+# Required-Start: $remote_fs $syslog
+# Required-Stop: $remote_fs $syslog
+# Should-Start: $network $named $time
+# Should-Stop: $network $named $time
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Start and stop the mysql database server daemon
+# Description: Controls the main MySQL database server daemon "mysqld"
+# and its wrapper script "mysqld_safe".
+### END INIT INFO
+#
+set -e
+set -u
+${DEBIAN_SCRIPT_DEBUG:+ set -v -x}
+
+test -x /usr/sbin/mysqld || exit 0
+
+. /lib/lsb/init-functions
+
+SELF=$(cd $(dirname $0); pwd -P)/$(basename $0)
+CONF=/etc/mysql/my.cnf
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+
+# priority can be overridden and "-s" adds output to stderr
+ERR_LOGGER="logger -p daemon.err -t /etc/init.d/mysql -i"
+
+# Safeguard (relative paths, core dumps..)
+cd /
+umask 077
+
+# mysqladmin likes to read /root/.my.cnf. This is usually not what I want,
+# as many admins e.g. only store a password without a username there and
+# thereby break my scripts.
+export HOME=/etc/mysql/
+
+## Fetch a particular option from mysql's invocation.
+#
+# Usage: void mysqld_get_param option
+mysqld_get_param() {
+ /usr/sbin/mysqld --print-defaults \
+ | tr " " "\n" \
+ | grep -- "--$1" \
+ | tail -n 1 \
+ | cut -d= -f2
+}
+
+## Do some sanity checks before even trying to start mysqld.
+sanity_checks() {
+ # check for config file
+ if [ ! -r /etc/mysql/my.cnf ]; then
+ log_warning_msg "$0: WARNING: /etc/mysql/my.cnf cannot be read. See README.Debian.gz"
+ echo "WARNING: /etc/mysql/my.cnf cannot be read. See README.Debian.gz" | $ERR_LOGGER
+ fi
+
+ # check for diskspace shortage
+ datadir=`mysqld_get_param datadir`
+ if LC_ALL=C BLOCKSIZE= df --portability $datadir/. | tail -n 1 | awk '{ exit ($4>4096) }'; then
+ log_failure_msg "$0: ERROR: The partition with $datadir is too full!"
+ echo "ERROR: The partition with $datadir is too full!" | $ERR_LOGGER
+ exit 1
+ fi
+}
+
+## Checks if there is a server running and if so if it is accessible.
+#
+# check_alive insists on a pingable server
+# check_dead also fails if there is a lost mysqld in the process list
+#
+# Usage: boolean mysqld_status [check_alive|check_dead] [warn|nowarn]
+mysqld_status () {
+ ping_output=`$MYADMIN ping 2>&1`; ping_alive=$(( ! $? ))
+
+ ps_alive=0
+ pidfile=`mysqld_get_param pid-file`
+ if [ -f "$pidfile" ] && ps `cat $pidfile` >/dev/null 2>&1; then ps_alive=1; fi
+
+ if [ "$1" = "check_alive" -a $ping_alive = 1 ] ||
+ [ "$1" = "check_dead" -a $ping_alive = 0 -a $ps_alive = 0 ]; then
+ return 0 # EXIT_SUCCESS
+ else
+ if [ "$2" = "warn" ]; then
+ echo -e "$ps_alive processes alive and '$MYADMIN ping' resulted in\n$ping_output\n" | $ERR_LOGGER -p daemon.debug
+ fi
+ return 1 # EXIT_FAILURE
+ fi
+}
+
+#
+# main()
+#
+
+case "${1:-''}" in
+ 'start')
+ sanity_checks;
+ # Start daemon
+ log_daemon_msg "Starting MySQL database server" "mysqld"
+ if mysqld_status check_alive nowarn; then
+ log_progress_msg "already running"
+ log_end_msg 0
+ else
+ /usr/bin/mysqld_safe > /dev/null 2>&1 &
+ # 6s was reported in #352070 to be too few when using ndbcluster
+ for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
+ sleep 1
+ if mysqld_status check_alive nowarn ; then break; fi
+ log_progress_msg "."
+ done
+ if mysqld_status check_alive warn; then
+ log_end_msg 0
+ # Now start mysqlcheck or whatever the admin wants.
+ output=$(/etc/mysql/debian-start)
+ [ -n "$output" ] && log_action_msg "$output"
+ else
+ log_end_msg 1
+ log_failure_msg "Please take a look at the syslog"
+ fi
+ fi
+ ;;
+
+ 'stop')
+ # * As a passwordless mysqladmin (e.g. via ~/.my.cnf) must be possible
+ # at least for cron, we can rely on it here, too. (although we have
+ # to specify it explicit as e.g. sudo environments points to the normal
+ # users home and not /root)
+ log_daemon_msg "Stopping MySQL database server" "mysqld"
+ if ! mysqld_status check_dead nowarn; then
+ set +e
+ shutdown_out=`$MYADMIN shutdown 2>&1`; r=$?
+ set -e
+ if [ "$r" -ne 0 ]; then
+ log_end_msg 1
+ [ "$VERBOSE" != "no" ] && log_failure_msg "Error: $shutdown_out"
+ log_daemon_msg "Killing MySQL database server by signal" "mysqld"
+ killall -15 mysqld
+ server_down=
+ for i in 1 2 3 4 5 6 7 8 9 10; do
+ sleep 1
+ if mysqld_status check_dead nowarn; then server_down=1; break; fi
+ done
+ if test -z "$server_down"; then killall -9 mysqld; fi
+ fi
+ fi
+
+ if ! mysqld_status check_dead warn; then
+ log_end_msg 1
+ log_failure_msg "Please stop MySQL manually and read /usr/share/doc/percona-xtradb-server-5.1/README.Debian.gz!"
+ exit -1
+ else
+ log_end_msg 0
+ fi
+ ;;
+
+ 'restart')
+ set +e; $SELF stop; set -e
+ $SELF start
+ ;;
+
+ 'reload'|'force-reload')
+ log_daemon_msg "Reloading MySQL database server" "mysqld"
+ $MYADMIN reload
+ log_end_msg 0
+ ;;
+
+ 'status')
+ if mysqld_status check_alive nowarn; then
+ log_action_msg "$($MYADMIN version)"
+ else
+ log_action_msg "MySQL is stopped."
+ exit 3
+ fi
+ ;;
+
+ *)
+ echo "Usage: $SELF start|stop|restart|reload|force-reload|status"
+ exit 1
+ ;;
+esac
+
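
Following LSB convention, the 'status' action above exits 0 when mysqld answers a
ping and 3 when the server is stopped, so other scripts can branch on it; a brief
sketch:

    # exit status: 0 if mysqld answers a ping, 3 if it is stopped
    if /etc/init.d/mysql status >/dev/null 2>&1; then
        echo "mysqld is up"
    else
        echo "mysqld is down"
    fi
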
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate b/storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate
new file mode 100644
index 00000000000..0f0de516b13
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate
@@ -0,0 +1,27 @@
+# - I put everything in one block and added sharedscripts, so that mysql gets
+# flush-logs'd only once.
+#   Otherwise the binary logs would get flushed (and thus rotated) n times every day.
+# - The error log is obsolete, messages go to syslog now.
+/var/log/mysql.log /var/log/mysql/mysql.log /var/log/mysql/mysql-slow.log {
+ daily
+ rotate 7
+ missingok
+ create 640 mysql adm
+ compress
+ sharedscripts
+ postrotate
+ test -x /usr/bin/mysqladmin || exit 0
+
+ # If this fails, check debian.conf!
+ MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+ if [ -z "`$MYADMIN ping 2>/dev/null`" ]; then
+ # Really no mysqld or rather a missing debian-sys-maint user?
+ # If this occurs and is not a error please report a bug.
+ if ps cax | grep -q mysqld; then
+ exit 1
+ fi
+ else
+ $MYADMIN flush-logs
+ fi
+ endscript
+}
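
To check this rotation block without touching the live logs, logrotate can be run
in debug mode against the installed fragment; a short sketch (the path under
/etc/logrotate.d/ is an assumption about the installed filename):

    # debug/dry-run: shows what would be rotated and which scripts would fire
    logrotate -d /etc/logrotate.d/percona-xtradb-server
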
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst
new file mode 100644
index 00000000000..24a3f37c646
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst
@@ -0,0 +1,277 @@
+#!/bin/bash -e
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+export PATH=$PATH:/sbin:/usr/sbin:/bin:/usr/bin
+
+# This command can be used as pipe to syslog. With "-s" it also logs to stderr.
+ERR_LOGGER="logger -p daemon.err -t mysqld_safe -i"
+
+invoke() {
+ if [ -x /usr/sbin/invoke-rc.d ]; then
+ invoke-rc.d mysql $1
+ else
+ /etc/init.d/mysql $1
+ fi
+}
+
+MYSQL_BOOTSTRAP="/usr/sbin/mysqld --bootstrap --user=mysql --skip-grant-tables"
+
+test_mysql_access() {
+ mysql --no-defaults -u root -h localhost </dev/null >/dev/null 2>&1
+}
+
+# call with $1 = "online" to connect to the server, otherwise it bootstraps
+set_mysql_rootpw() {
+ # forget we ever saw the password. don't use reset to keep the seen status
+ db_set percona-xtradb-server/root_password ""
+
+ tfile=`mktemp`
+ if [ ! -f "$tfile" ]; then
+ return 1
+ fi
+
+ # this avoids us having to call "test" or "[" on $rootpw
+ cat << EOF > $tfile
+USE mysql;
+UPDATE user SET password=PASSWORD("$rootpw") WHERE user='root';
+FLUSH PRIVILEGES;
+EOF
+ if grep -q 'PASSWORD("")' $tfile; then
+ retval=0
+ elif [ "$1" = "online" ]; then
+ mysql --no-defaults -u root -h localhost <$tfile >/dev/null
+ retval=$?
+ else
+ $MYSQL_BOOTSTRAP <$tfile
+ retval=$?
+ fi
+ rm -f $tfile
+ return $retval
+}
+
+# This is necessary because mysql_install_db removes the pid file in /var/run
+# and because changed configuration options should take effect immediately.
+# In case the server wasn't running at all it should be ok if the stop
+# script fails. I can't tell at this point because of the cleaned /var/run.
+set +e; invoke stop; set -e
+
+case "$1" in
+ configure)
+ mysql_datadir=/usr/share/mysql
+ mysql_statedir=/var/lib/mysql
+ mysql_rundir=/var/run/mysqld
+ mysql_logdir=/var/log
+ mysql_cfgdir=/etc/mysql
+ mysql_newlogdir=/var/log/mysql
+ mysql_upgradedir=/var/lib/mysql-upgrade
+
+ # first things first, if the following symlink exists, it is a preserved
+ # copy the old data dir from a mysql upgrade that would have otherwise
+ # been replaced by an empty mysql dir. this should restore it.
+ for dir in DATADIR LOGDIR; do
+ if [ "$dir" = "DATADIR" ]; then targetdir=$mysql_statedir; else targetdir=$mysql_newlogdir; fi
+ savelink="$mysql_upgradedir/$dir.link"
+ if [ -L "$savelink" ]; then
+ # If the targetdir was a symlink before we upgraded it is supposed
+ # to be either still be present or not existing anymore now.
+ if [ -L "$targetdir" ]; then
+ rm "$savelink"
+ elif [ ! -d "$targetdir" ]; then
+ mv "$savelink" "$targetdir"
+ else
+ # this should never even happen, but just in case...
+ mysql_tmp=`mktemp -d -t mysql-symlink-restore-XXXXXX`
+ echo "this is very strange! see $mysql_tmp/README..." >&2
+ mv "$targetdir" "$mysql_tmp"
+ cat << EOF > "$mysql_tmp/README"
+
+if you're reading this, it's most likely because you had replaced /var/lib/mysql
+with a symlink, then upgraded to a new version of mysql, and then dpkg
+removed your symlink (see #182747 and others). the mysql packages noticed
+that this happened, and as a workaround have restored it. however, because
+/var/lib/mysql seems to have been re-created in the meantime, and because
+we don't want to rm -rf something we don't know as much about, we're going
+to leave this unexpected directory here. if your database looks normal,
+and this is not a symlink to your database, you should be able to blow
+this all away.
+
+EOF
+ fi
+ fi
+ rmdir $mysql_upgradedir 2>/dev/null || true
+ done
+
+ # Ensure the existence and right permissions for the database and
+ # log files.
+ if [ ! -d "$mysql_statedir" -a ! -L "$mysql_statedir" ]; then mkdir "$mysql_statedir"; fi
+ if [ ! -d "$mysql_statedir/mysql" -a ! -L "$mysql_statedir/mysql" ]; then mkdir "$mysql_statedir/mysql"; fi
+ if [ ! -d "$mysql_newlogdir" -a ! -L "$mysql_newlogdir" ]; then mkdir "$mysql_newlogdir"; fi
+    # When creating an ext3 journal on an already mounted filesystem like e.g.
+    # /var/lib/mysql, you get a .journal file that is not modifiable by chown.
+ # The mysql_datadir must not be writable by the mysql user under any
+ # circumstances as it contains scripts that are executed by root.
+ set +e
+ chown -R 0:0 $mysql_datadir
+ chown -R mysql $mysql_statedir
+ chown -R mysql $mysql_rundir
+ chown -R mysql:adm $mysql_newlogdir; chmod 2750 $mysql_newlogdir;
+ for i in log err; do
+ touch $mysql_logdir/mysql.$i
+ chown mysql:adm $mysql_logdir/mysql.$i
+ chmod 0640 $mysql_logdir/mysql.$i
+ done
+ set -e
+
+    # This is important to avoid data loss when there is a removed
+    # percona-xtradb-server version from Woody lying around which used the same
+    # data directory and then at some point gets purged by the admin.
+ db_set percona-xtradb-server/postrm_remove_database false || true
+
+ # To avoid downgrades.
+ touch $mysql_statedir/debian-5.1.flag
+
+ # initiate databases. Output is not allowed by debconf :-(
+ # Debian: beware of the bashisms...
+ # Debian: can safely run on upgrades with existing databases
+ set +e
+ /bin/bash /usr/bin/mysql_install_db --rpm 2>&1 | $ERR_LOGGER
+ if [ "$?" != "0" ]; then
+      echo "ATTENTION: An error has occurred. More info is in the syslog!"
+ fi
+ set -e
+
+ ## On every reconfiguration the maintenance user is recreated.
+ #
+ # - It is easier to regenerate the password every time but as people
+ # use fancy rsync scripts and file alteration monitors, the existing
+ # password is used and existing files not touched.
+ # - The mysqld statement is like that in mysql_install_db because the
+ # server is not already running. This has some implications:
+ # - The amount of newlines and semicolons in the query is important!
+ # - GRANT is not possible with --skip-grant-tables and "INSERT
+ # (user,host..) VALUES" is not --ansi compliant
+    # - The echo is just for readability. ash's builtin has no "-e" so use /bin/echo.
+ # - The Super_priv, Show_db_priv, Create_tmp_table_priv and Lock_tables_priv
+ # may not be present as old Woody 3.23 databases did not have it and the
+ # admin might not already have run mysql_upgrade which adds them.
+    #   As the binlog cron scripts need at least the Super_priv, I first run
+    #   the old query, which always succeeds, and then the new one, which may or may not.
+
+ # recreate the credentials file if not present or without mysql_upgrade stanza
+ dc=$mysql_cfgdir/debian.cnf;
+ if [ -e "$dc" -a -n "`fgrep mysql_upgrade $dc 2>/dev/null`" ]; then
+ pass="`sed -n 's/^[ ]*password *= *// p' $dc | head -n 1`"
+ else
+ pass=`perl -e 'print map{("a".."z","A".."Z",0..9)[int(rand(62))]}(1..16)'`;
+ if [ ! -d "$mysql_cfgdir" ]; then install -o 0 -g 0 -m 0755 -d $mysql_cfgdir; fi
+ cat /dev/null > $dc
+ echo "# Automatically generated for Debian scripts. DO NOT TOUCH!" >>$dc
+ echo "[client]" >>$dc
+ echo "host = localhost" >>$dc
+ echo "user = debian-sys-maint" >>$dc
+ echo "password = $pass" >>$dc
+ echo "socket = $mysql_rundir/mysqld.sock" >>$dc
+ echo "[mysql_upgrade]" >>$dc
+ echo "host = localhost" >>$dc
+ echo "user = debian-sys-maint" >>$dc
+ echo "password = $pass" >>$dc
+ echo "socket = $mysql_rundir/mysqld.sock" >>$dc
+ echo "basedir = /usr" >>$dc
+ fi
+ # If this dir chmod go+w then the admin did it. But this file should not.
+ chown 0:0 $dc
+ chmod 0600 $dc
+
+ # update privilege tables
+ password_column_fix_query=`/bin/echo -e \
+ "USE mysql\n" \
+ "ALTER TABLE user CHANGE Password Password char(41) character set latin1 collate latin1_bin DEFAULT '' NOT NULL"`;
+ replace_query=`/bin/echo -e \
+ "USE mysql\n" \
+ "REPLACE INTO user SET " \
+ " host='localhost', user='debian-sys-maint', password=password('$pass'), " \
+ " Select_priv='Y', Insert_priv='Y', Update_priv='Y', Delete_priv='Y', " \
+ " Create_priv='Y', Drop_priv='Y', Reload_priv='Y', Shutdown_priv='Y', " \
+ " Process_priv='Y', File_priv='Y', Grant_priv='Y', References_priv='Y', " \
+ " Index_priv='Y', Alter_priv='Y', Super_priv='Y', Show_db_priv='Y', "\
+ " Create_tmp_table_priv='Y', Lock_tables_priv='Y', Execute_priv='Y', "\
+ " Repl_slave_priv='Y', Repl_client_priv='Y', Create_view_priv='Y', "\
+ " Show_view_priv='Y', Create_routine_priv='Y', Alter_routine_priv='Y', "\
+ " Create_user_priv='Y', Event_priv='Y', Trigger_priv='Y' "`;
+ fix_privs=`/bin/echo -e \
+ "USE mysql;\n" \
+ "ALTER TABLE user ADD column Create_view_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+ "ALTER TABLE user ADD column Show_view_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+ "ALTER TABLE user ADD column Create_routine_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+ "ALTER TABLE user ADD column Alter_routine_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+ "ALTER TABLE user ADD column Create_user_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+ "ALTER TABLE user ADD column Event_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+ "ALTER TABLE user ADD column Trigger_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " `
+ # Engines supported by etch should be installed per default. The query sequence is supposed
+ # to be aborted if the CREATE TABLE fails due to an already existent table in which case the
+ # admin might already have chosen to remove one or more plugins. Newlines are necessary.
+ install_plugins=`/bin/echo -e \
+ "USE mysql;\n" \
+ "CREATE TABLE plugin (name char(64) COLLATE utf8_bin NOT NULL DEFAULT '', " \
+ " dl char(128) COLLATE utf8_bin NOT NULL DEFAULT '', " \
+ " PRIMARY KEY (name)) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='MySQL plugins';\n" \
+ "INSERT INTO plugin VALUES ('innodb', 'ha_innodb.so');\n" \
+ "INSERT INTO plugin VALUES ('federated', 'ha_federated.so');\n" \
+ "INSERT INTO plugin VALUES ('blackhole', 'ha_blackhole.so');\n" \
+ "INSERT INTO plugin VALUES ('archive', 'ha_archive.so');" `
+
+ # Upgrade password column format before the root password gets set.
+ echo "$password_column_fix_query" | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+
+ db_get percona-xtradb-server/root_password && rootpw="$RET"
+ if ! set_mysql_rootpw; then
+ password_error="yes"
+ fi
+
+ echo "$fix_privs" | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+ echo "$replace_query" | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+ set +e
+ echo "$install_plugins" | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+ set -e
+ ;;
+
+ abort-upgrade|abort-remove|abort-configure)
+ ;;
+
+ *)
+ echo "postinst called with unknown argument '$1'" 1>&2
+ exit 1
+ ;;
+esac
+
+# here we check to see if we can connect as root without a password
+# this should catch upgrades from previous versions where the root
+# password wasn't set. if there is a password, or if the connection
+# fails for any other reason, nothing happens.
+if [ "$1" = "configure" ]; then
+ if test_mysql_access; then
+ db_input medium percona-xtradb-server/root_password || true
+ db_go
+ db_get percona-xtradb-server/root_password && rootpw="$RET"
+
+ if ! set_mysql_rootpw "online"; then
+ password_error="yes"
+ fi
+ fi
+
+ if [ "$password_error" = "yes" ]; then
+ db_input high percona-xtradb-server/error_setting_password || true
+ db_go
+ fi
+
+fi
+
+db_stop # in case invoke fails
+
+#DEBHELPER#
+
+exit 0
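
The privilege fixups above all rely on the same bootstrap pattern: SQL is piped
into a temporary, grant-check-free mysqld rather than a running server. A
stripped-down sketch of that pattern (the statement is only an example; note the
newline handling the comments above warn about):

    # feed SQL straight into the bootstrap server; no network, no grant checks
    /bin/echo -e "USE mysql;\nFLUSH PRIVILEGES;" | \
        /usr/sbin/mysqld --bootstrap --user=mysql --skip-grant-tables
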
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm
new file mode 100644
index 00000000000..083a42bd861
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm
@@ -0,0 +1,83 @@
+#!/bin/bash -e
+
+# It is possible that Debconf has already been removed, too.
+if [ -f /usr/share/debconf/confmodule ]; then
+ . /usr/share/debconf/confmodule
+fi
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
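+# ${VAR:+word} expands to word only when VAR is set, so the line below prints
+# the debug banner only when tracing has been enabled above.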
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+
+# Try to stop the server in a sane way. If it does not succeed, let the admin
+# do it himself. No database directories should be removed while the server
+# is running!
+stop_server() {
+ set +e
+ if [ -x /usr/sbin/invoke-rc.d ]; then
+ invoke-rc.d mysql stop
+ else
+ /etc/init.d/mysql stop
+ fi
+ errno=$?
+ set -e
+
+  # check the saved exit status; $? no longer holds the init script's exit code here
+  if [ "$errno" != 0 ]; then
+    echo "Trying to stop the MySQL server resulted in exitcode $errno." 1>&2
+ echo "Stop it yourself and try again!" 1>&2
+ exit 1
+ fi
+}
+
+case "$1" in
+ purge|remove|upgrade|failed-upgrade|abort-install|abort-upgrade|disappear)
+ if [ -n "`$MYADMIN ping 2>/dev/null`" ]; then
+ stop_server
+ sleep 2
+ fi
+ ;;
+ *)
+ echo "postrm called with unknown argument '$1'" 1>&2
+ exit 1
+ ;;
+esac
+
+#
+# - Do NOT purge logs or data if another percona-xtradb-server* package is installed (#307473)
+# - Remove the mysql user only after all the files it owns have been purged.
+#
+if [ "$1" = "purge" -a ! \( -x /usr/sbin/mysqld -o -L /usr/sbin/mysqld \) ]; then
+  # we remove the mysql user only after all the files it owns have been purged
+ rm -f /var/log/mysql.{log,err}{,.0,.[1234567].gz}
+ rm -rf /var/log/mysql
+
+ db_input high percona-xtradb-server-5.1/postrm_remove_databases || true
+ db_go || true
+ db_get percona-xtradb-server-5.1/postrm_remove_databases || true
+ if [ "$RET" = "true" ]; then
+    # never remove the debian.cnf while the databases still exist,
+    # else we run into big trouble on the next install!
+ rm -f /etc/mysql/debian.cnf
+ rm -rf /var/lib/mysql
+ rm -rf /var/run/mysqld
+ userdel mysql || true
+ fi
+
+ # (normally) Automatically added by dh_installinit
+ if [ "$1" = "purge" ] ; then
+ update-rc.d mysql remove >/dev/null || exit 0
+ fi
+ # (normally) End automatically added section
+fi
+
+# (normally) Automatically added by dh_installdebconf
+if [ "$1" = purge ] && [ -e /usr/share/debconf/confmodule ]; then
+ . /usr/share/debconf/confmodule
+ db_purge
+fi
+# (normally) End automatically added section
+
+# no DEBHELPER here, "update-rc.d remove" fails if percona-xtradb-server-5.1 is installed
+
+exit 0
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst b/storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst
new file mode 100644
index 00000000000..a338e4edd8f
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst
@@ -0,0 +1,186 @@
+#!/bin/bash -e
+#
+# summary of how this script can be called:
+# * <new-preinst> install
+# * <new-preinst> install <old-version>
+# * <new-preinst> upgrade <old-version>
+# * <old-preinst> abort-upgrade <new-version>
+#
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+export PATH=$PATH:/sbin:/usr/sbin:/bin:/usr/bin
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+DATADIR=/var/lib/mysql
+LOGDIR=/var/log/mysql
+UPGRADEDIR=/var/lib/mysql-upgrade
+
+# Try to stop the server in a sane way. If it does not succeed, let the admin
+# do it himself. No database directories should be removed while the server
+# is running! Another mysqld in e.g. a different chroot is fine for us.
+stop_server() {
+ if [ ! -x /etc/init.d/mysql ]; then return; fi
+
+ set +e
+ if [ -x /usr/sbin/invoke-rc.d ]; then
+ cmd="invoke-rc.d mysql stop"
+ else
+ cmd="/etc/init.d/mysql stop"
+ fi
+ $cmd
+ errno=$?
+ set -e
+
+ # 0=ok, 100=no init script (fresh install)
+ if [ "$errno" != 0 -a "$errno" != 100 ]; then
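+    # ${cmd/ */} strips everything from the first space, leaving just the command name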
+ echo "${cmd/ */} returned $errno" 1>&2
+ echo "There is a MySQL server running, but we failed in our attempts to stop it." 1>&2
+ echo "Stop it yourself and try again!" 1>&2
+ db_stop
+ exit 1
+ fi
+}
+
+################################ main() ##########################
+
+this_version=5.1
+
+# Check kernel version
+if dpkg --compare-versions `uname -r` lt 2.6; then
+ /bin/echo -e "\nPROBLEM: MySQL-5.x is currently incompatible with kernel 2.4. Aborting.";
+ /bin/echo -e "See http://bugs.debian.org/416841 for more information.\n"
+ exit 1
+fi
+
+# Abort if an NDB cluster is in use.
+if egrep -q -r '^[^#]*ndb.connectstring' /etc/mysql/; then
+ db_fset percona-xtradb-server/no_upgrade_when_using_ndb seen false || true
+ db_input high percona-xtradb-server/no_upgrade_when_using_ndb || true
+ db_go
+ db_stop
+ exit 1
+fi
+
+# Save the user from stupidities.
+show_downgrade_warning=0
+for i in `ls $DATADIR/debian-*.flag 2>/dev/null`; do
+ found_version=`echo $i | sed 's/.*debian-\([0-9\.]\+\).flag/\1/'`
+ if dpkg --compare-versions "$this_version" '<<' "$found_version"; then
+ show_downgrade_warning=1
+ break;
+ fi
+done
+if [ "$show_downgrade_warning" = 1 ]; then
+ db_fset percona-xtradb-server-$this_version/really_downgrade seen false || true
+ db_input medium percona-xtradb-server-$this_version/really_downgrade || true
+ db_go
+ db_get percona-xtradb-server-$this_version/really_downgrade || true
+ if [ "$RET" = "true" ]; then
+ rm -f $DATADIR/debian-*.flag
+ touch $DATADIR/debian-$this_version.flag
+ else
+ echo "Aborting downgrade from (at least) $found_version to $this_version." 1>&2
+    echo "If you are sure you want to downgrade to $this_version, remove the file" 1>&2
+ echo "$DATADIR/debian-*.flag and try installing again." 1>&2
+ db_stop
+ exit 1
+ fi
+fi
+
+# to be sure
+stop_server
+
+# If we use NIS then errors should be tolerated. It's up to the
+# user to ensure that the mysql user is correctly set up.
+# Beware that there are two ypwhich variants; one of them needs the 2>/dev/null!
+if test -n "`which ypwhich 2>/dev/null`" && ypwhich >/dev/null 2>&1; then
+ set +e
+fi
+
+#
+# Now we have to ensure the following state:
+# /etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false
+# /etc/group: mysql:x:101:
+#
+# Sadly, any state could be present on the system, so we have to
+# modify everything carefully, i.e. not doing a chown before creating
+# the user, etc.
+#
+
+# creating the mysql group if it isn't already there
+if ! getent group mysql >/dev/null; then
+ # Adding system group: mysql.
+ addgroup --system mysql >/dev/null
+fi
+
+# creating the mysql user if it isn't already there
+if ! getent passwd mysql >/dev/null; then
+ # Adding system user: mysql.
+ adduser \
+ --system \
+ --disabled-login \
+ --ingroup mysql \
+ --home $DATADIR \
+ --gecos "MySQL Server" \
+ --shell /bin/false \
+ mysql >/dev/null
+fi
+
+# end of NIS tolerance zone
+set -e
+
+# if there's a symlink, let's store where it's pointing, because otherwise
+# it's going to be lost in some situations
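+# (the eval below expands the variable named by $dir, i.e. DATADIR or LOGDIR,
+# into its value)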
+for dir in DATADIR LOGDIR; do
+ checkdir=`eval echo "$"$dir`
+ if [ -L "$checkdir" ]; then
+ mkdir -p "$UPGRADEDIR"
+ cp -d "$checkdir" "$UPGRADEDIR/$dir.link"
+ fi
+done
+
+# creating mysql home directory
+if [ ! -d $DATADIR -a ! -L $DATADIR ]; then
+ mkdir $DATADIR
+fi
+
+# checking disc space
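+# The awk test exits 0 (success) only when the free-space column ($4) is 1000
+# blocks or less, so the error branch below fires exactly when space is short.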
+if LC_ALL=C BLOCKSIZE= df --portability $DATADIR/. | tail -n 1 | awk '{ exit ($4>1000) }'; then
+ echo "ERROR: There's not enough space in $DATADIR/" 1>&2
+ db_stop
+ exit 1
+fi
+
+# Since the home directory was created before putting the user into
+# the mysql group and moreover we cannot guarantee that the
+# permissions were correct *before* calling this script, we fix them now.
+# In case we use NIS and no mysql user is present, this script had
+# better fail now than later.
+# The "set +e" is necessary as e.g. a ".journal" of a ext3 partition is
+# not chgrp'able (#318435).
+set +e
+chown mysql:mysql $DATADIR
+find $DATADIR -follow -not -group mysql -print0 2>/dev/null \
+ | xargs -0 --no-run-if-empty chgrp mysql
+set -e
+
+# Some files below /etc/ were possibly in the percona-xtradb-server-5.0/etch package
+# before. They get overwritten by current ones to avoid unnecessary dpkg questions.
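+# For each "md5 path" pair in the here-document below, the file is replaced with
+# the copy shipped as /usr/share/percona-xtradb-common/internal-use-only/<path with
+# '/' mapped to '_'> only if it still matches the recorded md5sum.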
+while read md5 file; do
+ if [ "`md5sum $file 2>/dev/null`" = "$md5 $file" ]; then
+ cp /usr/share/percona-xtradb-common/internal-use-only/`echo $file | sed 's°/°_°g'` $file
+ fi
+done <<EOT
+6691f2fdc5c6d27ff0260eb79813e1bc /etc/init.d/mysql
+b53b9552d44661361d39157c3c7c51d3 /etc/logrotate.d/percona-xtradb-server
+57f3e58f72582ca55100dc1ba0f1a8ae /etc/mysql/debian-start
+EOT
+
+db_stop
+
+#DEBHELPER#
+
+exit 0
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm b/storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm
new file mode 100644
index 00000000000..03e9ea37420
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm
@@ -0,0 +1,8 @@
+#!/bin/bash -e
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+#DEBHELPER#
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.templates b/storage/xtradb/build/debian/percona-xtradb-server-5.1.templates
new file mode 100644
index 00000000000..efa318640db
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.templates
@@ -0,0 +1,90 @@
+# These templates have been reviewed by the debian-l10n-english
+# team
+#
+# If modifications/additions/rewording are needed, please ask
+# for an advice to debian-l10n-english@lists.debian.org
+#
+# Even minor modifications require translation updates and such
+# changes should be coordinated with translators and reviewers.
+
+Template: percona-xtradb-server-5.1/really_downgrade
+Type: boolean
+Default: false
+_Description: Really proceed with downgrade?
+ A file named /var/lib/mysql/debian-*.flag exists on this system.
+ .
+ Such file is an indication that a percona-xtradb-server package with a higher
+ version has been installed earlier.
+ .
+ There is no guarantee that the version you're currently installing
+ will be able to use the current databases.
+
+Template: percona-xtradb-server-5.1/nis_warning
+Type: note
+#flag:translate!:3,5
+_Description: Important note for NIS/YP users
+ To use MySQL, the following entries for users and groups should be added
+ to the system:
+ .
+ /etc/passwd : mysql:x:100:101:Percona SQL Server:/var/lib/mysql:/bin/false
+ /etc/group : mysql:x:101:
+ .
+ You should also check the permissions and the owner of the
+ /var/lib/mysql directory:
+ .
+ /var/lib/mysql: drwxr-xr-x mysql mysql
+
+Template: percona-xtradb-server-5.1/postrm_remove_databases
+Type: boolean
+Default: false
+_Description: Remove all Percona SQL databases?
+ The /var/lib/mysql directory which contains the Percona SQL databases is about
+ to be removed.
+ .
+ If you're removing the Percona SQL package in order to later install a more
+ recent version or if a different percona-xtradb-server package is already
+ using it, the data should be kept.
+
+Template: percona-xtradb-server-5.1/start_on_boot
+Type: boolean
+Default: true
+_Description: Start the Percona SQL server on boot?
+ The Percona SQL server can be launched automatically at boot time or manually
+ with the '/etc/init.d/mysql start' command.
+
+Template: percona-xtradb-server/root_password
+Type: password
+_Description: New password for the Percona SQL "root" user:
+ While not mandatory, it is highly recommended that you set a password
+ for the Percona SQL administrative "root" user.
+ .
+ If that field is left blank, the password will not be changed.
+
+Template: percona-xtradb-server/root_password_again
+Type: password
+_Description: Repeat password for the Percona SQL "root" user:
+
+Template: percona-xtradb-server/error_setting_password
+Type: error
+_Description: Unable to set password for the Percona SQL "root" user
+ An error occurred while setting the password for the Percona SQL
+ administrative user. This may have happened because the account
+ already has a password, or because of a communication problem with
+ the Percona SQL server.
+ .
+ You should check the account's password after the package installation.
+ .
+ Please read the /usr/share/doc/percona-xtradb-server-5.1/README.Debian file
+ for more information.
+
+Template: percona-xtradb-server/password_mismatch
+Type: error
+_Description: Password input error
+ The two passwords you entered were not the same. Please try again.
+
+Template: percona-xtradb-server/no_upgrade_when_using_ndb
+Type: error
+_Description: NDB Cluster seems to be in use
+ Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new
+ mysql-cluster package and remove all lines starting with "ndb" from
+ all config files below /etc/mysql/.
diff --git a/storage/xtradb/build/debian/po/POTFILES.in b/storage/xtradb/build/debian/po/POTFILES.in
new file mode 100644
index 00000000000..b3a73d17bd1
--- /dev/null
+++ b/storage/xtradb/build/debian/po/POTFILES.in
@@ -0,0 +1 @@
+[type: gettext/rfc822deb] percona-xtradb-server-5.1.templates
diff --git a/storage/xtradb/build/debian/po/ar.po b/storage/xtradb/build/debian/po/ar.po
new file mode 100644
index 00000000000..6a51c1f8919
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ar.po
@@ -0,0 +1,267 @@
+# translation of templates.po to Arabic
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+#
+# Ossama M. Khayat <okhayat@yahoo.com>, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: templates\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-01 13:04+0300\n"
+"Last-Translator: Ossama M. Khayat <okhayat@yahoo.com>\n"
+"Language-Team: Arabic <support@arabeyes.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+"Plural-Forms: nplurals=6; plural=n==1 ? 0 : n==0 ? 1 : n==2 ? 2: n%100>=3 && "
+"n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "هل فعلاً تريد التثبيط؟"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "هناك ملف مسمى /var/lib/mysql/debian-*.flag موجود على هذا النظام."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"هذا الملف دلالة على أن نسخة أحدث من حزمة mysql-server تم تثبيتها مسبقاً."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"ليست هناك أية ضمانة أن النسخة التي تقوم بتثبيتها ستكون قادرة على استخدام "
+"قواعد البيانات الحالية."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "ملاحظة هامة لمستخدمي NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"كي تستخدم MySQL، يجب إضافة المُدخلات التالية الخاصة بالمستخدمين والمجموعات "
+"إلى النظام:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr "عليك أيضاً أن تقوم بالتأكد من صلاحيات مالك الملف /var/lib/mysql: "
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "إزالة جميع قواعد بيانات MySQL؟"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr "الدليل /var/lib/mysql الذي يحتوي قواعد بيانات MySQL ستتم إزالته."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"إن كنت تقوم بإزالة حزمة MySQL كي تقوم لاحقاً بتثبيت نسخة أحدث أو إن كانت حزمة "
+"mysql-server مختلفة تستخدمها، فيجب إبقاء البيانات."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "تشغيل خادم MySQL عند الإقلاع؟"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"يمكن تشغيل خادم MySQL آلياً وقت الإقلاع أو يدوياً باستخدام الأمر '/etc/init.d/"
+"mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "كلمة المرور الجديدة لمستخدم \"root\" الخاص بـMySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"مع أنه ليس إجبارياً، ولكن من المستحسن أن تقوم بتعيين كلمة مرور خاصة بمستخدم "
+"MySQL الإداري \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "إن ترك الحقل فارغاً، فلن يتم تغيير كلمة المرور."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "كلمة المرور الجديدة لمستخدم \"root\" الخاص بـMySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "تعذر تعيين كلمة مرور للمستخدم \"root\" الخاص بـMySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"حدث خطأ أثناء تعيين كلمة المرور لمستخدم MySQL الإداري. قد يكون هذا حدث بسبب "
+"أن حساب المستخدم له كلمة مرور معيّنة مسبقاً، أو بسبب مشكلة في الاتصال مع خادم "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "يجب عليك التحقق من كلمة مرور الحساب عقب تثبيت الحزمة."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"الرجاء قراءة الملف /usr/share/doc/mysql-server-5.1/README.Debian للمزيد من "
+"المعلومات."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "هل تريد دعم اتصالات MySQL من الأجهزة التي تعمل على ديبيان \"sarge\" أو "
+#~ "أقدم؟"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "في إصدارات عملاء MySQL القديمة من ديبيان، لم تكن كلمات المرور تحفظ بشكل "
+#~ "آمن. ولقد حل هذه المشكلة بعدها، غير أن العملاء (مثل PHP) المتصلين من "
+#~ "أجهزة تعمل على ديبيان Sarge 3.1 لن يكونوا قادرين على الاتصال باستخدام "
+#~ "الحسابات الحديثة أو الحسابات التي تم تغيير كلمة مرورها."
diff --git a/storage/xtradb/build/debian/po/ca.po b/storage/xtradb/build/debian/po/ca.po
new file mode 100644
index 00000000000..94fadb6d5df
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ca.po
@@ -0,0 +1,342 @@
+# mysql-dfsg (debconf) translation to Catalan.
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+# Aleix Badia i Bosch <abadia@ica.es> 2004
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-4.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2004-01-31 19:20GMT\n"
+"Last-Translator: Aleix Badia i Bosch <abadia@ica.es>\n"
+"Language-Team: Debian L10n Catalan <debian-l10n-catalan@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=ISO-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Nota important pels usuaris de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Voleu que el MySQL s'iniciï a l'arrencada ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"El MySQL es pot executar a l'arrencada o només si executeu manualment '/etc/"
+"init.d/mysql start'. Seleccioneu 'sí' si voleu que s'inicialitzi "
+"automàticament."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#, fuzzy
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Per utilitzar la base de dades de MySQL heu d'afegir un usuari i grup "
+#~ "equivalent al següent i assegurar-vos que el directori /var/lib/mysql "
+#~ "tingui els permisos correctes."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#, fuzzy
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr ""
+#~ "Feu una ullada al document: http://www.mysql.com/doc/en/Upgrade.html"
+
+#, fuzzy
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "El MySQL només s'instal·la en cas de tenir un nom d'ordinador central que "
+#~ "no sigui numèric i que es pugui resoldre a través del fitxer /etc/hosts. "
+#~ "Ex. si l'ordre \"hostname\" retorna \"myhostname\", llavors hi ha d'haver "
+#~ "una línia com la següent \"10.0.0.1 myhostname\"."
+
+#, fuzzy
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Es crea un nou usuari de mysql \"debian-sys-maint\". S'utilitza per les "
+#~ "seqüències d'inicialització i aturada del cron, no el suprimiu."
+
+#, fuzzy
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Recordeu posar una contrasenya al superusuari del MySQL. Si utilitzeu un "
+#~ "fitxer /root/.my.cnf, escriviu sempre allà les línies \"user\" i "
+#~ "\"password\"; mai només la contrasenya. Per a més informació feu una "
+#~ "ullada a /usr/share/doc/mysql-server/README.Debian."
+
+#, fuzzy
+#~ msgid ""
+#~ "Should I remove all databases below /var/lib/mysql as you are purging the "
+#~ "mysql-server package?"
+#~ msgstr ""
+#~ "Voleu suprimir totes les bases de dades en purgar el paquet mysql-server ?"
+
+#~ msgid ""
+#~ "Networking is disabled by default for security reasons. You can enable it "
+#~ "by commenting out the skip-networking option in /etc/mysql/my.cnf."
+#~ msgstr ""
+#~ "La xarxa està inhabilitada per defecte per a raons de seguretat. La podeu "
+#~ "habilitar descomentant l'opció de skip-networking del fitxer /etc/mysql/"
+#~ "my.cnf."
+
+#~ msgid "security and update notice"
+#~ msgstr "Avís de seguretat i actualització"
+
+#~ msgid "Please run mysql_fix_privilege_tables !"
+#~ msgstr "Executeu mysql_fix_privilege_tables"
+
+#~ msgid ""
+#~ "I will ensure secure permissions of /var/lib/mysql by replacing GIDs "
+#~ "other than root and mysql with mysql."
+#~ msgstr ""
+#~ "S'asseguren els permisos de seguretat de /var/lib/mysql canviant a mysql "
+#~ "tots els GIDs diferents a root i mysql."
+
+#~ msgid ""
+#~ "Instructions how to enable SSL support are in /usr/share/doc/mysql-server/"
+#~ msgstr ""
+#~ "Per habilitar el suport de SSL podeu seguir les instruccions de /usr/"
+#~ "share/doc/mysql-server/"
+
+#~ msgid "mysql_fix_privileges_tables will be executed"
+#~ msgstr "s'executa mysql_fix_privileges_tables"
+
+#~ msgid ""
+#~ "The latest MySQL versions have an enhanced, more fine grained, privilege "
+#~ "system. To make use of it, some new fields must be added to the tables "
+#~ "in the \"mysql\" database. This is done by the "
+#~ "mysql_fix_privilege_tables script during this upgrade regardless of if "
+#~ "the server is currently running or not!"
+#~ msgstr ""
+#~ "Les últimes versions de MySQL tenen un sistema de privilegis més "
+#~ "elaborat. Per utilitzar-lo cal afegir nous camps a les taules de la base "
+#~ "de dades \"mysql\". Aquesta tasca la realitza la seqüència "
+#~ "mysql_fix_privilege_tables durant l'actualització independentment de si "
+#~ "el servidor s'està executant o no!"
+
+#~ msgid ""
+#~ "This script is not supposed to give any user more rights that he had "
+#~ "before, if you encounter such a case, please contact me."
+#~ msgstr ""
+#~ "Aquesta seqüència no assigna privilegis d'usuari diferents als que ja "
+#~ "tenia, en cas que us trobéssiu en aquesta situació, poseu-vos en contacte "
+#~ "amb mi."
+
+#~ msgid ""
+#~ "Should I remove everything below /var/lib/mysql when you purge the mysql-"
+#~ "server package with the \"dpkg --purge mysql-server\" command (i.e. "
+#~ "remove everything including the configuration) somewhen? (default is not)"
+#~ msgstr ""
+#~ "Voleu suprimir tots els continguts de /var/lib/mysql quan es purgui el "
+#~ "paquet mysql-server amb l'ordre \"dpkg --purge mysql-server\". (ex. "
+#~ "suprimir-ho tot inclòs la configuració) ? (per defecte no)"
+
+#~ msgid "Make MySQL reachable via network?"
+#~ msgstr "Voleu fer accessible el MySQL via xarxa ?"
+
+#~ msgid ""
+#~ "Should MySQL listen on a network reachable TCP port? This is not "
+#~ "necessary for use on a single computer and could be a security problem."
+#~ msgstr ""
+#~ "Voleu que el MySQL escolti a un port TCP accessible des de la xarxa ? "
+#~ "Aquesta opció no és imprescindible en ordinadors aïllats i podria "
+#~ "provocar un problema de seguretat."
+
+#~ msgid "Enable chroot mode?"
+#~ msgstr "Permetre el mode chroot ?"
+
+#~ msgid ""
+#~ "MySQL is able to jail itself into the /var/lib/mysql_jail directory so "
+#~ "that users cannot modify any files outside this directory. This improves "
+#~ "resistence against crackers, too, as they are not able to modify system "
+#~ "files."
+#~ msgstr ""
+#~ "El MySQL es pot executar en una entorn tancat al directori /var/lib/"
+#~ "mysql_jail perquè els usuaris no puguin modificar cap fitxer fora del "
+#~ "directori. Aquesta opció també augmenta la seguretat envers els crackers, "
+#~ "ja que no poden modificar els fitxers del sistema."
diff --git a/storage/xtradb/build/debian/po/cs.po b/storage/xtradb/build/debian/po/cs.po
new file mode 100644
index 00000000000..01d95723aa9
--- /dev/null
+++ b/storage/xtradb/build/debian/po/cs.po
@@ -0,0 +1,361 @@
+#
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+#
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+# Developers do not need to manually edit POT or PO files.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-01 13:01+0200\n"
+"Last-Translator: Miroslav Kure <kurem@debian.cz>\n"
+"Language-Team: Czech <debian-l10n-czech@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Opravdu pokračovat v degradaci?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "V systému existuje soubor /var/lib/mysql/debian-*.flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr "To znamená, že již byl nainstalován balík mysql-server s vyšší verzí."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Neexistuje žádná záruka, že momentálně instalovaná verze bude umět pracovat "
+"se stávajícími databázemi."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Důležitá poznámka pro uživatele NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Abyste mohli MySQL používat, musíte v systému založit následující uživatele "
+"a skupiny:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Také byste měli zkontrolovat vlastníka a oprávnění adresáře /var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Odstranit všechny MySQL databáze?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Adresář /var/lib/mysql, ve kterém se nachází MySQL databáze, bude odstraněn."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Jestliže odstraňujete balík MySQL za účelem instalace novější verze MySQL, "
+"nebo pokud tato data souběžně využívá jiný balík mysql-server, měli byste "
+"data ponechat."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Spustit MySQL server při startu systému?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL se může spouštět automaticky při startu systému, nebo ručně příkazem '/"
+"etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nové heslo MySQL uživatele \"root\":"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Přestože to není nezbytné, je silně doporučeno nastavit heslo u "
+"správcovského MySQL účtu \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Ponecháte-li pole prázdné, heslo se nezmění."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nové heslo MySQL uživatele \"root\":"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Nelze nastavit heslo MySQL uživatele \"root\""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Během nastavování hesla pro správcovského uživatele MySQL se vyskytla chyba. "
+"To se mohlo stát třeba proto, protože uživatel již měl heslo nastaveno, nebo "
+"protože nastal problém v komunikaci s MySQL serverem."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "Po instalaci balíku byste měli heslo ověřit."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Více informací naleznete v /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Aktualizace nelze provést pokud jsou přítomny tabulky ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Poslední verze MySQL již nemohou používat starý formát tabulek ISAM a "
+#~ "před aktualizací je nutné převést tyto tabulky např. do formátu MyISAM "
+#~ "pomocí \"mysql_convert_table_format\" nebo \"ALTER TABLE x ENGINE=MyISAM"
+#~ "\". Instalace mysql-server-5.1 se nyní přeruší. V případě, že se mezitím "
+#~ "odinstaloval původní mysql-server-4.1, jednoduše jej znovu nainstalujte a "
+#~ "tabulky převeďte."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Podporovat MySQL připojení z počítačů používajících Debian Sarge nebo "
+#~ "starší?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Způsob, jakým se dříve ukládala hesla, nebyl příliš bezpečný. To se nyní "
+#~ "zlepšilo, ale nevýhodou je, že se klienti z Debianu 3.1 Sarge (např. PHP) "
+#~ "nebudou moci připojit na nové účty, nebo na účty, u nichž se heslo "
+#~ "změnilo."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Abyste mohli mysql používat, musíte do následujících souborů přidat "
+#~ "ekvivalentního uživatele a skupinu a zajistit, že /var/lib/mysql má "
+#~ "správná práva (uid/gid se mohou lišit)."
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Odstranit databáze používané všemi verzemi MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Nezadáte-li heslo, žádné změny se s účtem neprovedou."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Po skončení instalace byste měli ověřit, že je účet chráněn heslem (více "
+#~ "informací naleznete v souboru README.Debian)."
+
+#~ msgid "Update Hints"
+#~ msgstr "Poznámky k aktualizaci"
+
+#~ msgid ""
+#~ "You have to run \"mysql_upgrade\" after the upgrade, else tables can be "
+#~ "corrupted! This script also enhances the privilege tables but is not "
+#~ "supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Po aktualizaci ještě musíte spustit \"mysql_upgrade\", protože jinak by "
+#~ "se tabulky mohly narušit! Tento skript také rozšiřuje tabulky privilegií, "
+#~ "ovšem neměl by uživatelům přidat více práv, než měli dosud."
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Také si přečtěte http://www.mysql.com/doc/en/Upgrade.html"
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL se nainstaluje pouze v případě, že používáte nenumerické jméno "
+#~ "počítače, které se dá přeložit přes soubor /etc/hosts. Např. když příkaz "
+#~ "\"hostname\" vrátí \"diamond\", tak v /etc/hosts musí existovat obdobný "
+#~ "řádek jako \"10.0.0.1 diamond\"."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Bude vytvořen nový mysql uživatel \"debian-sys-maint\". Tento mysql účet "
+#~ "se používá ve startovacích, ukončovacích a cronových skriptech. Nemažte "
+#~ "jej."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Nezapomeňte nastavit heslo pro účet administrátora MySQL! Používáte-li /"
+#~ "root/.my.cnf, vždy zde zadejte jak řádek \"user\", tak řádek \"password"
+#~ "\". Nikdy zde nezadávejte jenom heslo!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "Mám odstranit kompletní adresářový strom /var/lib/mysql, který se používá "
+#~ "pro všechny verze MySQL, tedy ne nutně pouze pro verzi, kterou se "
+#~ "chystáte vyčistit?"
diff --git a/storage/xtradb/build/debian/po/da.po b/storage/xtradb/build/debian/po/da.po
new file mode 100644
index 00000000000..a44088472a6
--- /dev/null
+++ b/storage/xtradb/build/debian/po/da.po
@@ -0,0 +1,397 @@
+#
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans#
+# Developers do not need to manually edit POT or PO files.
+#
+# Claus Hindsgaul <claus_h@image.dk>, 2005, 2006.
+# Claus Hindsgaul <claus.hindsgaul@gmail.com>, 2006, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-4.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-30 22:41+0200\n"
+"Last-Translator: Claus Hindsgaul <claus.hindsgaul@gmail.com>\n"
+"Language-Team: Danish\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=ISO-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Ønsker du virkelig at fortsætte nedgraderingen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Der er en fil med navnet /var/lib/mysql/debian-*.flag på dette system."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Sådan en fil tyder på at der tidligere har været installeret en højere "
+"version af mysql-server-pakken."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Det kan ikke garanteres at den version, du er ved at installere, kan benytte "
+"data fra de eksisterende databaser."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Vigtig oplysning til NIS/YP-brugere"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Nedenstående linjer for brugere og grupper skal tilføjes dette system for at "
+"benytte MySQL:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Du bør også tjekke filrettighederne og ejerskabet af mappen /var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Fjern alle MySQL-databaser?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Mappen /var/lib/mysql, der indeholder MySQL-databaserne, er ved at blive "
+"fjernet."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Hvis du fjerner MySQL-pakken for senere at installere en nyere version, "
+"eller hvis en anden mysql-server-pakke allerede benytter den, bør dataene "
+"bevares."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Start MySQL-serveren under systemopstart?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL-serveren kan enten startes op automatisk under systemopstarten, eller "
+"manuelt med kommandoen '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Ny adgangskode for MySQL's \"root\"-bruger:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Selvom det ikke kræves, anbefales det kraftigt, at du sætter en adgangskode "
+"for MySQL's administrationsbruger \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Hvis du lader dette felt stå tomt, vil adgangskoden ikke blive ændret."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Ny adgangskode for MySQL's \"root\"-bruger:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Kunne ikke sætte adgangskoden for MySQL's \"root\"-bruger"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Der opstod en fejl, da adgangskoden for MySQL's administrationsbruger blev "
+"forsøgt ændret. Dette kan være sket, fordi brugeren allerede har en "
+"adgangskode, eller fordi der var problemer med at kommunikere med MySQL-"
+"serveren."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr "Du bør tjekke kontoens adgangskode efter pakkeinstallationen."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Se filen /usr/share/doc/mysql-server-5.1/README.Debian for yderligere "
+"oplysninger."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Kan ikke opgradere hvis der er ISAM-tabeller!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Nyere versioner af MySQL kan ikke længere benytte det gamle ISAM-"
+#~ "tabelformat, og det er derfor nødvendigt at konvertere dine tabeller til "
+#~ "f.eks. MyISAM forud for opgraderingen med \"mysql_convert_table_format\" "
+#~ "eller \"ALTER TABLE x ENGINE=MyISAM\". Installationen af mysql-server-5.1 "
+#~ "afbrydes nu. Skulle din gamle mysql-server-4.1 alligevel bliver "
+#~ "afinstalleret, så geninstallér den blot og konverter tabellerne."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Understøt MySQL-forbindelser fra maskiner, der kører Debian \"Sarge\" "
+#~ "eller ældre?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Gamle udgaver af MySQL-klienter på Debian gemte ikke adgangskoderne "
+#~ "sikkert. Dette er blevet forbedret siden da, men klienter (f.eks. PHP) "
+#~ "fra maskiner, der kører Debian 3.1 Sarge vil ikke kunne forbinde til "
+#~ "nyere konti eller konti, hvis adgangskode er blevet ændret."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "For at kunne bruge mysql skal du installere en bruger og en gruppe, der "
+#~ "svarer til nedenstående, og sikre dig at /var/lib/mysql har de rigtige "
+#~ "adgangsrettigheder (uid/gid kan afvige)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Fjern de databaser, der benyttes af samtlige MySQL-versioner?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Hvis du ikke angiver en adgangskode, vil kontoen ikke blive ændret."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Når installationen afsluttes, bør du tjekke at kontoen er ordentligt "
+#~ "beskyttet med en adgangskode (se README.Debian for yderligere "
+#~ "oplysninger)."
+
+#~ msgid "Update Hints"
+#~ msgstr "Opdateringstips"
+
+#~ msgid ""
+#~ "You have to run \"mysql_upgrade\" after the upgrade, else tables can be "
+#~ "corrupted! This script also enhances the privilege tables but is not "
+#~ "supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Du skal køre \"mysql_upgrade\" efter opgraderingen, da tabellerne eller "
+#~ "kan blive ødelagt! Dette script forbedrer også rettighedstabellerne, men "
+#~ "burde ikke give nogen bruger flere rettigheder, end han havde tidligere,"
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Læs også http://www.mysql.com/doc/en/Upgrade.html"
+
+#~ msgid "Install Hints"
+#~ msgstr "Installationstips"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Ved opgraderinger fra MySQL 3.23, der fulgte med Debian Woody, kan de "
+#~ "symbolske /var/lib/mysql or /var/log/mysql blive fjernet ved et uheld, og "
+#~ "må genskabes manuelt."
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL vil kun blive installeret, hvis du har et ikke-numerisk værtsnavn, "
+#~ "som kan slås op i filen /ets/hosts. Hvis f.eks. kommandoen \"hostname\" "
+#~ "svarer med \"mitvaertsnavn\", skal du have en linje a'la \"10.0.0.1 "
+#~ "mitvaertsnavn\" i /etc/hosts."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Det vil blive oprettet en ny mysql-bruger, \"debian-sys-maint\". Denne "
+#~ "mysql-konto bruges i start/stop-cron-scripterne. Slet den ikke."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Husk at sætte en ADGANGSKODE for MySQLs root-bruger! Hvis du bruger en /"
+#~ "etc/.my.cnf, så skriv altid \"user\"- og \"password\"-linjer ind her, "
+#~ "ikke kun adgangskoden!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "Skal jeg fjerne hele mappetræet /var/lib/mysql, som benyttes af alle "
+#~ "MySQL-versioner, ikke kun den version, du er ved at slette?"
+
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "En sjælden gang imellem, f.eks. ved nye hovedversioner, sker det at "
+#~ "rettighedssystemet forbedres. For at gøre brug af dette, skal "
+#~ "mysql_fix_privilege_tables køres manuelt. Scriptet vil ikke give nogen "
+#~ "bruger flere rettigheder, end vedkommende havde tidligere,"
diff --git a/storage/xtradb/build/debian/po/de.po b/storage/xtradb/build/debian/po/de.po
new file mode 100644
index 00000000000..c5dcdaff234
--- /dev/null
+++ b/storage/xtradb/build/debian/po/de.po
@@ -0,0 +1,277 @@
+# translation of mysql-dfsg-5.1_5.0.41-2_de.po to german
+#
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans#
+# Developers do not need to manually edit POT or PO files.
+#
+# Alwin Meschede <ameschede@gmx.de>, 2006, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.41-2_de\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-29 16:05+0200\n"
+"Last-Translator: Alwin Meschede <ameschede@gmx.de>\n"
+"Language-Team: german <debian-l10n-german@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Möchten Sie wirklich eine ältere Version einspielen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Auf diesem System existiert eine Datei mit dem Namen /var/lib/mysql/debian-*."
+"flag"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Diese Datei ist ein Hinweis darauf, dass früher ein MySQL-Server-Paket mit "
+"einer höheren Version installiert war."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Es kann nicht garantiert werden, dass die gegenwärtig zu installierende "
+"Version dessen Daten benutzen kann."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Wichtige Anmerkung für NIS/YP-Benutzer!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Um MySQL benutzen zu können, sollten die folgenden Benutzer und Gruppen dem "
+"System hinzugefügt werden:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Sie sollten außerdem Besitzer und Zugriffsrechte des Verzeichnisses /var/lib/"
+"mysql überprüfen:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Alle MySQL-Datenbanken entfernen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Das Verzeichnis /var/lib/mysql mit den MySQL-Datenbanken soll entfernt "
+"werden."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Falls geplant ist, nur eine höhere Version von MySQL zu installieren oder "
+"ein anderes mysql-server-Paket dieses bereits benutzt, sollten die Daten "
+"behalten werden."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Soll MySQL automatisch beim Booten starten?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"Der MySQL-Dienst kann entweder automatisch beim Systemstart oder manuell "
+"durch Eingabe des Befehls »/etc/init.d/mysql start« gestartet werden."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Neues Passwort für den MySQL »root«-Benutzer:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Obwohl es nicht zwingend erforderlich ist, wird nachdrücklich empfohlen für "
+"den administrativen MySQL »root«-Benutzer ein Passwort zu setzen."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Wenn dieses Feld freigelassen wird, wird das Passwort nicht geändert."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Neues Passwort für den MySQL »root«-Benutzer:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Konnte für den MySQL-»root«-Benutzer kein Passwort setzen"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Beim setzen des Passworts für den administrativen MySQL-Benutzer ist ein "
+"Fehler aufgetreten. Dies könnte daran liegen, dass der Benutzer bereits ein "
+"Passwort hat oder dass es ein Problem mit der Kommunikation mit dem MySQL-"
+"Server gibt."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Sie sollten das Passwort des administrativen Benutzers nach der "
+"Paketinstallation prüfen."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Für weitere Informationen lesen Sie /usr/share/doc/mysql-server-5.1/README."
+"Debian"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Sollen MySQL-Verbindungen von Rechnern mit Debian »Sarge« oder älter "
+#~ "unterstützt werden?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Alte Versionen der MySQL-Clients für Debian speicherten Passwörter nicht "
+#~ "sehr sicher. Dies wurde verbessert, allerdings werden Clients (z. B. PHP) "
+#~ "von Hosts mit Debian 3.1 Sarge sich nicht mehr mit MySQL-Konten verbinden "
+#~ "können, die neu angelegt werden oder deren Passwort geändert wird. Siehe "
+#~ "auch /usr/share/doc/mysql-server-5.1/README.Debian."
diff --git a/storage/xtradb/build/debian/po/es.po b/storage/xtradb/build/debian/po/es.po
new file mode 100644
index 00000000000..a37942fd59d
--- /dev/null
+++ b/storage/xtradb/build/debian/po/es.po
@@ -0,0 +1,405 @@
+# mysql-dfsg-5 translation to spanish
+# Copyright (C) 2005-2007 Software in the Public Interest, SPI Inc.
+# This file is distributed under the same license as the XXXX package.
+#
+# Changes:
+# - Initial translation
+# Jesus Aneiros, 2006
+# - Updated
+# Javier Fernandez-Sanguino, 2006-2007
+# - Revision
+# Nacho Barrientos Arias
+# Fernando Cerezal
+# David Martínez Moreno
+# Ricardo Mones
+# Carlos Galisteo
+# Javier Fernandez-Sanguino
+#
+#
+# Traductores, si no conoce el formato PO, merece la pena leer la
+# documentación de gettext, especialmente las secciones dedicadas a este
+# formato, por ejemplo ejecutando:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+#
+# Equipo de traducción al español, por favor lean antes de traducir
+# los siguientes documentos:
+#
+# - El proyecto de traducción de Debian al español
+# http://www.debian.org/intl/spanish/
+# especialmente las notas y normas de traducción en
+# http://www.debian.org/intl/spanish/notas
+#
+# - La guía de traducción de po's de debconf:
+# /usr/share/doc/po-debconf/README-trans
+# o http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+# Si tiene dudas o consultas sobre esta traducción consulte con el último
+# traductor (campo Last-Translator) y ponga en copia a la lista de
+# traducción de Debian al español (<debian-l10n-spanish@lists.debian.org>)
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.24-3\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-28 22:21+0200\n"
+"Last-Translator: Javier Fernández-Sanguino <jfs@debian.org>\n"
+"Language-Team: Debian l10 Spanish <debian-l10n-spanish@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "¿Desea realmente continuar con la desactualización?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Existe un archivo con el nombre /var/lib/mysql/debian-*.flag en este sistema."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Este fichero indica que se instaló previamente una versión superior del "
+"paquete mysql-server."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"No se puede garantizar que la versión que está instalando pueda usar la base "
+"de datos actual."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante para los usuarios de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Deben añadirse las siguientes entradas para usuarios y grupos en el sistema "
+"para poder utilizar MySQL:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"También debería comprobar los permisos y el propietario del directorio /var/"
+"lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "¿Desea eliminar todas las bases de datos MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"El directorio /var/lib/mysql contiene bases de datos MySQL que van a "
+"eliminarse."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Debería mantener los datos si tiene planificado instalar una versión de "
+"MySQL más reciente o si hay un paquete «mysql-server» distinto que los está "
+"utilizando."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "¿Debería ejecutarse el servidor MySQL al iniciarse el sistema?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"El servidor MySQL puede iniciarse en el momento de arranque del sistema o "
+"manualmente si escribe la orden «/etc/init.d/mysql start»."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nueva contraseña para el usuario «root» de MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Se recomienda que configure una contraseña para el usuario "
+"«root» (administrador) de MySQL, aunque no es obligatorio."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "No se modificará la contraseña si deja el espacio en blanco."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nueva contraseña para el usuario «root» de MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "No se pudo fijar la contraseña para el usuario «root» de MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Se produjo un error mientras intentaba fijar la contraseña para el usuario "
+"administrador de MySQL. Esto puede haber sucedido porque la cuenta ya tenía "
+"una contraseña o porque se produjo un error de comunicación con el servidor "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Debería comprobar la contraseña de la cuenta después de la instalación del "
+"paquete."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Consulte /usr/share/doc/mysql-server-5.1/README.Debian para más información."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "¡No se puede actualizar si ya hay tablas ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Las versiones recientes de MySQL ya no soportan el antiguo formato de "
+#~ "tabla ISAM. Antes de realizar la actualización es necesario convertir sus "
+#~ "tablas a por ejemplo, MyISAM, usando «mysql_convert_table_format» o «ALTER "
+#~ "TABLE x ENGINE=MyISAM». Se va a interrumpir ahora la instalación de mysql-"
+#~ "server-5.1. Si aún así su mysql-server-4.1 se elimina aún así, puede "
+#~ "reinstalarlo para convertir ese tipo de tablas."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "¿Soportar las conexiones MySQL establecidadas desde sistemas que ejecutan "
+#~ "Debian Sarge o versiones anteriores?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "No era muy segura la forma en la que se almacenaban las contraseñas en "
+#~ "versiones anteriores del cliente de MySQL en Debian. Este problema se ha "
+#~ "mejorado posteriormente con el inconveniente, sin embargo, de que "
+#~ "clientes (por ejemplo, PHP) en sistemas que ejecutan Debian 3.1 «Sarge» no "
+#~ "podrán conectarse a cuentas que son nuevas o a las que se le haya "
+#~ "cambiado la contraseña."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Para utilizar mysql debe instalar un usuario y grupo equivalente al "
+#~ "siguiente y asegurarse de que /var/lib/mysql tiene los permisos correctos "
+#~ "(los valores del «uid» y del «gid» pueden ser diferentes)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr ""
+#~ "¿Eliminar las bases de datos utilizadas por todas las versiones de MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "No se hará ningún cambio en la cuenta si no introduce una contraseña."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Debería confirmar que la contraseña está correctamente protegida con una "
+#~ "contraseña cuando termine la instalación (consulte el fichero README."
+#~ "Debian si desea más información)."
+
+#~ msgid "Install Hints"
+#~ msgstr "Sugerencias para la instalación"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Al actualizar a la versión de MySQL 3.23, la vrsión proporcionada en "
+#~ "Debian Woody, se eliminan de manera accidental, los enlaces simbólicos a «/"
+#~ "var/lib/mysql» o «/var/log/mysql» y tienen que restaurarse manualmente."
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "Sólo se instalará MySQL si tiene un nombre de equipo que no sea una "
+#~ "dirección IP y pueda resolverse a través del archivo /etc/hosts. Por "
+#~ "ejemplo, si la orden «hostname» devuelve «MiNombreEquipo» entonces deberá "
+#~ "existir una línea «10.0.0.1 MiNombreEquipo» en dicho archivo."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Se creará un nuevo usuario «debian-sys-maint». Esta cuenta de mysql se "
+#~ "utilizará en los scripts de inicio y parada y en los scripts «cron». No la "
+#~ "elimine."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "¡Por favor, recuerde crear una CONTRASEÑA para el usuario «root» de MySQL! "
+#~ "¡Si utiliza /root/.my.cnf debe escribir las líneas «user» y «password» en "
+#~ "dicho archivo, no incluya sólo la contraseña!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "¿Debería eliminar el árbol de directorio /var/lib/mysql completo? Tenga "
+#~ "en cuenta que lo utilizan todas las versiones de MySQL y no sólo la que "
+#~ "está a punto de purgar."
diff --git a/storage/xtradb/build/debian/po/eu.po b/storage/xtradb/build/debian/po/eu.po
new file mode 100644
index 00000000000..b091e6c01cf
--- /dev/null
+++ b/storage/xtradb/build/debian/po/eu.po
@@ -0,0 +1,295 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# Piarres Beobide <pi@beobide.net>, 2006.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.26-3-debconf_eu\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-19 09:33+0100\n"
+"Last-Translator: Piarres Beobide <pi@beobide.net>\n"
+"Language-Team: Euskara <Librezale@librezale.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"X-Generator: Pootle 0.10.1\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Benetan bertsio zaharragora itzuli nahi duzu?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Oharra: /var/lib/mysql/debian-*.flag dago.. Honek aurretik bertsio "
+"berriagoko mysql-zerbitzari bat instalatu dela adierazten du. Ezin da "
+"ziurtatu bertsio honek datu horiek erabili ahal izango dituenik."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "NIS/YP erabiltzaileentzat ohar garrantzitsua!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Script-a /var/lib/mysql data direktorioa ezabatzera doa. MySQL bertsio "
+"berriago bat instalatu behar bada edo beste mysql-server pakete bat berau "
+"erabiltzen ari bada, datuak mantendu egingo dira."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Sistema abiaraztean MySQL abiarazi behar al da?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL abiaraztean automatikoki abiarazi daiteke edo eskuz /etc/init.d/mysql "
+"start' eginaz."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "MySQL \"root\" erabiltzailearen pasahitz berria:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Oso gomendagarria da MySQL administratzaile \"root\" erabiltzaileari "
+"pasahitz bat ezartzea."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "MySQL \"root\" erabiltzailearen pasahitz berria:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Ezinda MySQL \"root\" erabiltzailearen pasahitza ezarri"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Dirudienez errore bat gertatu da MySQL administratzaile kontuaren pasahitza "
+"ezartzean. Hau erabiltzaileak dagoeneko pasahitz bat duelako edo MySQL "
+"zerbitzariarekiko konexioan erroreak daudelako gertatu daiteke."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Debian \"sarge\" edo zaharragoak erabiltzen duten ostalarietatik MySQL "
+#~ "konexioak onartu?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Pasahitzak biltegiratzeko modua ez da oso ziurra. Hau hobetua izan da "
+#~ "baina Debian 3.1 Sarge erabiltzaileak ezingo dira kontu berri edo "
+#~ "pasahitza aldatu duten kontuetara konektatu. Begiratu /usr/share/doc/"
+#~ "mysql-server-5.1/README.Debian argibide gehiagorako."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Mysql erabili ahal izateko beharrezko erabiltzaile eta taldea sortu eta /"
+#~ "var/lib/mysql-ek beharrezko baimenak dituela ziurtatu behar duzu (uid/gid-"
+#~ "a ezberdina izan daiteke)"
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "MySQL bertsio guztiek erabilitako databaseak ezabatu?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Ez baduzu pasahitzik ezartzen ez da aldaketarik egingo kontuan."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Instalazio amaitzean, kontua pasahitzez babesturik dagoela ziurtatu "
+#~ "beharko zenuke (README.Debian irakurri xehetasun gehiagotarako)"
diff --git a/storage/xtradb/build/debian/po/fr.po b/storage/xtradb/build/debian/po/fr.po
new file mode 100644
index 00000000000..b4dcce8658b
--- /dev/null
+++ b/storage/xtradb/build/debian/po/fr.po
@@ -0,0 +1,274 @@
+# translation of fr.po to French
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+#
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+# Developers do not need to manually edit POT or PO files.
+#
+# Christian Perrier <bubulle@debian.org>, 2004, 2006, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: fr\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-19 22:43+0200\n"
+"Last-Translator: Christian Perrier <bubulle@debian.org>\n"
+"Language-Team: French <debian-l10n-french@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"debian.org>\n"
+"X-Generator: KBabel 1.11.4\n"
+"Plural-Forms: Plural-Forms: nplurals=2; plural=n>1;\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Faut-il vraiment revenir à la version précédente ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Un fichier /var/lib/mysql/debian-*.flag est présent sur ce système."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Cela indique qu'une version plus récente du paquet mysql-server a été "
+"précédemment installée."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr "Il n'est pas garanti que cette version puisse en utiliser les données."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Note importante pour les utilisateurs NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Pour pouvoir utiliser MySQL, les utilisateurs et les groupes suivants "
+"doivent être ajoutés au système :"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Vous devez également vérifier le propriétaire et les permissions du "
+"répertoire /var/lib/mysql :"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Faut-il supprimer toutes les bases de données MySQL ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Le répertoire /var/lib/mysql qui contient les bases de données de MySQL va "
+"être supprimé."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Si vous prévoyez d'installer une version plus récente de MySQL ou si un "
+"autre paquet mysql-server les utilise déjà, vous devriez les conserver."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Faut-il lancer MySQL au démarrage ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL peut être lancé soit au démarrage, soit en entrant la commande « /etc/"
+"init.d/mysql start »."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nouveau mot de passe du superutilisateur de MySQL :"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Il est très fortement recommandé d'établir un mot de passe pour le compte "
+"d'administration de MySQL (« root »)."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Si ce champ est laissé vide, le mot de passe ne sera pas changé."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nouveau mot de passe du superutilisateur de MySQL :"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+"Impossible de changer le mot de passe de l'utilisateur « root » de MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Une erreur s'est produite lors du changement de mot de passe du compte "
+"d'administration. Un mot de passe existait peut-être déjà ou il n'a pas été "
+"possible de communiquer avec le serveur MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Vous devriez vérifier le mot de passe de ce compte après l'installation du "
+"paquet."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Veuillez consulter le fichier /usr/share/doc/mysql-server-5.1/README.Debian "
+"pour plus d'informations."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Gérer les connexions d'hôtes qui utilisent les versions Debian « sarge » "
+#~ "ou antérieures  ?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "La méthode de stockage des mots de passe n'était pas très sûre dans les "
+#~ "version précédentes de ce paquet. Cette méthode a été améliorée mais les "
+#~ "modifications empêchent la connexion avec de nouveaux comptes ou des "
+#~ "comptes dont le mot de passe a été modifié, pour les clients (p. ex. PHP) "
+#~ "depuis des hôtes qui utilisent Debian 3.1 « sarge »."
diff --git a/storage/xtradb/build/debian/po/gl.po b/storage/xtradb/build/debian/po/gl.po
new file mode 100644
index 00000000000..ef845649f18
--- /dev/null
+++ b/storage/xtradb/build/debian/po/gl.po
@@ -0,0 +1,264 @@
+# Galician translation of mysql-dfsg-5.1's debconf templates
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Jacobo Tarrio <jtarrio@debian.org>, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-20 09:44+0200\n"
+"Last-Translator: Jacobo Tarrio <jtarrio@debian.org>\n"
+"Language-Team: Galician <proxecto@trasno.net>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "¿Quere pasar a unha versión anterior?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Neste sistema hai un ficheiro chamado /var/lib/mysql/debian-*.flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Este ficheiro indica que antes se instalou un paquete mysql-server cunha "
+"versión superior."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Non se pode garantir que a versión que está a instalar poida empregar as "
+"bases de datos actuais."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante para os usuarios de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Para empregar MySQL deberían engadirse ao sistema as seguintes entradas de "
+"usuarios e grupos:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Tamén debería comprobar os permisos e o propietario do directorio /var/lib/"
+"mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "¿Eliminar tódalas bases de datos de MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Hase eliminar o directorio /var/lib/mysql, que contén as bases de datos de "
+"MySQL."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Se está a eliminar o paquete MySQL para instalar despois unha versión máis "
+"recente ou se xa hai un paquete mysql-server diferente a empregalo, debería "
+"conservar os datos."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "¿Iniciar o servidor MySQL co ordenador?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"Pódese iniciar automaticamente o servidor MySQL ao iniciar o ordenador, ou "
+"manualmente coa orde \"/etc/init.d/mysql start\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Novo contrasinal para o usuario \"root\" de MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Aínda que non é obrigatorio, recoméndase encarecidamente que estableza un "
+"contrasinal para o usuario administrativo \"root\" de MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Se deixa o campo en branco, non se ha cambiar o contrasinal."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Novo contrasinal para o usuario \"root\" de MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Non se puido establecer o contrasinal do usuario \"root\" de MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Houbo un erro ao establecer o contrasinal do usuario administrativo de "
+"MySQL. Puido ocorrer porque o usuario xa teña un contrasinal ou debido a un "
+"problema de comunicacións co servidor MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "Debería comprobar o contrasinal da conta trala instalación do paquete."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Consulte o ficheiro /usr/share/doc/mysql-server-5.1/README.Debian para máis "
+"información."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "¿Soportar as conexións a MySQL de máquinas que empreguen Debian \"sarge\" "
+#~ "ou anterior?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Nas versións antigas dos clientes MySQL de Debian, os contrasinais non se "
+#~ "armacenaban de xeito seguro. Isto mellorouse desde aquela; nembargantes, "
+#~ "os clientes (tales coma PHP) das máquinas que executen Debian 3.1 Sarge "
+#~ "non se han poder conectar a contas recentes ou a contas nas que se "
+#~ "cambiara o contrasinal."
diff --git a/storage/xtradb/build/debian/po/it.po b/storage/xtradb/build/debian/po/it.po
new file mode 100644
index 00000000000..405709d60c3
--- /dev/null
+++ b/storage/xtradb/build/debian/po/it.po
@@ -0,0 +1,266 @@
+# Italian (it) translation of debconf templates for mysql-dfsg-5.1
+# Copyright (C) 2006 Software in the Public Interest
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Luca Monducci <luca.mo@tiscali.it>, 2006, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.38 italian debconf templates\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-22 15:43+0200\n"
+"Last-Translator: Luca Monducci <luca.mo@tiscali.it>\n"
+"Language-Team: Italian <debian-l10n-italian@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Procedere realmente con l'abbassamento di versione?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Su questo sistema esiste un file con nome /var/lib/mysql/debian-*.flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Quel file indica che in precedenza è stata installata una versione superiore "
+"del pacchetto mysql-server."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Non è garantito che la versione che si sta installando sia in grado di usare "
+"i database presenti."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante per gli utenti NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Per usare MySQL i seguenti utenti e gruppi devono essere aggiunti al sistema:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Inoltre si devono verificare i permessi e il proprietario della directory /"
+"var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Eliminare tutti i database MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"La directory /var/lib/mysql contenente i database di MySQL sta per essere "
+"eliminata."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Se si rimuove il pacchetto MySQL per poi installare una versione più recente "
+"oppure se sono già in uso da un altro pacchetto mysql-server, i dati non "
+"devono essere eliminati."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Lanciare il server MySQL all'avvio?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"Il server MySQL può essere lanciato automaticamente all'avvio del sistema "
+"oppure manualmente con il comando «/etc/init.d/mysql start»."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nuova password per l'utente «root» di MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Sebbene non sia obbligatoria, si raccomanda d'impostare una password per "
+"l'utente d'amministrazione «root» di MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Se questo campo è lasciato vuoto, la password non viene cambiata."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nuova password per l'utente «root» di MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Impossibile impostare la password per l'utente «root» di MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Si è verificato un errore durante l'impostazione della password per l'utente "
+"d'amministrazione di MySQL. Questo può essere accaduto perché l'utente ha "
+"già una password oppure a causa di un problema di connessione con il server "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Al termine dell'installazione si deve verificare la password dell'account."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Per maggiori informazioni si consulti il file /usr/share/doc/mysql-server-"
+"5.1/README.Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Supporto a connessioni MySQL da macchine con Debian «sarge» o antecedente"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Nelle precedenti versioni dei client MySQL su Debian le password non "
+#~ "erano memorizzate in modo sicuro. Questo è stato migliorato ma i client "
+#~ "(per esempio PHP) presenti su una macchina con Debian 3.1 Sarge non sono "
+#~ "più in grado di connettersi a un nuovo account né ad account le cui "
+#~ "password siano state cambiate."
diff --git a/storage/xtradb/build/debian/po/ja.po b/storage/xtradb/build/debian/po/ja.po
new file mode 100644
index 00000000000..16af16b4d9f
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ja.po
@@ -0,0 +1,394 @@
+#
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+#
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+# Developers do not need to manually edit POT or PO files.
+#
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.32-6\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-18 22:25+0900\n"
+"Last-Translator: Hideki Yamane (Debian-JP) <henrich@debian.or.jp>\n"
+"Language-Team: Japanese <debian-japanese@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "本当ã«ãƒ€ã‚¦ãƒ³ã‚°ãƒ¬ãƒ¼ãƒ‰ã—ã¾ã™ã‹?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"警告: /var/lib/mysql/debian-*.flag ファイルãŒå­˜åœ¨ã—ã¦ã„ã¾ã™ã€‚ã“ã‚Œã¯ã€ä»¥å‰ã«ã‚ˆ"
+"ã‚Šæ–°ã—ã„ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã® mysql-server パッケージãŒã‚¤ãƒ³ã‚¹ãƒˆãƒ¼ãƒ«ã•ã‚Œã¦ã„ãŸã“ã¨ã‚’示"
+"ã—ã¾ã™ã€‚データをã“ã®ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã§ä½¿ãˆã‚‹ã‹ã©ã†ã‹ã€ä¿è¨¼ã§ãã¾ã›ã‚“。"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "NIS/YP ユーザã¸é‡è¦ãªæ³¨æ„!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"ã“ã®ã‚¹ã‚¯ãƒªãƒ—トã¯ãƒ‡ãƒ¼ã‚¿ã®ãƒ‡ã‚£ãƒ¬ã‚¯ãƒˆãƒª /var/lib/mysql を削除ã™ã‚‹ãŸã‚ã®ã‚‚ã®ã§"
+"ã™ã€‚å˜ã«æ–°ã—ã„ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã® MySQL をインストールã—よã†ã¨ã—ã¦ã„ã‚‹ã€ã‚ã‚‹ã„ã¯åˆ¥"
+"ã® mysql-server パッケージを既ã«ä½¿ã£ã¦ã„ã‚‹å ´åˆã€ãƒ‡ãƒ¼ã‚¿ã¯ä¿æŒã™ã‚‹å¿…è¦ãŒã‚ã‚Šã¾"
+"ã™ã€‚"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "MySQL をシステム起動時ã«é–‹å§‹ã—ã¾ã™ã‹?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL ã®èµ·å‹•æ–¹æ³•ã«ã¤ã„ã¦ã€ã‚·ã‚¹ãƒ†ãƒ èµ·å‹•æ™‚ã«è‡ªå‹•çš„ã«é–‹å§‹ã™ã‚‹ã‹ã€ã‚ã‚‹ã„㯠'/etc/"
+"init.d/mysql start' ã¨æ‰‹ã§å…¥åŠ›ã—ãŸæ™‚ã®ã¿èµ·å‹•ã™ã‚‹ã‹ã‚’é¸ã¹ã¾ã™ã€‚"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "MySQL ã® \"root\" ユーザã«å¯¾ã™ã‚‹æ–°ã—ã„パスワード:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"MySQL を管ç†ã™ã‚‹ \"root\" ユーザã®ãƒ‘スワードを設定ã™ã‚‹ã“ã¨ã‚’å¼·ããŠå‹§ã‚ã—ã¾"
+"ã™ã€‚"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "MySQL ã® \"root\" ユーザã«å¯¾ã™ã‚‹æ–°ã—ã„パスワード:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "MySQL ã® \"root\" ユーザã®ãƒ‘スワードを設定ã§ãã¾ã›ã‚“"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"MySQL ã®ç®¡ç†è€…ユーザã«å¯¾ã—ã¦ãƒ‘スワードを設定ã—よã†ã¨ã—ãŸéš›ã€ã‚¨ãƒ©ãƒ¼ãŒç™ºç”Ÿã—ãŸ"
+"よã†ã§ã™ã€‚ã“ã‚Œã¯æ—¢ã«ç®¡ç†è€…ユーザã«ãƒ‘スワードãŒè¨­å®šã•ã‚Œã¦ã„ãŸã‹ã€MySQL サーãƒ"
+"ã¨ã®æŽ¥ç¶šã«å•é¡ŒãŒã‚ã£ãŸãŸã‚ã ã¨æ€ã‚ã‚Œã¾ã™ã€‚"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "See /usr/share/doc/mysql-server-5.1/README.Debian for more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"詳細㯠/usr/share/doc/mysql-server-5.1/README.Debian ã‚’å‚ç…§ã—ã¦ãã ã•ã„。"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "ISAM テーブルãŒã‚ã‚‹å ´åˆã¯ã‚¢ãƒƒãƒ—グレードã§ãã¾ã›ã‚“!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "MySQL ã®æœ€è¿‘ã®ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã§ã¯ä»¥å‰ã® ISAM テーブル形å¼ã¯åˆ©ç”¨ã§ããªããªã£ã¦ã„"
+#~ "ã¾ã™ã€‚ãã®ãŸã‚ã€ä¾‹ãˆã° \"mysql_convert_table_format\" ã‚ã‚‹ã„㯠\"ALTER "
+#~ "TABLE x ENGINE=MyISAM\" ã¨ã—ã¦ã‚¢ãƒƒãƒ—グレードå‰ã« MyISAM ã«ã‚³ãƒ³ãƒãƒ¼ãƒˆã™ã‚‹ã“"
+#~ "ã¨ãªã©ãŒå¿…è¦ã§ã™ã€‚mysql-server-5.1 ã®ã‚¤ãƒ³ã‚¹ãƒˆãƒ¼ãƒ«ã‚’中断ã—ã¾ã™ã€‚以å‰ã® "
+#~ "mysql-server-4.1 ãŒå‰Šé™¤ã•ã‚Œã¦ã—ã¾ã£ãŸå ´åˆã§ã‚ã£ã¦ã‚‚ã€ãƒ†ãƒ¼ãƒ–ルをコンãƒãƒ¼ãƒˆ"
+#~ "ã™ã‚‹ãŸã‚ã«å†ã‚¤ãƒ³ã‚¹ãƒˆãƒ¼ãƒ«ã‚’ã—ã¦ãã ã•ã„。"
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Debian \"Sarge\" ã‚ã‚‹ã„ã¯ãれよりもå¤ã„ãƒãƒ¼ã‚¸ãƒ§ãƒ³ãŒç¨¼åƒã—ã¦ã„るホストã‹ã‚‰"
+#~ "ã® MySQL 接続をサãƒãƒ¼ãƒˆã—ã¾ã™ã‹?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "パスワードã®ä¿å­˜æ–¹æ³•ã¯ã€ã‚ã¾ã‚Šå®‰å…¨ãªæ–¹æ³•ã§è¡Œã‚ã‚Œã¦ã„ã¾ã›ã‚“ã§ã—ãŸã€‚ã“ã‚Œã¯æ”¹"
+#~ "å–„ã•ã‚Œã¾ã—ãŸãŒã€Debian 3.1 Sarge ãŒç¨¼åƒã—ã¦ã„るホストã‹ã‚‰ã‚¯ãƒ©ã‚¤ã‚¢ãƒ³ãƒˆ "
+#~ "(PHP ãªã©) ãŒæ–°ã—ã„アカウントやパスワードãŒå¤‰æ›´ã•ã‚ŒãŸã‚¢ã‚«ã‚¦ãƒ³ãƒˆã«ã¯æŽ¥ç¶šã§"
+#~ "ããªããªã‚‹ã¨ã„ã†æ¬ ç‚¹ã‚‚ã§ã¦ã„ã¾ã™ã€‚詳細ã«ã¤ã„ã¦ã¯ /usr/share/doc/mysql-"
+#~ "server-5.1/README.Debian ã‚’å‚ç…§ã—ã¦ãã ã•ã„。"
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "mysql を利用ã™ã‚‹ã«ã¯ 以下ã®ãƒ¦ãƒ¼ã‚¶ã¨ã‚°ãƒ«ãƒ¼ãƒ—を作æˆã—ã€/var/lib/mysql ãŒæ­£ã—"
+#~ "ã„権é™ã«ãªã£ã¦ã„ã‚‹ã‹ã©ã†ã‹ã‚’確èªã™ã‚‹å¿…è¦ãŒã‚ã‚Šã¾ã™ (ãŠãらã uid/gid ãŒé•"
+#~ "ã„ã¾ã™)。"
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "å…¨ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã® MySQL ã§åˆ©ç”¨ã•ã‚Œã¦ã„るデータベースを削除ã—ã¾ã™ã‹?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "パスワードを入力ã—ãªã„å ´åˆã€ã‚¢ã‚«ã‚¦ãƒ³ãƒˆã«å¯¾ã—ã¦å¤‰æ›´ã¯åŠ ãˆã‚‰ã‚Œã¾ã›ã‚“。"
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "インストールãŒçµ‚了ã—ãŸéš›ã€ã‚¢ã‚«ã‚¦ãƒ³ãƒˆãŒãƒ‘スワードã§ãã¡ã‚“ã¨ä¿è­·ã•ã‚Œã¦ã„ã‚‹ã‹"
+#~ "ã©ã†ã‹ã‚’確èªã—ã¦ãã ã•ã„ (詳細ã«ã¤ã„ã¦ã¯ README.Debian ã‚’å‚ç…§ã—ã¦ãã ã•"
+#~ "ã„)。"
+
+#~ msgid "Install Hints"
+#~ msgstr "インストールã®ãƒ’ント"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Debian Woody ã§ãƒªãƒªãƒ¼ã‚¹ã•ã‚ŒãŸ MySQL 3.23 ã‹ã‚‰ã®ã‚¢ãƒƒãƒ—グレードã§ã¯ã€/var/"
+#~ "lib/mysql ã‚ã‚‹ã„㯠/var/log/mysql ã®ä»£ã‚ã‚Šã«ã‚·ãƒ³ãƒœãƒªãƒƒã‚¯ãƒªãƒ³ã‚¯ã¯å¶ç„¶ã«ã‚‚削"
+#~ "除ã•ã‚Œã¦ã—ã¾ã£ã¦ã„ã‚‹ã®ã§ã€æ‰‹å‹•ã§ã®ãƒªã‚¹ãƒˆã‚¢ãŒå¿…è¦ã«ãªã‚Šã¾ã™ã€‚"
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL 㯠/etc/hosts ファイル経由ã§è§£æ±ºã§ãる「数字ã®ã¿ã§æ§‹æˆã•ã‚Œã¦ãªã„ã€ãƒ›"
+#~ "ストåã®å ´åˆã®ã¿ã€ã‚¤ãƒ³ã‚¹ãƒˆãƒ¼ãƒ«ã•ã‚Œã¾ã™ã€‚ã¤ã¾ã‚Šã€\"hostname\" コマンド㌠"
+#~ "\"myhostname\" ã‚’è¿”ã™ãªã‚‰ã€\"10.0.0.1 myhostname\" ã¨ã„ã†è¡ŒãŒ /etc/hosts "
+#~ "ファイルã«ã‚ã‚‹ã¯ãšã§ã™ã€‚"
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "æ–°è¦ã« mysql ユーザã¨ã—㦠\"debian-sys-maint\" ãŒä½œæˆã•ã‚Œã¾ã™ã€‚ã“ã® mysql "
+#~ "アカウント㯠start/stop 時㨠cron スクリプトã§åˆ©ç”¨ã•ã‚Œã¾ã™ã€‚消ã•ãªã„ã§ãã "
+#~ "ã•ã„。"
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "MySQL ã® root ユーザã«å¯¾ã—ã¦ã€Œãƒ‘スワードã®è¨­å®šã€ã‚’忘れãªã„ã§ãã ã•ã„! /"
+#~ "root/.my.cnf を使ã£ã¦ã„ã‚‹å ´åˆã€ã“ã®ãƒ•ã‚¡ã‚¤ãƒ«ä¸­ã® \"user\" 行㨠\"password"
+#~ "\" 行を記述ã—ã¦ãã ã•ã„。決ã—ã¦ãƒ‘スワードã ã‘ã§ã¯ã„ã‘ã¾ã›ã‚“!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "ã“ã‚Œã‹ã‚‰ purge ã—よã†ã¨ã—ã¦ã„ã‚‹ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã ã‘ã§ã¯ç„¡ãã€å…¨ã¦ã®ãƒãƒ¼ã‚¸ãƒ§ãƒ³ã® "
+#~ "MySQL ㌠/var/lib/mysql ディレクトリを使用ã—ã¦ã„ã¾ã™ã€‚ã“ã®ãƒ‡ã‚£ãƒ¬ã‚¯ãƒˆãƒªã‚’完"
+#~ "å…¨ã«å‰Šé™¤ã—ã¾ã™ã‹?"
+
+#~ msgid "Update Hints"
+#~ msgstr "æ›´æ–°ã®ãƒ’ント"
+
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "ã¾ã‚Œã«ã€ã¤ã¾ã‚Šã¯æ–°ã—ã„メジャーãƒãƒ¼ã‚¸ãƒ§ãƒ³ã«ãŠã„ã¦ã€ç‰¹æ¨©ã‚·ã‚¹ãƒ†ãƒ ãŒæ”¹å–„ã•ã‚Œã¾"
+#~ "ã™ã€‚ã“れを行ã†ã«ã¯ã€mysql_fix_privilege_table を手動ã§å®Ÿè¡Œã™ã‚‹å¿…è¦ãŒã‚ã‚Šã¾"
+#~ "ã™ã€‚スクリプトã¯ã€ã©ã®ã‚ˆã†ãªãƒ¦ãƒ¼ã‚¶ã«å¯¾ã—ã¦ã‚‚ã€ä»¥å‰ã«ä¿æŒã—ã¦ã„ãŸä»¥ä¸Šã®æ¨©é™"
+#~ "を与ãˆã‚‹ã‚ˆã†ã«ã¯ãªã£ã¦ã„ã¾ã›ã‚“。"
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "http://www.mysql.com/doc/ja/Upgrade.html ã‚‚å‚ç…§ã—ã¦ãã ã•ã„"
diff --git a/storage/xtradb/build/debian/po/nb.po b/storage/xtradb/build/debian/po/nb.po
new file mode 100644
index 00000000000..1235cdc1b2c
--- /dev/null
+++ b/storage/xtradb/build/debian/po/nb.po
@@ -0,0 +1,297 @@
+# translation of mysql_nb.po to Norwegian Bokmål
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+#
+# Bjørn Steensrud <bjornst@powertech.no>, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql_nb\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-18 12:13+0100\n"
+"Last-Translator: Bjørn Steensrud <bjornst@powertech.no>\n"
+"Language-Team: Norwegian Bokmål <i18n-nb@lister.ping.uio.no>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.2\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Er du sikker på at du vil nedgradere?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"ADVARSEL: Fila /var/lib/mysql/debian-*.flag finnes. Dette viser at en mysql-"
+"server-pakke med et høyere versjonsnummer har vært installert før. Det kan "
+"ikke garanteres at denne versjonen kan bruke data fra den høyere versjonen."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Viktig merknad for NIS/YP-brukere!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Dette skriptet skal til å fjerne data-mappa /var/lib/mysql. Denne mappa bør "
+"beholdes hvis det bare skal installeres en høyere MySQL-versjon, eller hvis "
+"en annen mysql-server-pakke allerede bruker den."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Skal MySQL startes ved maskinoppstart?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL kan startes automatisk når maskinen starter, eller bare hvis du "
+"skriver «/etc/init.d/mysql start»."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nytt passord for MySQLs «root»-bruker:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Det anbefales sterkt at du oppgir et passord for den administrative «root»-"
+"brukeren i MySQl."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nytt passord for MySQLs «root»-bruker:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Klarer ikke angi passord for MySQLs «root»-bruker"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Det ser ut til at det oppsto en feil mens det ble satt et passord for MySQLs "
+"administrative bruker. Dette kan være fordi brukeren allerede har et "
+"passord, eller fordi det var et kommunikasjonsproblem med MySQL-tjeneren."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Skal MySQL-tilkoblinger støttes fra vertsmaskiner som kjører Debian "
+#~ "«sarge» eller eldre?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Passord ble tidligere lagret på en lite sikker måte. Dette er nå "
+#~ "forbedret, med den ulempen at klienter (f.eks. PHP) fra verter som kjører "
+#~ "Debian 3.1 Sarge ikke vil kunne koble til en konto som er ny eller har "
+#~ "fått endret passordet. Se /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "For å bruke MySQL må du installere en bruker og gruppe tilsvarende den "
+#~ "nedenfor og se til at /var/lib/mysql har riktige rettigheter (uid/gid kan "
+#~ "være forskjellig)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Skal databasene brukt av alle MySQL-versjoner fjernes?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Hvis du ikke oppgir et passord blir det ikke gjort noen endringer med "
+#~ "kontoen."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Når installasjonen er ferdig bør det sjekkes at kontoen er ordentlig "
+#~ "beskyttet med et passord (mer informasjon finnes i README.Debian)."
diff --git a/storage/xtradb/build/debian/po/nl.po b/storage/xtradb/build/debian/po/nl.po
new file mode 100644
index 00000000000..bfe418117e6
--- /dev/null
+++ b/storage/xtradb/build/debian/po/nl.po
@@ -0,0 +1,302 @@
+# Dutch mysql-dfsg-5.1 po-debconf translation,
+# Copyright (C) 2006 THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Vincent Zweije <zweije@xs4all.nl>, 2006.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.30-1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2006-02-19 10:20+0100\n"
+"Last-Translator: Thijs Kinkhorst <thijs@debian.org>\n"
+"Language-Team: Debian-Dutch <debian-l10n-dutch@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Wilt u echt een oude versie herstellen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Waarschuwing: waarschijnlijk is een hogere versie van het mysql-server "
+"pakket geïnstalleerd geweest (het bestand /var/lib/mysql/debian-*.flag "
+"bestaat). Er is geen garantie dat de gegevensbestanden, bewerkt met die "
+"hogere versie, kunnen worden gebruikt met de versie van mysql die u nu "
+"installeert."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Belangrijke opmerking voor gebruikers van NIS/YP!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Het script staat op het punt de datamap /var/lib/mysql te verwijderen. Als "
+"het plan alleen maar is om een hogere MySQL versie te installeren, of als "
+"een ander mysql-serverpakket de datamap al gebruikt, dan zou de data moeten "
+"worden behouden."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Moet MySQL starten als de computer start?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL kan automatisch starten bij het starten van de computer, of slechts "
+"wanneer u '/etc/init.d/mysql start' handmatig uitvoert."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nieuw wachtwoord voor de MySQL \"root\"-gebruiker:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Het wordt sterk aangeraden een wachtwoord in te stellen voor de "
+"administratieve MySQL \"root\"-gebruiker."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nieuw wachtwoord voor de MySQL \"root\"-gebruiker:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Kan het wachtwoord voor de MySQL \"root\"-gebruiker niet instellen"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Er lijkt een fout te zijn opgetreden bij het instellen van het wachtwoord "
+"van de MySQL administratieve gebruiker. Dat kan komen doordat de gebruiker "
+"al een wachtwoord heeft, of omdat er een probleem was bij het communiceren "
+"met de MySQL server."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Moet u MySQL-verbindingen accepteren van computers die Debian \"sarge\" "
+#~ "of ouder draaien?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "De wijze waarop wachtwoorden werden bewaard was niet erg veilig. Dit is "
+#~ "verbeterd, maar helaas zullen programma's van computers die Debian 3.1 "
+#~ "Sarge draaien, geen verbinding meer kunnen maken met accounts die nieuw "
+#~ "zijn, of waarvan het wachtwoord is gewijzigd. Zie /usr/share/doc/mysql-"
+#~ "server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Om mysql te gebruiken dient u een gebruiker en groep aan te maken, "
+#~ "gelijkwaardig aan onderstaand voorbeeld, en u dient ervoor te zorgen dat /"
+#~ "var/lib/mysql de bijbehorende toegangsrechten heeft (uid en gid mogen "
+#~ "anders zijn)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Databases verwijderen die door alle MySQL versies worden gebruikt?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Indien u geen wachtwoord opgeeft zal het account niet worden gewijzigd."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Wanneer de installatie klaar is, dient u te verifiëren dat het account "
+#~ "netjes beschermd is met een wachtwoord (zie README.Debian voor meer "
+#~ "informatie)."
diff --git a/storage/xtradb/build/debian/po/pt.po b/storage/xtradb/build/debian/po/pt.po
new file mode 100644
index 00000000000..3372b11b06b
--- /dev/null
+++ b/storage/xtradb/build/debian/po/pt.po
@@ -0,0 +1,322 @@
+# Portuguese translation for mysql-dfsg-5.1's debconf messages
+# Copyright (C) 2006 Miguel Figueiredo <elmig@debianpt.org>
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Miguel Figueiredo <elmig@debianpt.org>
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-05 21:01+0100\n"
+"Last-Translator: Miguel Figueiredo <elmig@debianpt.org>\n"
+"Language-Team: Portuguese <traduz@debianpt.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Deseja mesmo fazer downgrade?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Existe um ficheiro chamado /var/lib/mysql/debian-*.flag neste sistema."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Tal ficheiro significa que anteriormente foi instalado um pacote mysql-"
+"server com um número de versão superior."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Não existe nenhuma garantia que a versão que está actualmente a instalar "
+"seja capaz de utilizar as bases de dados actuais."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante para utilizadores de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Para utilizar o MySQL, têm de ser acrescentadas as seguintes entradas para "
+"os utilizadores e grupos:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Deve também verificar as permissões e o dono do directório /var/lib/mysql :"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Remover todas as bases de dados MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"O directório /var/lib/mysql que contém as bases de dados MySQL está prestes "
+"a ser removido."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Se está a remover o pacote MySQL de modo a posteriormente instalar uma "
+"versão mais recente ou se um pacote mysql-server já os está a utilizar, "
+"os dados devem ser mantidos."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Iniciar o servidor MySQL no arranque?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"O MySQL pode ser automaticamente lançado no arranque ou manualmente através "
+"do comando '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nova palavra-passe para o utilizador \"root\" do MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Embora não seja mandatório, é fortemente recomendado que defina uma palavra-"
+"passe para o utilizador administrativo \"root\" do MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+"Se esse campo for deixado em branco, a palavra-passe não irá ser alterada."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nova palavra-passe para o utilizador \"root\" do MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+"Não foi possível definir a palavra-passe para o utilizador \"root\" do MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Ocorreu um erro enquanto era definida a palavra-passe para o utilizador "
+"administrativo do MySQL. Isto pode ter acontecido porque a conta já tem uma "
+"palavra-passe, ou porque ocorreu um problema de comunicação com o servidor "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Você deve verificar a palavra-passe da conta após a instalação do pacote."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Para mais informação por favor leia o ficheiro /usr/share/doc/mysql-server-"
+"5.1/README.Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
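The template above only tells the administrator to strip the "ndb" options by hand. A minimal shell sketch of that clean-up, assuming the stock /etc/mysql/ layout and GNU sed; the backup suffix and the choice to comment rather than delete are illustrative, not part of the package:

    # Find every config file under /etc/mysql/ that still sets an NDB option
    # and comment those lines out, keeping a .bak copy of each file touched.
    grep -rl '^[[:space:]]*ndb' /etc/mysql/ | while read -r f; do
        sed -i.bak 's/^[[:space:]]*ndb/# &/' "$f"
    done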
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Não é possível actualizar se estiverem presentes tabelas ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "As versões recentes de MySQL já não podem utilizar o antigo formato de "
+#~ "tabelas ISAM e é por isso necessário converter as suas tabelas para e.g. "
+#~ "MyISAM antes da actualização, utilizando \"mysql_convert_table_format\" "
+#~ "ou \"ALTER TABLE x ENGINE=MyISAM\". A instalação de mysql-server-5.1 irá "
+#~ "agora ser cancelada. Se o seu antigo mysql-server-4.1 for removido apenas "
+#~ "reinstale para converter essas tabelas."
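For reference, the two conversion routes named in the message look roughly like this; the table and database names are placeholders and the exact helper options may differ between releases:

    # Convert a single old ISAM table in place...
    mysql -u root -p -e 'ALTER TABLE mydb.mytable ENGINE=MyISAM'
    # ...or sweep a whole database with the bundled helper script.
    mysql_convert_table_format --user=root --password --type=MyISAM mydb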
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Suportar ligações MySQL de máquinas que corram Debian \"sarge\" ou mais "
+#~ "antigos?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Nas versões antigas de clientes de MySQL em Debian, as palavras-passe não "
+#~ "eram guardadas de forma segura. Isto foi melhorado desde aí, no entanto "
+#~ "os clientes (como o PHP) de máquinas que corram Debian 3.1 Sarge não irão "
+#~ "conseguir ligar-se a contas novas ou cuja palavra-passe foi alterada."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Para utilizar mysql deve instalar um utilizador e grupo equivalentes ao "
+#~ "seguinte e assegurar-se que /var/lib/mysql tem as permissões correctas (o "
+#~ "uid/gid podem ser diferentes)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
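The three entries above describe the account the server runs as. A sketch of creating an equivalent user and group on a system that lacks them, using the standard Debian tools; the uid/gid are allocated automatically and may differ from the 100/101 shown:

    # Create the mysql system group and user with the home and shell shown above.
    addgroup --system mysql
    adduser --system --ingroup mysql --home /var/lib/mysql \
            --shell /bin/false --disabled-login mysql
    # Give the data directory the owner and mode from the template (drwxr-xr-x).
    chown mysql:mysql /var/lib/mysql
    chmod 755 /var/lib/mysql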
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Remover as bases de dados utilizadas por todas as versões de MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Se não disponibilizar uma password não serão feitas alterações nesta "
+#~ "conta."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Quando terminar a instalação, deve verificar se a conta está devidamente "
+#~ "protegida com uma password (para mais informações veja README.Debian)."
diff --git a/storage/xtradb/build/debian/po/pt_BR.po b/storage/xtradb/build/debian/po/pt_BR.po
new file mode 100644
index 00000000000..e04a5a2393d
--- /dev/null
+++ b/storage/xtradb/build/debian/po/pt_BR.po
@@ -0,0 +1,458 @@
+# Brazilian Portuguese (pt_BR) debconf template translation for
+# Debian's mysql-dfsg source package.
+# Debian-BR Project <debian-l10n-portuguese@lists.debian.org>
+# André Luís Lopes, <andrelop@debian.org> , 2004
+# André Luís Lopes, <andrelop@debian.org> , 2006
+# André Luís Lopes, <andrelop@debian.org> , 2007
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-21 15:59-0300\n"
+"Last-Translator: André Luís Lopes <andrelop@debian.org>\n"
+"Language-Team: Debian-BR Project <debian-l10n-portuguese@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"pt_BR utf-8\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Realmente proceder com o rebaixamento de versão?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Um arquivo de nome /var/lib/mysql/debian-*.flag existe no sistema."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"A presença de um arquivo como este é uma indicação de que um pacote mysql-"
+"server com um número de versão mais alto já foi instalado anteriormente."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Não há garantias de que a versão que você está instalando no momento "
+"conseguirá utilizar as bases de dados existentes."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Aviso importante para usuários NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Para utilizar o MySQL, as seguintes entradas para usuários e grupos devem "
+"ser adicionadas ao sistema:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Você deverá também checar as permissões e o dono do diretório /var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Remover todas as bases de dados do MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"O diretório /var/lib/mysql, o qual contém as bases de dados do MySQL, está "
+"prestes a ser removido."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Caso você esteja removendo o pacote MySQL para posteriormente instalar uma "
+"versão mais recente ou, caso uma versão diferente do pacote mysql-server "
+"esteja sendo utilizada, os dados deverão ser mantidos."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Iniciar o servidor MySQL junto a inicialização da máquina?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"O servidor MySQL pode ser iniciado automaticamente junto a inicialização da "
+"máquina ou manualmente com o comando '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nova senha para o usuário \"root\" do MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Apesar de não ser mandatório, é altamente recomendado que você defina uma "
+"senha para o usuário administrativo \"root\" do MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Caso este campo seja deixado em branco, a senha não será mudada."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nova senha para o usuário \"root\" do MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Impossível definir senha para o usuário \"root\" do MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Um erro ocorreu durante a definição da senha para o usuário administrativo "
+"do MySQL. Isso pode ter acontecido devido a esse usuário já possuir uma "
+"senha definida ou devido a ocorrência de um problema de comunicação com o "
+"servidor MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "Você deverá checar a senha dessa conta após a instalação deste pacote."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Por favor, leia o arquivo /usr/share/doc/mysql-server-5.1/README.Debian para "
+"maiores informações."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Suportar conexões MySQL originadas de hosts executando o Debian \"sarge\" "
+#~ "ou mais antigos ?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Em versões antigas dos clientes MySQL no Debian, as senhas não eram "
+#~ "armazenadas de forma segura. Isto foi corrigido desde então, porém, "
+#~ "clientes (como o PHP) em hosts executando o Debian 3.1 Sarge não serão "
+#~ "capazes de conectar em contas recentes ou contas as quais as senhas "
+#~ "tenham sido modificadas."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Para utilizar o MySQL, você deve instalar um usuário e um grupo "
+#~ "equivalentes ao usuário e grupo a seguir para se certificar de que o "
+#~ "diretório /var/lib/mysql possua as permissões correctas (o uid/gid podem "
+#~ "ser diferentes)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Remover as bases de dados utilizadas por todas as versões do MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Caso você não forneça uma senha, nenhuma mudança será feita na conta."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Quando a instalação finalizar, você deverá verificar se a conta está "
+#~ "apropriadamente protegida com uma senha (consulte o arquivo README.Debian "
+#~ "para maiores informações)."
+
+#~ msgid "internal"
+#~ msgstr "interno"
+
+#~ msgid "Only internally used."
+#~ msgstr "Somente utilizado internamente."
+
+#, fuzzy
+#~ msgid "Update Hints"
+#~ msgstr "Dicas de atualização"
+
+#, fuzzy
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Raramente, por exemplo, em novas versões maiores, o sistema de "
+#~ "privilégios é melhorado. Para fazer uso disso, o script "
+#~ "mysql_fix_privilege_tables deve ser executado manualmente. O script não "
+#~ "atribuirá a nenhum usuário mais direitos do que os mesmos já possuíam "
+#~ "anteriormente."
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Por favor, leia http://www.mysql.com/doc/en/Upgrade.html"
+
+#, fuzzy
+#~ msgid "Install Hints"
+#~ msgstr "Dicas de instalação"
+
+#, fuzzy
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "O MySQL será instalado somente caso você possua um nome de host NÃO "
+#~ "NUMÉRICO que possa ser resolvido através do arquivo /etc/hosts, ou seja, "
+#~ "caso o comando \"hostname\" retorne \"myhostname\", uma linha como "
+#~ "\"10.0.0.1 myhostname\" deverá existir no arquivo /etc/hosts."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Um novo usuário MySQL de nome \"debian-sys-maint\" será criado. Essa "
+#~ "conta MySQL é utilizada pelos scripts de inicialização/parada e pelos "
+#~ "scripts cron. Não remova esse usuário."
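The credentials for that maintenance account live in /etc/mysql/debian.cnf on a stock Debian install; a quick, hedged way to confirm the account still works after an upgrade (the query itself is only an example):

    # Connect with the maintenance credentials and show which account answered.
    mysql --defaults-file=/etc/mysql/debian.cnf -e 'SELECT CURRENT_USER();'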
+
+#, fuzzy
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Por favor, lembre-se de definir uma SENHA para o usuário root do MySQL ! "
+#~ "Caso você utilize um arquivo /root/.my.cnf, sempre inclua as linhas \"user"
+#~ "\" e \"password\" nesse arquivo, nunca somente a senha ! Consulte o "
+#~ "arquivo /usr/share/doc/mysql-server/README.Debian para mais informações."
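A minimal sketch of the /root/.my.cnf the note asks for, with both the "user" and "password" lines present; the password value is a placeholder and the file should stay readable by root only:

    umask 077   # keep the new file private to root
    printf '[client]\nuser     = root\npassword = CHANGE_ME\n' > /root/.my.cnf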
+
+#~ msgid ""
+#~ "Should I remove all databases below /var/lib/mysql as you are purging the "
+#~ "mysql-server package?"
+#~ msgstr ""
+#~ "Todas as bases de dados sob o diretório /var/lib/mysql devem ser removidas "
+#~ "quando você remover o pacote mysql-server?"
+
+#~ msgid ""
+#~ "Networking is disabled by default for security reasons. You can enable it "
+#~ "by commenting out the skip-networking option in /etc/mysql/my.cnf."
+#~ msgstr ""
+#~ "O suporte ao funcionamento em rede está desativado por padrão por "
+#~ "questões de segurança. Você poderá ativá-lo comentando a opção 'skip-"
+#~ "networking' no arquivo /etc/mysql/my.cnf."
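The change the message describes is a one-line edit; a sketch using GNU sed against the default config path, commenting the option out rather than deleting it:

    # Comment out skip-networking so the server listens on TCP again, then restart.
    sed -i 's/^skip-networking/# skip-networking/' /etc/mysql/my.cnf
    /etc/init.d/mysql restart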
+
+#~ msgid "security and update notice"
+#~ msgstr "aviso de segurança e actualização"
+
+#~ msgid ""
+#~ "Should I remove everything below /var/lib/mysql when you purge the mysql-"
+#~ "server package with the \"dpkg --purge mysql-server\" command (i.e. "
+#~ "remove everything including the configuration) somewhen? (default is not)"
+#~ msgstr ""
+#~ "Devo remover tudo abaixo de /var/lib/mysql quando fizer o purge do pacote "
+#~ "mysql-server com o comando \"dpkg --purge mysql-server\" (ou seja, "
+#~ "remover tudo incluíndo a configuração)? (o padrão é não remover)"
+
+#~ msgid "Make MySQL reachable via network?"
+#~ msgstr "Fazer com que o MySQL seja acessível via rede?"
+
+#~ msgid ""
+#~ "Should MySQL listen on a network reachable TCP port? This is not "
+#~ "necessary for use on a single computer and could be a security problem."
+#~ msgstr ""
+#~ "O MySQL deve aguardar ligações numa porta TCP acessível via rede? Isto "
+#~ "não é necessário para uso num único computador e pode ser um problema de "
+#~ "segurança."
+
+#~ msgid "Enable chroot mode?"
+#~ msgstr "Activar o modo chroot?"
+
+#~ msgid ""
+#~ "MySQL is able to jail itself into the /var/lib/mysql_jail directory so "
+#~ "that users cannot modify any files outside this directory. This improves "
+#~ "resistence against crackers, too, as they are not able to modify system "
+#~ "files."
+#~ msgstr ""
+#~ "O MySQL é capaz de se prender no diretório /var/lib/mysql_jail, assim os "
+#~ "utilizadores não poderão modificar ficheiros fora deste directório. Isto "
+#~ "aumenta também a resistência contra crackers, pois eles não poderão "
+#~ "modificar arquivos de sistema."
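The jail described above is switched on through mysqld's chroot option. A hedged sketch, assuming the usual !includedir /etc/mysql/conf.d/ mechanism is in place; the drop-in file name is arbitrary:

    # Point mysqld at the jail directory named in the message, then restart.
    printf '[mysqld]\nchroot = /var/lib/mysql_jail\n' > /etc/mysql/conf.d/chroot.cnf
    /etc/init.d/mysql restart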
+
+#~ msgid "Please run mysql_fix_privilege_tables !"
+#~ msgstr "Por favor execute mysql_fix_privilege_tables !"
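If the helper script is unavailable or fails, the same grant-table update can usually be applied by feeding the SQL file shipped with the server to a root connection; the path below is the customary location but may vary by release:

    # Apply the privilege-table updates directly from the bundled SQL script.
    mysql -u root -p mysql < /usr/share/mysql/mysql_fix_privilege_tables.sql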
+
+#~ msgid ""
+#~ "I will ensure secure permissions of /var/lib/mysql by replacing GIDs "
+#~ "other than root and mysql with mysql."
+#~ msgstr ""
+#~ "Permissões seguras para o diretório /var/lib/mysql serão asseguradas "
+#~ "substituíndo GIDs diferentes de root e mysql por mysql."
+
+#~ msgid ""
+#~ "Instructions how to enable SSL support are in /usr/share/doc/mysql-server/"
+#~ msgstr ""
+#~ "Instruções sobre como activar o suporte de SSL estão disponíveis no "
+#~ "directório /usr/share/doc/mysql-server/."
+
+#, fuzzy
+#~ msgid "mysql_fix_privileges_tables should be executed"
+#~ msgstr "mysql_fix_privileges_tables será executado"
+
+#, fuzzy
+#~ msgid ""
+#~ "The latest MySQL versions have an enhanced, more fine grained, privilege "
+#~ "system. To make use of it, some new fields must be added to the tables "
+#~ "in the \"mysql\" database. This will not happen automatically."
+#~ msgstr ""
+#~ "As últimas versões do MySQL possuem um sistema de privilégios melhorado e "
+#~ "mais refinado. Para utilizá-lo, alguns novos campos devem ser adicionados "
+#~ "as tabelas na base de dados \"mysql\". Isto é feito pelo script "
+#~ "mysql_fix_privileges_tables durante esta actualização independente do "
+#~ "servidor estar a correr ou não !"
+
+#~ msgid ""
+#~ "This script is not supposed to give any user more rights that he had "
+#~ "before, if you encounter such a case, please contact me."
+#~ msgstr ""
+#~ "Este script não deverá fornecer mais direitos a um utilizador além dos "
+#~ "quais ele já possua anteriormente. Se encontrar um caso desses, por favor "
+#~ "entre em contacto com o maintainer deste pacote Debian."
diff --git a/storage/xtradb/build/debian/po/ro.po b/storage/xtradb/build/debian/po/ro.po
new file mode 100644
index 00000000000..37902bfd913
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ro.po
@@ -0,0 +1,319 @@
+# Romanian translation of mysql-dfsg.
+# Copyright (C) 2006 THE mysql-dfsg'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the mysql-dfsg package.
+#
+# Stan Ioan-Eugen <stan.ieugen@gmail.com>, 2006.
+msgid ""
+msgstr ""
+"Project-Id-Version: po-debconf://mysql-dfsg\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2006-12-20 21:27+0200\n"
+"Last-Translator: stan ioan-eugen <stan.ieugen@gmail.com>\n"
+"Language-Team: romanian <debian-l10n-romanian@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Sunteţi sigur că doriţi să instalaţi o versiune mai veche?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"AVERTISMENT: Fişierul /var/lib/mysql/debian-*.flag există. Acest lucru "
+"indică faptul că anterior a fost instalată o versiune nouă a pachetului "
+"mysql-server. Nu se poate garanta că versiunea instalată acum poate folosi "
+"datele versiunii instalate anterior."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Notă importantă pentru utilizatorii NIS/YP!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Scriptul urmează să şteargă directorul de date /var/lib/mysql. Dacă plănuiţi "
+"doar să instalaţi o versiune nouă MySQL sau datele sunt folosite de către un "
+"alt pachet mysql-server, atunci ar trebui păstraţi datele."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Doriţi ca MySQL să pornească la iniţializarea sistemului?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL poate porni automat la iniţializarea sistemului sau doar dacă rulaţi "
+"comanda „/etc/init.d/mysql start”."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Noua parolă pentru utilizatorul „root” al MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Este recomandat să stabiliţi o parolă pentru utilizatorul administrativ "
+"„root” al MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Noua parolă pentru utilizatorul „root” al MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Nu s-a putut stabili parola pentru utilizatorul „root” al MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Se pare că a intervenit o eroare în stabilirea parolei pentru utilizatorul "
+"administrativ al MySQL. Acest lucru se poate întâmpla dacă utilizatorul are "
+"deja o parolă, sau a existat o problemă în comunicarea cu serverul MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Nu se poate face actualizarea dacă sunt prezente tabele ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Versiunile recente MySQL nu mai pot folosi vechiul format de tabele ISAM "
+#~ "şi este necesar să convertiţi tabelele dumneavoastră de ex. la formatul "
+#~ "MyISAM înainte de a face actualizarea folosind comanda "
+#~ "„mysql_convert_table_format” sau „ALTER TABLE x ENGINE=MyISAM”. "
+#~ "Instalarea mysql-server-5.1 va eşua. În caz că ştergeţi versiunea "
+#~ "anterioară mysql-server-4.1 va trebui reinstalată pentru a converti "
+#~ "tabelele."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Suportaţi conexiuni MySQL de la staţii ce rulează sistemul Debian „sarge” "
+#~ "sau mai vechi?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Modul în care erau păstrate parolele nu era foarte sigur. Acest lucru a "
+#~ "fost îmbunătăţit cu dezavantajul că clienţii (de ex. PHP) de pe staţii ce "
+#~ "rulează sistemul Debian 3.1 Sarge nu se vor putea conecta la conturi noi "
+#~ "sau ale căror parole au fost schimbate. Citiţi /usr/share/doc/mysql-"
+#~ "server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Pentru a folosi mysql trebuie să adăugaţi un utilizator şi grup "
+#~ "echivalent şi să vă asiguraţi că /var/lib/mysql are permisiunile "
+#~ "stabilite corect (uid/gid pot avea valori diferite)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:\tmysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group:\tmysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql:\tdrwxr-xr-x\tmysql\tmysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Doriţi să ştergeţi bazele de date folosite de toate versiunile MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Dacă nu introduceţi nici o parolă, nici o schimbare nu va fi luată în "
+#~ "considerare."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "După finalizarea instalării, ar trebui să verificaţi dacă contul este "
+#~ "protejat cu o parolă (citiţi fişierul README.Debian pentru informaţii "
+#~ "suplimentare)."
diff --git a/storage/xtradb/build/debian/po/ru.po b/storage/xtradb/build/debian/po/ru.po
new file mode 100644
index 00000000000..14867533070
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ru.po
@@ -0,0 +1,305 @@
+# translation of mysql-dfsg-5.1_5.0.32-6_ru.po to Russian
+# Russian messages:
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'#
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans#
+# Developers do not need to manually edit POT or PO files.
+# Ilgiz Kalmetev <translator@ilgiz.pp.ru>, 2003.
+# Yuriy Talakan' <yt@amur.elektra.ru>, 2005, 2006.
+# Yuriy Talakan' <yt@drsk.ru>, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.32-6_ru\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-19 11:28+0900\n"
+"Last-Translator: Yuriy Talakan' <yt@drsk.ru>\n"
+"Language-Team: Russian <debian-l10n-russian@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.9.1\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Вы действительно желаете понизить версию?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"ВНИМАНИЕ: Найден файл /var/lib/mysql/debian-*.flag. Это означает, что ранее "
+"был установлен пакет mysql-server более высокой версии. Невозможно "
+"гарантировать, что текущая версия сможет использовать его данные."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Важное замечание для пользователей NIS/YP!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Сценарий собирается удалить директорию данных /var/lib/mysql. Если "
+"планируется установить новую версию MySQL или есть другие пакеты mysql-"
+"server, использующие эту директорию, то данные надо сохранить."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Запускать MySQL при загрузке системы?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL может запускаться при загрузке системы, либо только если вы вручную "
+"введете команду '/etc/init.d/mysql start'. "
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Новый пароль для MySQL пользователя \"root\":"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Крайне рекомендуется установить пароль для административного MySQL "
+"пользователя \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Новый пароль для MySQL пользователя \"root\":"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Невозможно задать пароль MySQL пользователю \"root\""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"В процессе задания пароля административного MySQL пользователя произошла "
+"ошибка. Это могло произойти если у пользователя уже был задан пароль, или "
+"из-за проблем соединения с сервером MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Нужна поддержка подключений к MySQL с машин, работающих под Debian \"sarge"
+#~ "\" или старше?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Метод хранения пароля был не очень безопасен. Это было сделано из-за "
+#~ "того, клиенты (например, PHP) с машин, работающих под Debian 3.1 Sarge не "
+#~ "смогут подключиться к учетной записи если она новая, или пароль был "
+#~ "изменен. См. /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Чтобы использовать mysql, Вы должны установить эквивалентные пользователя "
+#~ "и группу, как указано ниже и убедиться, что /var/lib/mysql имеет "
+#~ "правильные права (uid/gid могут отличаться)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Удалить базы данных, используемые всеми версиями MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Если вы не зададите пароль, то учетная запись не будет изменена."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Когда установка завершится, вы должны убедиться, что учетная запись "
+#~ "защищена паролем (подробную информацию см. в README.Debian)."
diff --git a/storage/xtradb/build/debian/po/sv.po b/storage/xtradb/build/debian/po/sv.po
new file mode 100644
index 00000000000..ea9da131e58
--- /dev/null
+++ b/storage/xtradb/build/debian/po/sv.po
@@ -0,0 +1,400 @@
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans
+# Developers do not need to manually edit POT or PO files.
+# , fuzzy
+#
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.21-3\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-18 14:48+0100\n"
+"Last-Translator: Andreas Henriksson <andreas@fatal.se>\n"
+"Language-Team: Swedish <tp-sv@listor.tp-sv.se>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=iso-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Poedit-Language: Swedish\n"
+"X-Poedit-Country: SWEDEN\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Vill du verkligen nedgradera?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"VARNING: Filen /var/lib/mysql/debian-*.flag existerar. Detta betyder att "
+"paketet mysql-server med högre versionsnummer har installerats tidigare. Det "
+"kan inte garanteras att denna version kan använda dess data."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Viktig notering för NIS/YP-användare!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Scriptet kommer strax ta bort data-katalogen /var/lib/mysql. Om det "
+"planerade var att bara installera en högre MySQL-version eller om ett annat "
+"mysql-server paket redan använde det, skall datan sparas."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Ska MySQL startas när systemet startar upp?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL kan startas när systemet startas upp eller endast om du manuellt "
+"skriver '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nytt lösenord för MySQLs \"root\"-användare:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Det är starkt rekommenderat att du sätter ett lösenord för MySQLs "
+"administrativa \"root\"-användare."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nytt lösenord för MySQLs \"root\"-användare:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Lyckades inte sätta lösenord för MySQLs \"root\"-användare"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user. This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Det verkar som ett fel uppstod när det skulle sättas ett lösenord för MySQLs "
+"administrativa användare. Detta kan ha skett för att användaren redan har "
+"ett lösenord satt, eller på grund av problem med att kommunicera med MySQL-"
+"servern."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "See /usr/share/doc/mysql-server-5.1/README.Debian for more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr "Se /usr/share/doc/mysql-server-5.1/README.Debian för mer information."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Kan inte uppgradera om ISAM-tabeller finns!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Senaste versionerna av MySQL kan inte längre använda gamla ISAM-"
+#~ "tabellformatet och det är nödvändigt att konvertera dina tabeller till "
+#~ "exempelvis MyISAM före uppgradering med \"mysql_convert_table_format\" "
+#~ "eller \"ALTER TABLE x ENGINE=MyISAM\". Installationen av mysql-server-5.1 "
+#~ "kommer nu att avbrytas. Om ditt gamla mysql-server-4.1-paket tas bort är "
+#~ "det bara att installera om det för att konvertera de tabellerna."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Behöver du MySQL-anslutningar från system som kör Debian \"Sarge\" eller "
+#~ "äldre?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Sättet som lösenorden lagrades på var inte särskilt säkert. Detta har "
+#~ "förbättrats på bekostnad av att klienter (t.ex. PHP) från system som kör "
+#~ "Debian 3.1 Sarge inte kan ansluta till konton som är nya eller vars "
+#~ "lösenord har ändrats. Se /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "För att använda MySQL måste du installera en motsvarande användare och "
+#~ "grupp till följande och se till att /var/lib/mysql har korrekta "
+#~ "rättigheter satta (uid/gid kan vara olika)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Ta bort databaserna som används av alla MySQL-versioner?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Om du inte anger ett lösenord kommer inga ändringar att göras för kontot."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "När installationen är klar, bör du kontrollera så att kontot är riktigt "
+#~ "skyddat av ett lösenord (läs README.Debian för mer information)."
+
+#~ msgid "Update Hints"
+#~ msgstr "Uppdateringstips"
+
+#~ msgid ""
+#~ "You have to run \"mysql_upgrade\" after the upgrade, else tables can be "
+#~ "corrupted! This script also enhances the privilege tables but is not "
+#~ "supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Du måste köra \"mysql_upgrade\" efter uppgraderingen, annars kan "
+#~ "tabellerna vara skadade! Detta skript utökar även privilegietabellerna "
+#~ "men är inte tänkte att ge någon användare mer befogenhet än vad han hade "
+#~ "tidigare,"
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Läs även http://www.mysql.com/doc/en/Upgrade.html"
+
+#~ msgid "Install Hints"
+#~ msgstr "Installationstips"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Vid uppgraderingar från MySQL 3.23 som skickades med Debian Woody har "
+#~ "symboliska länkar i /var/lib/mysql eller /var/log/mysql av misstag tagits "
+#~ "bort och måste manuellt återskapas."
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL kan endast installeras om du har ett icke-numeriskt värdnamn som "
+#~ "kan slås upp via filen /etc/hosts. Exempelvis om kommandot \"hostname\" "
+#~ "returnerar \"mittnamn\" så bör det finnas en rad som liknar \"10.0.0.1 "
+#~ "mittnamn\"."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "En ny MySQL-användare kallad \"debian-sys-maint\" kommer att skapas. "
+#~ "Detta MySQL-konto används för start/stopp och cron-skript. Ta inte bort "
+#~ "det."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Tänk på att sätta ett LÖSENORD för MySQL:s root-användare! Om du "
+#~ "använder /root/.my.cnf, skriv då alltid en \"user\"-rad och en \"password"
+#~ "\"-rad i den, aldrig med endast lösenordet!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "Ska jag ta bort hela katalogträdet i /var/lib/mysql som används av alla "
+#~ "MySQL-versioner och inte bara för den som du nu kommer att rensa ut?"
+
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Sällan, exempelvis i nya större versioner, har behörighetssystemet "
+#~ "förbättrats. För att använda det måste skriptet "
+#~ "mysql_fix_privilege_tables köras manuellt. Skriptet är inte tänkt att ge "
+#~ "någon användare högre behörighet än han hade tidigare."
diff --git a/storage/xtradb/build/debian/po/templates.pot b/storage/xtradb/build/debian/po/templates.pot
new file mode 100644
index 00000000000..bbddfe37f83
--- /dev/null
+++ b/storage/xtradb/build/debian/po/templates.pot
@@ -0,0 +1,187 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+msgid "Start the Percona SQL server on boot?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
diff --git a/storage/xtradb/build/debian/po/tr.po b/storage/xtradb/build/debian/po/tr.po
new file mode 100644
index 00000000000..e19ddeedcfa
--- /dev/null
+++ b/storage/xtradb/build/debian/po/tr.po
@@ -0,0 +1,342 @@
+# Turkish translation of mysql-server.
+# This file is distributed under the same license as the mysql-server package.
+# Gürkan Aslan <gurkan@iaslan.com>, 2004
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-4.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2004-06-05 08:53+0300\n"
+"Last-Translator: Gürkan Aslan <gurkan@iaslan.com>\n"
+"Language-Team: Turkish <debian-l10n-turkish@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "NIS/YP kullanıcıları için önemli not!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "MySQL açılış sırasında başlatılsın mı?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL açılış sırasında veya '/etc/init.d/mysql start' komutunu vermeniz "
+"halinde elle başlatılabilir. Eğer açılışta otomatik olarak başlatılmasını "
+"istiyorsanız burada 'evet'i seçin."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Mysql'i kullanmak için aşağıdakiyle eşdeğer bir kullanıcı ve grup "
+#~ "tanımlamalı, ve /var/lib/mysql izinlerinin uygun şekilde ayarlandığından "
+#~ "emin olmalısınız (uid/gid farklı olabilir)."
+
+#~ msgid ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group: mysql:x:101:"
+#~ msgstr "/etc/group: mysql:x:101:"
+
+#~ msgid "/var/lib/mysql: drwxr-xr-x mysql mysql"
+#~ msgstr "/var/lib/mysql: drwxr-xr-x mysql mysql"
+
+#, fuzzy
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Lütfen http://www.mysql.com/doc/en/Upgrade.html belgesini okuyun"
+
+#, fuzzy
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL sadece /etc/hosts dosyası yoluyla çözülebilir NUMERİK OLMAYAN bir "
+#~ "makine adına sahipseniz kurulacaktır. Örneğin, eğer \"hostname\" komutu "
+#~ "\"makinem\" ismini döndürüyorsa, bu dosya içinde \"10.0.0.1 makinem\" "
+#~ "gibi bir satır olmalıdır."
+
+#, fuzzy
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Yeni mysql kullanıcısı \"debian-sys-maint\" yaratılacak. Bu hesap, "
+#~ "başlangıç betiklerinde ve cron içinde kullanılıyor. Bu hesabı silmeyin."
+
+#, fuzzy
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Lütfen MySQL root kullanıcısı için bir PAROLA girmeyi unutmayın! Eğer /"
+#~ "root/.my.cnf kullanıyorsanız, \"user\" ve \"password\" satırlarını her "
+#~ "zaman buraya ekleyin, sadece parolayı değil! Daha fazla bilgi için /usr/"
+#~ "share/doc/mysql-server/README.Debian dosyasını okuyun."
+
+#, fuzzy
+#~ msgid ""
+#~ "Should I remove all databases below /var/lib/mysql as you are purging the "
+#~ "mysql-server package?"
+#~ msgstr ""
+#~ "mysql-server paketi kaldırıldıktan sonra bütün veritabanları silinsin mi?"
+
+#~ msgid ""
+#~ "Networking is disabled by default for security reasons. You can enable it "
+#~ "by commenting out the skip-networking option in /etc/mysql/my.cnf."
+#~ msgstr ""
+#~ "Ağ, öntanımlı olarak güvenlik gerekçeleriyle devre dışı bırakıldı. Bu "
+#~ "özelliği /etc/mysql/my.cnf dosyası içindeki \"skip-networking\" "
+#~ "seçeneğini kaldırarak etkinleştirebilirsiniz."
+
+#~ msgid "security and update notice"
+#~ msgstr "güvenlik ve güncelleme duyurusu"
+
+#~ msgid ""
+#~ "Should I remove everything below /var/lib/mysql when you purge the mysql-"
+#~ "server package with the \"dpkg --purge mysql-server\" command (i.e. "
+#~ "remove everything including the configuration) somewhen? (default is not)"
+#~ msgstr ""
+#~ "mysql-server paketini temizlemek için \"dpkg --purge mysql-server\" "
+#~ "komutunu kullandığınızda (yani yapılandırma dahil herşeyi silmek) /var/"
+#~ "lib/mysql altındaki herşeyi sileyim mi? (öntanımlı cevap hayır'dır)."
+
+#~ msgid "Please run mysql_fix_privilege_tables !"
+#~ msgstr "Lütfen mysql_fix_privilege_tables komutunu çalıştırın!"
+
+#~ msgid ""
+#~ "I will ensure secure permissions of /var/lib/mysql by replacing GIDs "
+#~ "other than root and mysql with mysql."
+#~ msgstr ""
+#~ "/var/lib/mysql'in izinlerinin güvenli olmasını sağlamak amacıyla, buna "
+#~ "ait GID'leri root ve mysql'den farklı olacak şekilde değiştireceğim."
+
+#~ msgid ""
+#~ "Instructions how to enable SSL support are in /usr/share/doc/mysql-server/"
+#~ msgstr ""
+#~ "SSL desteğini nasıl etkinleştirebileceğinize ilişkin talimatlar /usr/"
+#~ "share/doc/mysql-server/ içinde."
+
+#~ msgid "mysql_fix_privileges_tables will be executed"
+#~ msgstr "mysql_fix_privileges_tables çalıştırılacak"
+
+#~ msgid ""
+#~ "The latest MySQL versions have an enhanced, more fine grained, privilege "
+#~ "system. To make use of it, some new fields must be added to the tables "
+#~ "in the \"mysql\" database. This is done by the "
+#~ "mysql_fix_privilege_tables script during this upgrade regardless of if "
+#~ "the server is currently running or not!"
+#~ msgstr ""
+#~ "En son MySQL sürümleri zenginleştirilmiş, daha ayrıntılandırılmış bir "
+#~ "ayrıcalık (privilege) sistemine sahiptir. Yeni sistemi kullanmak için, "
+#~ "\"mysql\" veritabanındaki tablolara bazı yeni alanlar eklenmelidir. Bu "
+#~ "işlem, sunucunun çalışıp çalışmamasına bağlı olmaksızın "
+#~ "mysql_fix_privilege_tables betiği tarafından bu yükseltme sırasında "
+#~ "yapılır."
+
+#~ msgid ""
+#~ "This script is not supposed to give any user more rights that he had "
+#~ "before, if you encounter such a case, please contact me."
+#~ msgstr ""
+#~ "Bu betiğin hiç bir kullanıcıya öncekinden daha fazla hak kazandırmadığı "
+#~ "varsayılıyor. Eğer bunun aksinde bir durumla karşılaşırsanız, lütfen "
+#~ "benimle bağlantıya geçin."
+
+#~ msgid "Make MySQL reachable via network?"
+#~ msgstr "MySQL network üzerinden ulaşılabilir olsun mu?"
+
+#~ msgid ""
+#~ "Should MySQL listen on a network reachable TCP port? This is not "
+#~ "necessary for use on a single computer and could be a security problem."
+#~ msgstr ""
+#~ "MySQL ağ üzerinde ulaşılabilen bir TCP portunu dinlesin mi? Tek olan bir "
+#~ "bilgisayar için bu ayar gerekli değildir ve bir güvenlik sorunu "
+#~ "oluÅŸturabilir."
+
+#~ msgid "Enable chroot mode?"
+#~ msgstr "chroot kipi etkinleÅŸtirilsin mi?"
+
+#~ msgid ""
+#~ "MySQL is able to jail itself into the /var/lib/mysql_jail directory so "
+#~ "that users cannot modify any files outside this directory. This improves "
+#~ "resistence against crackers, too, as they are not able to modify system "
+#~ "files."
+#~ msgstr ""
+#~ "MySQL kendini /var/lib/mysql_jail dizinine hapsederek kullanıcıların bu "
+#~ "dizin dışındaki hiç bir dosyayı değiştirmemesini sağlayabilir. Bu "
+#~ "düzenleme, sistem dosyalarını değiştirmelerini engelleyeceğinden, "
+#~ "cracker'lara karşı dayanıklılığı arttırır."
diff --git a/storage/xtradb/build/debian/rules b/storage/xtradb/build/debian/rules
new file mode 100755
index 00000000000..f166009da2c
--- /dev/null
+++ b/storage/xtradb/build/debian/rules
@@ -0,0 +1,322 @@
+#!/usr/bin/make -f
+
+export DH_VERBOSE=1
+
+PACKAGE=percona-xtradb-dfsg-5.1
+
+include /usr/share/dpatch/dpatch.make
+
+TMP=$(CURDIR)/debian/tmp/
+
+ARCH = $(shell dpkg-architecture -qDEB_BUILD_ARCH)
+ARCH_OS = $(shell dpkg-architecture -qDEB_BUILD_ARCH_OS)
+DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+DEB_HOST_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEBVERSION = $(shell dpkg-parsechangelog | awk '/^Version: / { print $$2 }' | sed 's/^.*-//' )
+
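+# Derive the source package name and the various version strings (epoch-less,
+# upstream, and major.minor) from debian/control and debian/changelog.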
+DEB_SOURCE_PACKAGE ?= $(strip $(shell egrep '^Source: ' debian/control | cut -f 2 -d ':'))
+DEB_VERSION ?= $(shell dpkg-parsechangelog | egrep '^Version:' | cut -f 2 -d ' ')
+DEB_NOEPOCH_VERSION ?= $(shell echo $(DEB_VERSION) | cut -d: -f2-)
+DEB_UPSTREAM_VERSION ?= $(shell echo $(DEB_NOEPOCH_VERSION) | sed 's/-[^-]*$$//')
+DEB_UPSTREAM_VERSION_MAJOR_MINOR := $(shell echo $(DEB_UPSTREAM_VERSION) | sed -r -n 's/^([0-9]+\.[0-9]+).*/\1/p')
+
+DISTRIBUTION = $(shell echo "Percona SQL Server (GPL), XtraDB 10")
+
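+# Build in parallel with one make job per processor listed in /proc/cpuinfo,
+# falling back to a single job if the count cannot be determined or is zero.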
+MAKE_J = -j$(shell if [ -f /proc/cpuinfo ] ; then grep -c processor.* /proc/cpuinfo ; else echo 1 ; fi)
+ifeq (${MAKE_J}, -j0)
+ MAKE_J = -j1
+endif
+
+MAKE_TEST_TARGET=test-force
+ifneq ($(findstring fulltest,$(DEB_BUILD_OPTIONS)),)
+# "make test-bt" is the test suite run by the MySQL build team
+# before a release; it takes considerably longer than test-force.
+ MAKE_TEST_TARGET=test-bt
+endif
+
+USE_ASSEMBLER=--enable-assembler
+
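+# A failing test suite currently aborts the build on every architecture; the
+# commented-out list below would instead let the listed architectures ignore
+# test failures.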
+#ifneq ($(findstring $(ARCH), alpha amd64 arm armel ia64 i386 hppa mipsel powerpc s390 sparc),)
+# TESTSUITE_FAIL_CMD=true
+#else
+ TESTSUITE_FAIL_CMD=exit 1
+#endif
+
+# Static linking causes sig11 (segfault) crashes if LDAP is used for groups in
+# /etc/nsswitch.conf, so it is disabled by default although, according to
+# MySQL, it brings a >10% performance gain when enabled. See #299382.
+ifeq ($(STATIC_MYSQLD), 1)
+ USE_STATIC_MYSQLD=--with-mysqld-ldflags=-all-static
+endif
+
+configure: patch configure-stamp
+configure-stamp:
+ @echo "RULES.configure-stamp"
+ dh_testdir
+
+ifneq ($(ARCH_OS),hurd)
+ if [ ! -d /proc/self ]; then echo "/proc IS NEEDED" 1>&2; exit 1; fi
+endif
+
+ sh -c 'PATH=$${MYSQL_BUILD_PATH:-"/bin:/usr/bin"} \
+ CC=$${MYSQL_BUILD_CC:-gcc} \
+ CFLAGS=$${MYSQL_BUILD_CFLAGS:-"-O3 -DBIG_JOINS=1 ${FORCE_FPIC_CFLAGS}"} \
+ CXX=$${MYSQL_BUILD_CXX:-g++} \
+ CXXFLAGS=$${MYSQL_BUILD_CXXFLAGS:-"-O3 -DBIG_JOINS=1 -felide-constructors -fno-exceptions -fno-rtti ${FORCE_FPIC_CFLAGS}"} \
+ ./configure \
+ --build=${DEB_BUILD_GNU_TYPE} \
+ --host=${DEB_HOST_GNU_TYPE} \
+ \
+ --prefix=/usr \
+ --exec-prefix=/usr \
+ --libexecdir=/usr/sbin \
+ --datadir=/usr/share \
+ --localstatedir=/var/lib/mysql \
+ --includedir=/usr/include \
+ --infodir=/usr/share/info \
+ --mandir=/usr/share/man \
+ \
+ --with-server-suffix="-$(DEBVERSION)" \
+ --with-comment="($(DISTRIBUTION))" \
+ --with-system-type="debian-linux-gnu" \
+ \
+ --enable-shared \
+ --enable-static \
+ --enable-thread-safe-client \
+ $(USE_ASSEMBLER) \
+ --enable-local-infile \
+ $(FORCE_FPIC) \
+ --with-fast-mutexes \
+ --with-big-tables \
+ --with-unix-socket-path=/var/run/mysqld/mysqld.sock \
+ --with-mysqld-user=mysql \
+ --with-libwrap \
+ $(USE_STATIC_MYSQLD) \
+ --with-ssl \
+ --without-docs \
+ --with-extra-charsets=all \
+ --with-plugins=max-no-ndb \
+ \
+ --without-embedded-server \
+ --with-embedded-privilege-control'
+
+	# --sysconfdir=/etc/mysql -- Appends /etc/mysql after ~/ in the my.cnf search path!
+ #
+ # --with-debug
+
+ touch configure-stamp
+
+
+build: build-stamp
+build-stamp: configure
+ dh_testdir
+
+ $(MAKE) $(MAKE_J)
+
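+# Run the test suite unless DEB_BUILD_OPTIONS contains "nocheck".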
+ifeq ($(findstring nocheck,$(DEB_BUILD_OPTIONS)),)
+ if [ ! -f testsuite-stamp ] ; then \
+ $(MAKE) $(MAKE_TEST_TARGET) || $(TESTSUITE_FAIL_CMD) ; \
+ fi
+endif
+
+ touch testsuite-stamp
+
+ touch build-stamp
+
+
+clean: clean-patched unpatch
+ rm -rf debian/patched
+clean-patched:
+ @echo "RULES.clean-patched"
+ dh_testdir
+ dh_testroot
+ rm -f configure-stamp
+ rm -f build-stamp
+ rm -f testsuite-stamp
+
+ [ ! -f Makefile ] || $(MAKE) clean
+ [ ! -d mysql-test/var ] || rm -rf mysql-test/var
+
+	# We'd like to see how long this remains necessary
+ @echo "CRUFT BEGIN"
+ @find -type l -print0 | xargs --no-run-if-empty -0 rm -v
+ @find -name .deps -type d -print0 | xargs --no-run-if-empty -0 rm -rfv
+ @rm -vrf ndb/docs/.doxy* ndb/docs/*html ndb/docs/*pdf innobase/autom4te.cache
+ @for i in \
+ readline/Makefile \
+ sql-bench/Makefile \
+ scripts/make_win_binary_distribution \
+ scripts/mysqlbug \
+ sql/gen_lex_hash \
+ sql/lex_hash.h \
+ strings/ctype_autoconf.c \
+ config.log \
+ config.cache \
+ ; \
+ do \
+ rm -vf $$i; \
+ done
+ @echo "CRUFT END"
+
+ debconf-updatepo
+ dh_clean -v
+
+
+install:
+install: build
+ @echo "RULES.install"
+ dh_testdir
+ dh_testroot
+ dh_clean -k
+ dh_installdirs
+
+	# Some self-written manpages which will hopefully be
+	# replaced by upstream versions sooner or later
+ mkdir -p $(TMP)/usr/share/man/man1/
+ mkdir -p $(TMP)/usr/share/man/man8/
+ cp debian/additions/*.1 $(TMP)/usr/share/man/man1/
+ mkdir -p $(TMP)/etc/mysql/conf.d/
+ cp debian/additions/mysqld_safe_syslog.cnf $(TMP)/etc/mysql/conf.d/
+ ln -s mysqlmanager.1 $(TMP)/usr/share/man/man1/mysqlmanager-pwgen.1
+ ln -s mysqlmanager.1 $(TMP)/usr/share/man/man1/mysqlmanagerc.1
+
+ # make install (trailing slash needed for innobase)
+ $(MAKE) install DESTDIR=$(TMP)/
+
+ # After installing, remove rpath to make lintian happy.
+ set +e; \
+ find ./debian/tmp/ -type f -print0 \
+ | xargs -0 --no-run-if-empty chrpath -k 2>/dev/null \
+ | fgrep RPATH= \
+ | cut -d: -f 1 \
+ | xargs --no-run-if-empty chrpath -d; \
+ set -e
+
+ # libmysqlclient: move shared libraries (but not the rest like libheap.a & co)
+ mv $(TMP)/usr/lib/mysql/libmysqlclient* $(TMP)/usr/lib
+ perl -pi -e 's#/usr/lib/mysql#/usr/lib#' $(TMP)/usr/lib/libmysqlclient.la
+ perl -pi -e 's#/usr/lib/mysql#/usr/lib#' $(TMP)/usr/lib/libmysqlclient_r.la
+ # Check if our beloved versioned symbols are really there
+ if [ "`objdump -T $(TMP)/usr/lib/libmysqlclient.so.16.0.0 | grep -c libmysqlclient_16`" -lt 500 ]; then \
+ echo "ERROR: versioned symbols are absent"; \
+ exit 1; \
+ fi
+
+ # libmysqlclient-dev: forgotten header file since 3.23.25?
+ cp include/my_config.h $(TMP)/usr/include/mysql/
+ cp include/my_dir.h $(TMP)/usr/include/mysql/
+
+ # percona-xtradb-common: We now provide our own config file.
+ install -d $(TMP)/etc/mysql
+ install -m 0644 debian/additions/my.cnf $(TMP)/etc/mysql/my.cnf
+
+ # percona-xtradb-client
+ install -m 0755 debian/additions/mysqlreport $(TMP)/usr/bin/
+ install -m 0755 debian/additions/innotop/innotop $(TMP)/usr/bin/
+ install -m 0644 debian/additions/innotop/innotop.1 $(TMP)/usr/share/man/man1/
+ install -m 0644 -D debian/additions/innotop/InnoDBParser.pm $(TMP)/usr/share/perl5/InnoDBParser.pm
+
+ # percona-xtradb-server
+ install -m 0755 scripts/mysqld_safe $(TMP)/usr/bin/mysqld_safe
+ mkdir -p $(TMP)/usr/share/doc/percona-xtradb-server-5.1/examples
+ mv $(TMP)/usr/share/mysql/*cnf $(TMP)/usr/share/doc/percona-xtradb-server-5.1/examples/
+ rm -vf $(TMP)/usr/share/mysql/mi_test_all* \
+ $(TMP)/usr/share/mysql/mysql-log-rotate \
+ $(TMP)/usr/share/mysql/mysql.server \
+ $(TMP)/usr/share/mysql/binary-configure
+ nm -n sql/mysqld |gzip -9 > $(TMP)/usr/share/doc/percona-xtradb-server-5.1/mysqld.sym.gz
+ install -m 0755 debian/additions/echo_stderr $(TMP)/usr/share/mysql/
+ install -m 0755 debian/additions/debian-start $(TMP)/etc/mysql/
+ install -m 0755 debian/additions/debian-start.inc.sh $(TMP)/usr/share/mysql/
+ # lintian overrides
+ mkdir -p $(TMP)/usr/share/lintian/overrides/
+ cp debian/percona-xtradb-common.lintian-overrides $(TMP)/usr/share/lintian/overrides/percona-xtradb-common
+ cp debian/percona-xtradb-server-5.1.lintian-overrides $(TMP)/usr/share/lintian/overrides/percona-xtradb-server-5.1
+ cp debian/percona-xtradb-client-5.1.lintian-overrides $(TMP)/usr/share/lintian/overrides/percona-xtradb-client-5.1
+
+ # For 5.0 -> 5.1 transition
+ d=$(TMP)/usr/share/percona-xtradb-common/internal-use-only/; \
+ mkdir -p $$d; \
+ cp debian/percona-xtradb-server-5.1.mysql.init $$d/_etc_init.d_mysql; \
+ cp debian/percona-xtradb-server-5.1.logrotate $$d/_etc_logrotate.d_percona-xtradb-server; \
+ cp debian/additions/debian-start $$d/_etc_mysql_debian-start;
+
+ dh_movefiles
+
+# Build architecture-independent files here.
+binary-indep: build install
+ @echo "RULES.binary-indep"
+ dh_testdir -i
+ dh_testroot -i
+ dh_installdebconf -i
+ dh_installdocs -i
+ dh_installexamples -i
+ dh_installmenu -i
+ dh_installlogrotate -i
+ dh_installinit -i
+ dh_installcron -i
+ dh_installman -i
+ dh_installinfo -i
+ dh_installlogcheck -i
+ dh_installchangelogs -i
+ dh_link -i
+ dh_compress -i
+ dh_fixperms -i
+ dh_installdeb -i
+ dh_perl -i
+ dh_gencontrol -i
+ dh_md5sums -i
+ dh_builddeb -i
+
+# Build architecture-dependent files here.
+binary-arch: build install
+ @echo "RULES.binary-arch"
+ dh_testdir
+ dh_testroot
+
+ dh_installdebconf -a
+ dh_installdocs -a
+ dh_installexamples -a
+ dh_installmenu -a
+ dh_installlogrotate -a --name percona-xtradb-server
+	# Start mysql at sequence number 19, before apache, proftpd etc. are started
+	# at 20, since they might depend on a running database server.
+ dh_installinit -a --name=mysql -- defaults 19 21
+ dh_installcron -a --name percona-xtradb-server
+ dh_installman -a
+ dh_installinfo -a
+ dh_installlogcheck -a
+ dh_installchangelogs -a
+ dh_strip -a
+	dh_link -a # The .so must be installed after .so.1.2.3!
+ dh_compress -a
+ dh_fixperms -a
+ dh_makeshlibs -a
+ dh_makeshlibs -plibmysqlclient16 -V'libmysqlclient16 (>= 5.1.21-1)'
+ dh_installdeb -a
+ dh_perl -a
+ dh_shlibdeps -a -l debian/libmysqlclient16/usr/lib -L libmysqlclient16
+ dh_gencontrol -a
+ dh_md5sums -a
+ dh_builddeb -a
+
+source diff:
+ @echo >&2 'source and diff are obsolete - use dpkg-source -b'; false
+
+binary: binary-indep binary-arch
+
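+# Fetch the upstream MySQL tarball, drop the Docs/ and debian/ directories for
+# the DFSG repack, and build the .orig tarball used as the Debian source.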
+get-orig-source:
+ @wget -nv -T10 -t3 \
+ -O /tmp/mysql-$(DEB_UPSTREAM_VERSION).tar.gz \
+ http://ftp.gwdg.de/pub/misc/mysql/Downloads/MySQL-$(DEB_UPSTREAM_VERSION_MAJOR_MINOR)/mysql-$(DEB_UPSTREAM_VERSION).tar.gz
+ @tar xfz /tmp/mysql-$(DEB_UPSTREAM_VERSION).tar.gz -C /tmp
+ @rm -rf /tmp/mysql-$(DEB_UPSTREAM_VERSION)/Docs
+ @rm -rf /tmp/mysql-$(DEB_UPSTREAM_VERSION)/debian
+ @mv /tmp/mysql-$(DEB_UPSTREAM_VERSION) /tmp/$(DEB_SOURCE_PACKAGE)-$(DEB_UPSTREAM_VERSION).orig
+ @cd /tmp ; tar czf $(DEB_SOURCE_PACKAGE)_$(DEB_UPSTREAM_VERSION).orig.tar.gz $(DEB_SOURCE_PACKAGE)-$(DEB_UPSTREAM_VERSION).orig
+ @rm -f /tmp/mysql-$(DEB_UPSTREAM_VERSION).tar.gz
+ @rm -rf /tmp/$(DEB_SOURCE_PACKAGE)-$(DEB_UPSTREAM_VERSION).orig
+
+.PHONY: clean clean-patched configure build binary binary-indep binary-arch install patch unpatch
+
+# vim: ts=8
diff --git a/storage/xtradb/build/debian/source.lintian-overrides b/storage/xtradb/build/debian/source.lintian-overrides
new file mode 100644
index 00000000000..7a93dd28f84
--- /dev/null
+++ b/storage/xtradb/build/debian/source.lintian-overrides
@@ -0,0 +1,2 @@
+maintainer-script-lacks-debhelper-token debian/mysql-server-5.1.postinst
+maintainer-script-lacks-debhelper-token debian/mysql-server-5.1.postrm
diff --git a/storage/xtradb/build/debian/watch b/storage/xtradb/build/debian/watch
new file mode 100644
index 00000000000..f6fdd67bd8d
--- /dev/null
+++ b/storage/xtradb/build/debian/watch
@@ -0,0 +1,3 @@
+version=3
+opts="uversionmangle=s/-(rc|beta)/$1/" \
+ ftp://sunsite.informatik.rwth-aachen.de/pub/mirror/www.mysql.com/Downloads/MySQL-5.1/mysql-([\d\.]*(?:-beta|-rc)?).tar.gz debian
diff --git a/storage/xtradb/build/percona-sql.spec b/storage/xtradb/build/percona-sql.spec
new file mode 100644
index 00000000000..d0fa6f2a41d
--- /dev/null
+++ b/storage/xtradb/build/percona-sql.spec
@@ -0,0 +1,1644 @@
+#############################################################################
+#
+# This is the spec file for the distribution specific RPM files
+#
+##############################################################################
+
+##############################################################################
+# Some common macro definitions
+##############################################################################
+
+# Required arguments
+# mysqlversion - e.g. 5.1.37
+# pluginversion - Version of InnoDB plugin taken as the basis, e.g. 1.0.3
+# redhatversion - 5 or 4
+# xtradbversion - The XtraDB release, eg. 6
+
+%define mysql_vendor Percona, Inc
+%define redhatversion %(cat /etc/redhat-release | awk '{ print $3}' | awk -F. '{ print $1}')
+%define community 1
+%define mysqlversion 5.1.45
+%define pluginversion 1.0.6
+%define xtradbversion 10
+%define distribution rhel%{redhatversion}
+%define release %{xtradbversion}.%{distribution}
+
+%define mysqld_user mysql
+%define mysqld_group mysql
+%define mysqldatadir /var/lib/mysql
+%define see_base For a description of MySQL see the base MySQL RPM or http://www.mysql.com
+
+# ------------------------------------------------------------------------------
+# Meta information, don't remove!
+# ------------------------------------------------------------------------------
+# norootforbuild
+
+# ------------------------------------------------------------------------------
+# On SuSE 9 no separate "debuginfo" package is built. To enable basic
+# debugging on that platform, we don't strip binaries on SuSE 9. We
+# disable the strip of binaries by redefining the RPM macro
+# "__os_install_post" leaving out the script calls that normally does
+# this. We do this in all cases, as on platforms where "debuginfo" is
+# created, a script "find-debuginfo.sh" will be called that will do
+# the strip anyway, part of separating the executable and debug
+# information into separate files put into separate packages.
+#
+# Some references (shows more advanced conditional usage):
+# http://www.redhat.com/archives/rpm-list/2001-November/msg00257.html
+# http://www.redhat.com/archives/rpm-list/2003-February/msg00275.html
+# http://www.redhat.com/archives/rhl-devel-list/2004-January/msg01546.html
+# http://lists.opensuse.org/archive/opensuse-commit/2006-May/1171.html
+# ------------------------------------------------------------------------------
+%define __os_install_post /usr/lib/rpm/brp-compress
+
+# ------------------------------------------------------------------------------
+# We don't package all files installed into the build root by intention -
+# See BUG#998 for details.
+# ------------------------------------------------------------------------------
+%define _unpackaged_files_terminate_build 0
+
+# ------------------------------------------------------------------------------
+# RPM build tools now automatically detect Perl module dependencies. This
+# detection causes problems as it is broken in some versions, and it also
+# adds unwanted dependencies from mandatory scripts in our package.
+# It might not be possible to disable in all RPM tool versions, but here we
+# try. We keep the "AutoReqProv: no" for the "test" sub package, as disabling
+# here might fail, and that package has the most problems.
+# See http://fedoraproject.org/wiki/Packaging/Perl#Filtering_Requires:_and_Provides
+# http://www.wideopen.com/archives/rpm-list/2002-October/msg00343.html
+# ------------------------------------------------------------------------------
+%undefine __perl_provides
+%undefine __perl_requires
+
+##############################################################################
+# Command line handling
+##############################################################################
+
+# ----------------------------------------------------------------------
+# use "rpmbuild --with yassl" or "rpm --define '_with_yassl 1'" (for RPM 3.x)
+# to build with yaSSL support (off by default)
+# ----------------------------------------------------------------------
+%{?_with_yassl:%define YASSL_BUILD 1}
+%{!?_with_yassl:%define YASSL_BUILD 0}
+
+# ----------------------------------------------------------------------
+# use "rpmbuild --without libgcc" or "rpm --define '_without_libgcc 1'" (for RPM 3.x)
+# to include libgcc (as libmygcc) (on by default)
+# ----------------------------------------------------------------------
+%{!?_with_libgcc: %{!?_without_libgcc: %define WITH_LIBGCC 1}}
+%{?_with_libgcc:%define WITH_LIBGCC 1}
+%{?_without_libgcc:%define WITH_LIBGCC 0}
+
+
+%define server_suffix -51
+%define package_suffix -51
+%define ndbug_comment Percona SQL Server (GPL), XtraDB %{xtradbversion}
+%define debug_comment Percona SQL Server - Debug (GPL), XtraDB %{xtradbversion}
+%define commercial 0
+%define YASSL_BUILD 1
+%define EMBEDDED_BUILD 0
+%define PARTITION_BUILD 1
+%define CLUSTER_BUILD 0
+%define COMMUNITY_BUILD 1
+%define INNODB_BUILD 1
+%define PERCONA_PLUGIN_BUILD 1
+%define MARIA_BUILD 0
+%define NORMAL_TEST_MODE test-bt
+%define DEBUG_TEST_MODE test-bt-debug
+
+%define BUILD_DEBUG 0
+
+
+%if %{COMMUNITY_BUILD}
+%define cluster_package_prefix -cluster
+%else
+%define cluster_package_prefix -
+%endif
+
+%define lic_type GNU GPL v2
+%define lic_files COPYING README
+%define src_dir mysql-%{mysqlversion}
+
+Source1: percona-xtradb-%{pluginversion}-%{xtradbversion}.tar.gz
+Patch0: percona-support.patch
+
+Patch01: show_patches.patch
+Patch02: slow_extended.patch
+Patch03: profiling_slow.patch
+Patch04: microsec_process.patch
+Patch05: userstat.patch
+Patch06: optimizer_fix.patch
+Patch07: mysql-test_for_xtradb.diff
+Patch08: show_temp_51.patch
+
+
+%define perconaxtradbplugin percona-xtradb-%{pluginversion}-%{xtradbversion}.tar.gz
+
+##############################################################################
+# Main spec file section
+##############################################################################
+
+Name: Percona-XtraDB%{package_suffix}
+Summary: Percona-XtraDB: a very fast and reliable SQL database server
+Group: Applications/Databases
+Version: %{mysqlversion}
+Release: %{release}
+Distribution: Red Hat Enterprise Linux %{redhatversion}
+License: GPL version 2 http://www.gnu.org/licenses/gpl-2.0.html
+Source: %{src_dir}.tar.gz
+URL: http://www.percona.com/
+Packager: %{mysql_vendor} MySQL Development Team <mysql-dev@percona.com>
+Vendor: %{mysql_vendor}
+Provides: msqlormysql MySQL-server Percona-XtraDB-server
+BuildRequires: gperf perl readline-devel gcc-c++ ncurses-devel zlib-devel libtool automake autoconf time ccache bison
+
+# Think about what you use here since the first step is to
+# run a rm -rf
+BuildRoot: %{_tmppath}/%{name}-%{version}-build
+
+# From the manual
+%description
+The Percona-XtraDB software delivers a very fast, multi-threaded, multi-user,
+and robust SQL (Structured Query Language) database server. Percona-XtraDB Server
+is intended for mission-critical, heavy-load production systems as well
+as for embedding into mass-deployed software.
+
+Percona Inc. provides commercial support for Percona-XtraDB Server.
+For more information, visit our web site at http://www.percona.com/
+
+##############################################################################
+# Sub package definition
+##############################################################################
+
+%package -n Percona-XtraDB-server%{package_suffix}
+Summary: %{ndbug_comment} for Red Hat Enterprise Linux %{redhatversion}
+Group: Applications/Databases
+Requires: chkconfig coreutils shadow-utils grep procps
+Provides: msqlormysql mysql-server MySQL-server Percona-XtraDB-server
+Obsoletes: MySQL mysql mysql-server MySQL-server MySQL-server-community MySQL-server-percona
+
+%description -n Percona-XtraDB-server%{package_suffix}
+The Percona-XtraDB software delivers a very fast, multi-threaded, multi-user,
+and robust SQL (Structured Query Language) database server. Percona-XtraDB Server
+is intended for mission-critical, heavy-load production systems as well
+as for embedding into mass-deployed software.
+
+Percona Inc. provides commercial support for Percona-XtraDB Server.
+For more information, visit our web site at http://www.percona.com/
+
+This package includes the Percona-XtraDB server binary
+%if %{INNODB_BUILD}
+(configured including XtraDB)
+%endif
+as well as related utilities to run and administer a Percona-XtraDB server.
+
+If you want to access and work with the database, you have to install
+package "Percona-XtraDB-client%{package_suffix}" as well!
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-client%{package_suffix}
+Summary: Percona-XtraDB - Client
+Group: Applications/Databases
+Obsoletes: mysql-client MySQL-client MySQL-client-community MySQL-client-percona
+Provides: mysql-client MySQL-client Percona-XtraDB-client mysql MySQL
+
+%description -n Percona-XtraDB-client%{package_suffix}
+This package contains the standard Percona-XtraDB clients and administration tools.
+
+%{see_base}
+
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-test%{package_suffix}
+Requires: mysql-client perl
+Summary: Percona-XtraDB - Test suite
+Group: Applications/Databases
+Provides: mysql-test MySQL-test Percona-XtraDB-test
+Obsoletes: mysql-test MySQL-test MySQL-test-community MySQL-test-percona
+AutoReqProv: no
+
+%description -n Percona-XtraDB-test%{package_suffix}
+This package contains the Percona-XtraDB regression test suite.
+
+%{see_base}
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-devel%{package_suffix}
+Summary: Percona-XtraDB - Development header files and libraries
+Group: Applications/Databases
+Provides: mysql-devel MySQL-devel Percona-XtraDB-devel
+Obsoletes: mysql-devel MySQL-devel MySQL-devel-community MySQL-devel-percona
+
+%description -n Percona-XtraDB-devel%{package_suffix}
+This package contains the development header files and libraries
+necessary to develop Percona-XtraDB client applications.
+
+%{see_base}
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-shared%{package_suffix}
+Summary: Percona-XtraDB - Shared libraries
+Group: Applications/Databases
+Provides: mysql-shared MySQL-shared Percona-XtraDB-shared
+# Obsoletes below to correct old missing Provides:/Obsoletes
+Obsoletes: mysql-shared MySQL-shared-standard MySQL-shared-pro
+Obsoletes: MySQL-shared-pro-cert MySQL-shared-pro-gpl
+Obsoletes: MySQL-shared-pro-gpl-cert MySQL-shared MySQL-shared-community MySQL-shared-percona
+
+%description -n Percona-XtraDB-shared%{package_suffix}
+This package contains the shared libraries (*.so*) which certain
+languages and applications need to dynamically load and use MySQL.
+
+# ------------------------------------------------------------------------------
+
+%if %{PERCONA_PLUGIN_BUILD}
+
+%package -n Percona-XtraDB-%{pluginversion}-%{xtradbversion}
+Requires: Percona-XtraDB-devel
+Summary: Percona XtraDB Storage engine for MySQL
+Group: Applications/Databases
+Provides: percona-xtradb-plugin Percona-XtraDB-plugin
+Obsoletes: percona-xtradb-plugin Percona-XtraDB-plugin
+
+%description -n Percona-XtraDB-%{pluginversion}-%{xtradbversion}
+This package contains the Percona-XtraDB storage engine for MySQL server.
+
+XtraDB is an enhanced version of the InnoDB storage engine. It includes all
+of InnoDB's robust, reliable, ACID-compliant design and advanced
+MVCC architecture, and builds on that solid foundation with more
+features, more tunability, more metrics, and more scalability.
+In particular, it is designed to scale better on many cores,
+to use memory more efficiently, and to be more convenient and useful.
+The new features are especially designed to reduce the need for
+awkward workarounds to many of InnoDB's limitations. We choose
+features and fixes based on customer requests and on our best
+judgment as a high-performance consulting company.
+
+%endif
+
+##############################################################################
+#
+##############################################################################
+
+%prep
+
+%setup -n %{src_dir}
+
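+# Apply the Percona patch series on top of the vanilla MySQL source.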
+%patch01 -p1
+%patch02 -p1
+%patch03 -p1
+%patch04 -p1
+%patch05 -p1
+%patch06 -p1
+%patch07 -p1
+%patch08 -p1
+
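+# Unpack the XtraDB plugin sources over storage/innobase; the tar option for
+# stripping the leading path component differs between RHEL 5 and RHEL 4.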
+if [ "%{redhatversion}" = "5" ] ; then
+tar xfz $RPM_SOURCE_DIR/%{perconaxtradbplugin} -C storage/innobase --strip-components=1
+else
+tar xfz $RPM_SOURCE_DIR/%{perconaxtradbplugin} -C storage/innobase --strip-path=1
+fi
+%patch0 -p1
+
+cd storage/innobase && bash -x ./setup.sh
+
+##############################################################################
+# The actual build
+##############################################################################
+
+%build
+
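+# BuildMySQL: configure and build the tree with the common options plus any
+# extra configure arguments passed in "$*"; honours $DEBUG for debug builds.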
+BuildMySQL() {
+# Get flags from environment. RPM_OPT_FLAGS seems not to be set anywhere.
+CFLAGS=${CFLAGS:-$RPM_OPT_FLAGS}
+CXXFLAGS=${CXXFLAGS:-$RPM_OPT_FLAGS}
+# Evaluate current setting of $DEBUG
+if [ $DEBUG -gt 0 ] ; then
+ OPT_COMMENT='--with-comment="%{debug_comment}"'
+ OPT_DEBUG='--with-debug'
+ CFLAGS=`echo " $CFLAGS " | \
+ sed -e 's/ -O[0-9]* / /' -e 's/ -unroll2 / /' -e 's/ -ip / /' \
+ -e 's/^ //' -e 's/ $//'`
+ CXXFLAGS=`echo " $CXXFLAGS " | \
+ sed -e 's/ -O[0-9]* / /' -e 's/ -unroll2 / /' -e 's/ -ip / /' \
+ -e 's/^ //' -e 's/ $//'`
+else
+ OPT_COMMENT='--with-comment="%{ndbug_comment}"'
+ OPT_DEBUG=''
+fi
+
+echo "BUILD =================="
+echo $*
+
+# The --enable-assembler option simply does nothing on systems that do not
+# support assembler speedups.
+sh -c "CFLAGS=\"$CFLAGS\" \
+ CXXFLAGS=\"$CXXFLAGS\" \
+ AM_CPPFLAGS=\"$AM_CPPFLAGS\" \
+ LDFLAGS=\"$LDFLAGS\" \
+ ./configure \
+ $* \
+ --enable-assembler \
+ --enable-local-infile \
+ --with-mysqld-user=%{mysqld_user} \
+ --with-unix-socket-path=/var/lib/mysql/mysql.sock \
+ --with-pic \
+ --prefix=/ \
+%if %{CLUSTER_BUILD}
+ --with-extra-charsets=all \
+%else
+ --with-extra-charsets=complex \
+%endif
+%if %{YASSL_BUILD}
+ --with-ssl \
+%else
+ --without-ssl \
+%endif
+ --exec-prefix=%{_exec_prefix} \
+ --libexecdir=%{_sbindir} \
+ --libdir=%{_libdir} \
+ --sysconfdir=%{_sysconfdir} \
+ --datadir=%{_datadir} \
+ --localstatedir=%{mysqldatadir} \
+ --infodir=%{_infodir} \
+ --includedir=%{_includedir} \
+ --mandir=%{_mandir} \
+ --enable-thread-safe-client \
+ --enable-profiling \
+%if %{?ndbug_comment:1}0
+ $OPT_COMMENT \
+%endif
+ $OPT_DEBUG \
+%if %{commercial}
+ --with-libedit \
+%else
+ --with-readline \
+%endif
+ ; make "
+}
+# end of function definition "BuildMySQL"
+
+
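+# BuildServer: wrap BuildMySQL with the server-specific options, selecting the
+# storage engine plugins according to the *_BUILD macros defined above.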
+BuildServer() {
+BuildMySQL "--enable-shared \
+%if %{?server_suffix:1}0
+ --with-server-suffix='%{server_suffix}' \
+%endif
+%if %{CLUSTER_BUILD}
+ --with-plugin-ndbcluster \
+%else
+ --without-plugin-ndbcluster \
+%endif
+%if %{MARIA_BUILD}
+ --with-plugin-maria \
+ --with-maria-tmp-tables \
+%else
+ --without-plugin-maria \
+%endif
+%if %{INNODB_BUILD}
+ --with-plugin-innobase \
+ --without-plugin-innodb_plugin \
+%else
+ --without-plugin-innobase \
+ --without-plugin-innodb_plugin \
+%endif
+%if %{PARTITION_BUILD}
+ --with-plugin-partition \
+%else
+ --without-plugin-partition \
+%endif
+ --with-plugin-csv \
+ --with-plugin-archive \
+ --with-plugin-blackhole \
+ --with-plugin-federated \
+%if %{EMBEDDED_BUILD}
+ --with-embedded-server \
+%else
+ --without-embedded-server \
+%endif
+ --without-bench \
+ --with-zlib-dir=bundled \
+ --with-big-tables"
+
+if [ -n "$MYSQL_CONFLOG_DEST" ] ; then
+ cp -fp config.log "$MYSQL_CONFLOG_DEST"
+fi
+
+#if [ -f sql/.libs/mysqld ] ; then
+# nm --numeric-sort sql/.libs/mysqld > sql/mysqld.sym
+#else
+# nm --numeric-sort sql/mysqld > sql/mysqld.sym
+#fi
+}
+# end of function definition "BuildServer"
+
+
+RBR=$RPM_BUILD_ROOT
+MBD=$RPM_BUILD_DIR/%{src_dir}
+
+# Clean up the BuildRoot first
+[ "$RBR" != "/" ] && [ -d $RBR ] && rm -rf $RBR;
+mkdir -p $RBR%{_libdir}/mysql $RBR%{_sbindir}
+
+# Use gcc for C and C++ code (to avoid a dependency on libstdc++ and to
+# avoid compiling exceptions into the code).
+if [ -z "$CXX" -a -z "$CC" ] ; then
+ export CC="gcc" CXX="gcc"
+fi
+
+if [ "%{redhatversion}" = "5" ] ; then
+export CFLAGS="-static-libgcc -O2 -fno-omit-frame-pointer -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -mtune=generic"
+export CXXFLAGS="-static-libgcc -O2 -fno-omit-frame-pointer -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -mtune=generic"
+fi
+
+if [ "%{redhatversion}" != "5" ] ; then
+export CFLAGS="-static-libgcc -O2 -g -fno-omit-frame-pointer -pipe "
+export CXXFLAGS="-static-libgcc -O2 -g -fno-omit-frame-pointer -pipe "
+fi
+
+
+# Create the shared libs separately to avoid a dependency for the client utilities
+DEBUG=0
+BuildMySQL "--enable-shared"
+
+# Install shared libraries
+cp -av libmysql/.libs/*.so* $RBR/%{_libdir}
+cp -av libmysql_r/.libs/*.so* $RBR/%{_libdir}
+mkdir -p $RBR%{_libdir}/mysql/plugin
+cp -av storage/innobase/.libs/*.so* $RBR%{_libdir}/mysql/plugin
+cp -av storage/innobase/scripts/install_innodb_plugins.sql $RBR%{_libdir}/mysql/plugin
+
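+# Bundle the freshly built plugin directory into a standalone tarball and move
+# it to the RPM top directory so it can be shipped separately.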
+pushd $RBR%{_libdir}/mysql
+tar cfz percona-xtradb-%{pluginversion}-%{xtradbversion}-%{mysqlversion}.$RPM_ARCH.tar.gz plugin
+mv percona-xtradb-%{pluginversion}-%{xtradbversion}-%{mysqlversion}.$RPM_ARCH.tar.gz %{_topdir}
+popd
+
+##############################################################################
+
+# Include libgcc.a in the devel subpackage (BUG 4921)
+%if %{WITH_LIBGCC}
+libgcc=`$CC $CFLAGS --print-libgcc-file`
+install -m 644 "$libgcc" $RBR%{_libdir}/mysql/libmygcc.a
+%endif
+
+##############################################################################
+
+# Now create a debug server
+%if %{BUILD_DEBUG}
+DEBUG=1
+make clean
+
+( BuildServer ) # subshell, so that CFLAGS + CXXFLAGS are modified only locally
+
+if [ "$MYSQL_RPMBUILD_TEST" != "no" ] ; then
+ MTR_BUILD_THREAD=auto make %{DEBUG_TEST_MODE}
+fi
+
+# Get the debug server and its .sym file from the build tree
+#if [ -f sql/.libs/mysqld ] ; then
+# cp sql/.libs/mysqld $RBR%{_sbindir}/mysqld-debug
+#else
+# cp sql/mysqld $RBR%{_sbindir}/mysqld-debug
+#fi
+#cp libmysqld/libmysqld.a $RBR%{_libdir}/mysql/libmysqld-debug.a
+#cp sql/mysqld.sym $RBR%{_libdir}/mysql/mysqld-debug.sym
+
+%endif
+
+# Now, the default server
+DEBUG=0
+make clean
+
+BuildServer
+
+if [ "$MYSQL_RPMBUILD_TEST" != "no" ] ; then
+ MTR_BUILD_THREAD=auto make %{NORMAL_TEST_MODE}
+fi
+
+# Now, build plugin
+#BUILDSO=0
+#make clean
+
+#BuildServer
+
+#if [ "$MYSQL_RPMBUILD_TEST" != "no" ] ; then
+# MTR_BUILD_THREAD=auto make %{NORMAL_TEST_MODE}
+#fi
+
+%install
+RBR=$RPM_BUILD_ROOT
+MBD=$RPM_BUILD_DIR/%{src_dir}
+
+# Ensure that needed directories exist
+install -d $RBR%{_sysconfdir}/{logrotate.d,init.d}
+install -d $RBR%{mysqldatadir}/mysql
+install -d $RBR%{_datadir}/mysql-test
+install -d $RBR%{_datadir}/mysql/SELinux/RHEL4
+install -d $RBR%{_includedir}
+install -d $RBR%{_libdir}
+install -d $RBR%{_mandir}
+install -d $RBR%{_sbindir}
+install -d $RBR%{_libdir}/mysql/plugin
+
+make DESTDIR=$RBR benchdir_root=%{_datadir} install
+
+# Install symbol files (for stack trace resolution)
+#install -m644 $MBD/sql/mysqld.sym $RBR%{_libdir}/mysql/mysqld.sym
+
+# Install logrotate and autostart
+install -m644 $MBD/support-files/mysql-log-rotate \
+ $RBR%{_sysconfdir}/logrotate.d/mysql
+install -m755 $MBD/support-files/mysql.server \
+ $RBR%{_sysconfdir}/init.d/mysql
+
+# In RPMs, it is unlikely that anybody would use "sql-bench"
+rm -fr $RBR%{_datadir}/sql-bench
+
+# Create a symlink "rcmysql", pointing to the init script. SuSE users
+# will appreciate that, as all services usually offer this.
+ln -s %{_sysconfdir}/init.d/mysql $RBR%{_sbindir}/rcmysql
+
+# Touch the place where the my.cnf config file and mysqlmanager.passwd
+# (MySQL Instance Manager password file) might be located
+# Just to make sure they are in the file list and marked as config files
+touch $RBR%{_sysconfdir}/my.cnf
+touch $RBR%{_sysconfdir}/mysqlmanager.passwd
+
+# Install SELinux files in datadir
+install -m600 $MBD/support-files/RHEL4-SElinux/mysql.{fc,te} \
+ $RBR%{_datadir}/mysql/SELinux/RHEL4
+
+
+##############################################################################
+# Post processing actions, i.e. when installed
+##############################################################################
+
+%pre -n Percona-XtraDB-server%{package_suffix}
+# Check if we can safely upgrade. An upgrade is only safe if it's from one
+# of our RPMs in the same version family.
+
+installed=`rpm -q --whatprovides mysql-server 2> /dev/null`
+if [ $? -eq 0 -a -n "$installed" ]; then
+ vendor=`rpm -q --queryformat='%{VENDOR}' "$installed" 2>&1`
+ version=`rpm -q --queryformat='%{VERSION}' "$installed" 2>&1`
+ myvendor='%{mysql_vendor}'
+ myversion='%{mysqlversion}'
+
+ old_family=`echo $version | sed -n -e 's,^\([1-9][0-9]*\.[0-9][0-9]*\)\..*$,\1,p'`
+ new_family=`echo $myversion | sed -n -e 's,^\([1-9][0-9]*\.[0-9][0-9]*\)\..*$,\1,p'`
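+  # The sed expressions keep only the major.minor part of each version
+  # string (e.g. a hypothetical "5.1.45" would become "5.1"); an upgrade is
+  # considered safe only within the same such family.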
+
+ [ -z "$vendor" ] && vendor='<unknown>'
+ [ -z "$old_family" ] && old_family="<unrecognized version $version>"
+ [ -z "$new_family" ] && new_family="<bad package specification: version $myversion>"
+
+ error_text=
+# if [ "$vendor" != "$myvendor" ]; then
+# error_text="$error_text
+#The current MySQL server package is provided by a different
+#vendor ($vendor) than $myvendor. Some files may be installed
+#to different locations, including log files and the service
+#startup script in %{_sysconfdir}/init.d/.
+#"
+# fi
+
+ if [ "$old_family" != "$new_family" ]; then
+ error_text="$error_text
+Upgrading directly from MySQL $old_family to MySQL $new_family may not
+be safe in all cases. A manual dump and restore using mysqldump is
+recommended. It is important to review the MySQL manual's Upgrading
+section for version-specific incompatibilities.
+"
+ fi
+
+ if [ -n "$error_text" ]; then
+ cat <<HERE >&2
+
+******************************************************************
+A MySQL server package ($installed) is installed.
+$error_text
+A manual upgrade is required.
+
+- Ensure that you have a complete, working backup of your data and my.cnf
+ files
+- Shut down the MySQL server cleanly
+- Remove the existing MySQL packages. Usually this command will
+ list the packages you should remove:
+ rpm -qa | grep -i '^mysql-'
+
+ You may choose to use 'rpm --nodeps -ev <package-name>' to remove
+ the package which contains the mysqlclient shared library. The
+ library will be reinstalled by the MySQL-shared-compat package.
+- Install the new MySQL packages supplied by $myvendor
+- Ensure that the MySQL server is started
+- Run the 'mysql_upgrade' program
+
+This is a brief description of the upgrade process. Important details
+can be found in the MySQL manual, in the Upgrading section.
+******************************************************************
+HERE
+ exit 1
+ fi
+fi
+
+# Shut down a previously installed server first
+if [ -x %{_sysconfdir}/init.d/mysql ] ; then
+ %{_sysconfdir}/init.d/mysql stop > /dev/null 2>&1
+ echo "Giving mysqld 5 seconds to exit nicely"
+ sleep 5
+fi
+
+%post -n Percona-XtraDB-server%{package_suffix}
+mysql_datadir=%{mysqldatadir}
+
+# ----------------------------------------------------------------------
+# Create data directory
+# ----------------------------------------------------------------------
+mkdir -p $mysql_datadir/{mysql,test}
+
+# ----------------------------------------------------------------------
+# Make MySQL start/shut down automatically when the machine does.
+# ----------------------------------------------------------------------
+if [ -x /sbin/chkconfig ] ; then
+ /sbin/chkconfig --add mysql
+fi
+
+# ----------------------------------------------------------------------
+# Create a MySQL user and group. Do not report any problems if they
+# already exist.
+# ----------------------------------------------------------------------
+groupadd -r %{mysqld_group} 2> /dev/null || true
+useradd -M -r -d $mysql_datadir -s /bin/bash -c "MySQL server" -g %{mysqld_group} %{mysqld_user} 2> /dev/null || true
+# The user may already exist, make sure it has the proper group nevertheless (BUG#12823)
+usermod -g %{mysqld_group} %{mysqld_user} 2> /dev/null || true
+
+# ----------------------------------------------------------------------
+# Change permissions so that the user that will run the MySQL daemon
+# owns all database files.
+# ----------------------------------------------------------------------
+chown -R %{mysqld_user}:%{mysqld_group} $mysql_datadir
+
+# ----------------------------------------------------------------------
+# Initialize the databases
+# ----------------------------------------------------------------------
+%{_bindir}/mysql_install_db --rpm --user=%{mysqld_user}
+
+# ----------------------------------------------------------------------
+# FIXME upgrade databases if needed would go here - but it cannot be
+# automated yet
+# ----------------------------------------------------------------------
+
+# ----------------------------------------------------------------------
+# Change permissions again to fix any new files.
+# ----------------------------------------------------------------------
+chown -R %{mysqld_user}:%{mysqld_group} $mysql_datadir
+
+# ----------------------------------------------------------------------
+# Fix permissions for the permission database so that only the user
+# can read them.
+# ----------------------------------------------------------------------
+chmod -R og-rw $mysql_datadir/mysql
+
+# ----------------------------------------------------------------------
+# install SELinux files - but don't override existing ones
+# ----------------------------------------------------------------------
+SETARGETDIR=/etc/selinux/targeted/src/policy
+SEDOMPROG=$SETARGETDIR/domains/program
+SECONPROG=$SETARGETDIR/file_contexts/program
+if [ -f /etc/redhat-release ] && \
+ (grep -q "Red Hat Enterprise Linux .. release 4" /etc/redhat-release \
+ || grep -q "CentOS release 4" /etc/redhat-release) ; then
+ echo
+ echo
+ echo 'Notes regarding SELinux on this platform:'
+ echo '========================================='
+ echo
+ echo 'The default policy might cause server startup to fail because it is '
+ echo 'not allowed to access critical files. In this case, please update '
+ echo 'your installation. '
+ echo
+  echo 'The default policy might also cause unavailability of SSL-related '
+ echo 'features because the server is not allowed to access /dev/random '
+ echo 'and /dev/urandom. If this is a problem, please do the following: '
+ echo
+ echo ' 1) install selinux-policy-targeted-sources from your OS vendor'
+ echo ' 2) add the following two lines to '$SEDOMPROG/mysqld.te':'
+ echo ' allow mysqld_t random_device_t:chr_file read;'
+ echo ' allow mysqld_t urandom_device_t:chr_file read;'
+ echo ' 3) cd to '$SETARGETDIR' and issue the following command:'
+ echo ' make load'
+ echo
+ echo
+fi
+
+if [ -x sbin/restorecon ] ; then
+ sbin/restorecon -R var/lib/mysql
+fi
+
+# Restart in the same way that mysqld will be started normally.
+if [ -x %{_sysconfdir}/init.d/mysql ] ; then
+ %{_sysconfdir}/init.d/mysql start
+ echo "Giving mysqld 2 seconds to start"
+ sleep 2
+fi
+
+# Allow mysqld_safe to start mysqld and print a message before we exit
+sleep 2
+
+%if %{CLUSTER_BUILD}
+%post -n MySQL%{cluster_package_prefix}storage%{package_suffix}
+# Create cluster directory if needed
+mkdir -p /var/lib/mysql-cluster
+%endif
+
+%preun -n Percona-XtraDB-server%{package_suffix}
+if [ $1 = 0 ] ; then
+ # Stop MySQL before uninstalling it
+ if [ -x %{_sysconfdir}/init.d/mysql ] ; then
+ %{_sysconfdir}/init.d/mysql stop > /dev/null
+ # Don't start it automatically anymore
+ if [ -x /sbin/chkconfig ] ; then
+ /sbin/chkconfig --del mysql
+ fi
+ fi
+fi
+
+# We do not remove the mysql user since it may still own a lot of
+# database files.
+
+# ----------------------------------------------------------------------
+# Clean up the BuildRoot after build is done
+# ----------------------------------------------------------------------
+%clean
+[ "$RPM_BUILD_ROOT" != "/" ] && [ -d $RPM_BUILD_ROOT ] && rm -rf $RPM_BUILD_ROOT;
+
+##############################################################################
+# Files section
+##############################################################################
+
+%files -n Percona-XtraDB-server%{package_suffix}
+%defattr(-,root,root,0755)
+
+%doc %{lic_files}
+%doc support-files/my-*.cnf
+%if %{CLUSTER_BUILD}
+%doc support-files/ndb-*.ini
+%endif
+
+%doc %attr(644, root, root) %{_infodir}/mysql.info*
+
+%if %{INNODB_BUILD}
+%doc %attr(644, root, man) %{_mandir}/man1/innochecksum.1*
+%endif
+%doc %attr(644, root, man) %{_mandir}/man1/my_print_defaults.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisam_ftdump.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisamchk.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisamlog.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisampack.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_convert_table_format.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_fix_extensions.1*
+%doc %attr(644, root, man) %{_mandir}/man8/mysqld.8*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqld_multi.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqld_safe.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_fix_privilege_tables.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_install_db.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_secure_installation.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_setpermission.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_upgrade.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlhotcopy.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlman.1*
+%doc %attr(644, root, man) %{_mandir}/man8/mysqlmanager.8*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql.server.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqltest.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_tzinfo_to_sql.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_zap.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlbug.1*
+%doc %attr(644, root, man) %{_mandir}/man1/perror.1*
+%doc %attr(644, root, man) %{_mandir}/man1/replace.1*
+%doc %attr(644, root, man) %{_mandir}/man1/resolve_stack_dump.1*
+%doc %attr(644, root, man) %{_mandir}/man1/resolveip.1*
+
+%ghost %config(noreplace,missingok) %{_sysconfdir}/my.cnf
+%ghost %config(noreplace,missingok) %{_sysconfdir}/mysqlmanager.passwd
+
+%if %{INNODB_BUILD}
+%attr(755, root, root) %{_bindir}/innochecksum
+%endif
+%attr(755, root, root) %{_bindir}/my_print_defaults
+%attr(755, root, root) %{_bindir}/myisam_ftdump
+%attr(755, root, root) %{_bindir}/myisamchk
+%attr(755, root, root) %{_bindir}/myisamlog
+%attr(755, root, root) %{_bindir}/myisampack
+%attr(755, root, root) %{_bindir}/mysql_convert_table_format
+%attr(755, root, root) %{_bindir}/mysql_fix_extensions
+%attr(755, root, root) %{_bindir}/mysql_fix_privilege_tables
+%attr(755, root, root) %{_bindir}/mysql_install_db
+%attr(755, root, root) %{_bindir}/mysql_secure_installation
+%attr(755, root, root) %{_bindir}/mysql_setpermission
+%attr(755, root, root) %{_bindir}/mysql_tzinfo_to_sql
+%attr(755, root, root) %{_bindir}/mysql_upgrade
+%attr(755, root, root) %{_bindir}/mysql_zap
+%attr(755, root, root) %{_bindir}/mysqlbug
+%attr(755, root, root) %{_bindir}/mysqld_multi
+%attr(755, root, root) %{_bindir}/mysqld_safe
+%attr(755, root, root) %{_bindir}/mysqldumpslow
+%attr(755, root, root) %{_bindir}/mysqlhotcopy
+%attr(755, root, root) %{_bindir}/mysqltest
+%attr(755, root, root) %{_bindir}/perror
+%attr(755, root, root) %{_bindir}/replace
+%attr(755, root, root) %{_bindir}/resolve_stack_dump
+%attr(755, root, root) %{_bindir}/resolveip
+
+%attr(755, root, root) %{_sbindir}/mysqld
+%if %{BUILD_DEBUG}
+%attr(755, root, root) %{_sbindir}/mysqld-debug
+%endif
+%attr(755, root, root) %{_sbindir}/mysqlmanager
+%attr(755, root, root) %{_sbindir}/rcmysql
+#%attr(644, root, root) %{_libdir}/mysql/mysqld.sym
+%if %{BUILD_DEBUG}
+#%attr(644, root, root) %{_libdir}/mysql/mysqld-debug.sym
+%endif
+
+%attr(644, root, root) %config(noreplace,missingok) %{_sysconfdir}/logrotate.d/mysql
+%attr(755, root, root) %{_sysconfdir}/init.d/mysql
+
+%attr(755, root, root) %{_datadir}/mysql/
+
+%files -n Percona-XtraDB-client%{package_suffix}
+%defattr(-, root, root, 0755)
+%attr(755, root, root) %{_bindir}/msql2mysql
+%attr(755, root, root) %{_bindir}/mysql
+%attr(755, root, root) %{_bindir}/mysql_find_rows
+%attr(755, root, root) %{_bindir}/mysql_waitpid
+%attr(755, root, root) %{_bindir}/mysqlaccess
+%attr(755, root, root) %{_bindir}/mysqladmin
+%attr(755, root, root) %{_bindir}/mysqlbinlog
+%attr(755, root, root) %{_bindir}/mysqlcheck
+%attr(755, root, root) %{_bindir}/mysqldump
+%attr(755, root, root) %{_bindir}/mysqlimport
+%attr(755, root, root) %{_bindir}/mysqlshow
+%attr(755, root, root) %{_bindir}/mysqlslap
+
+%doc %attr(644, root, man) %{_mandir}/man1/msql2mysql.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_find_rows.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_waitpid.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlaccess.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqladmin.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlbinlog.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlcheck.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqldump.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlimport.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlshow.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlslap.1*
+
+%post -n Percona-XtraDB-shared%{package_suffix}
+/sbin/ldconfig
+
+%postun -n Percona-XtraDB-shared%{package_suffix}
+/sbin/ldconfig
+
+%if %{CLUSTER_BUILD}
+%files -n MySQL%{cluster_package_prefix}storage%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_sbindir}/ndbd
+%doc %attr(644, root, man) %{_mandir}/man8/ndbd.8*
+
+%files -n MySQL%{cluster_package_prefix}management%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_sbindir}/ndb_mgmd
+%doc %attr(644, root, man) %{_mandir}/man8/ndb_mgmd.8*
+
+%files -n MySQL%{cluster_package_prefix}tools%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_bindir}/ndb_config
+%attr(755, root, root) %{_bindir}/ndb_desc
+%attr(755, root, root) %{_bindir}/ndb_error_reporter
+%attr(755, root, root) %{_bindir}/ndb_mgm
+%attr(755, root, root) %{_bindir}/ndb_restore
+%attr(755, root, root) %{_bindir}/ndb_select_all
+%attr(755, root, root) %{_bindir}/ndb_select_count
+%attr(755, root, root) %{_bindir}/ndb_show_tables
+%attr(755, root, root) %{_bindir}/ndb_size.pl
+%attr(755, root, root) %{_bindir}/ndb_test_platform
+%attr(755, root, root) %{_bindir}/ndb_waiter
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_config.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_desc.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_error_reporter.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_mgm.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_restore.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_select_all.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_select_count.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_show_tables.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_size.pl.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_waiter.1*
+
+%files -n MySQL%{cluster_package_prefix}extra%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_bindir}/ndb_delete_all
+%attr(755, root, root) %{_bindir}/ndb_drop_index
+%attr(755, root, root) %{_bindir}/ndb_drop_table
+%attr(755, root, root) %{_sbindir}/ndb_cpcd
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_delete_all.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_drop_index.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_drop_table.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_cpcd.1*
+%endif
+
+%files -n Percona-XtraDB-devel%{package_suffix}
+%defattr(-, root, root, 0755)
+%if %{commercial}
+%else
+%doc EXCEPTIONS-CLIENT
+%endif
+%doc %attr(644, root, man) %{_mandir}/man1/comp_err.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_config.1*
+%attr(755, root, root) %{_bindir}/mysql_config
+%dir %attr(755, root, root) %{_libdir}/mysql
+%{_includedir}/mysql
+%{_datadir}/aclocal/mysql.m4
+%{_libdir}/mysql/libdbug.a
+%{_libdir}/mysql/libheap.a
+%if %{WITH_LIBGCC}
+%{_libdir}/mysql/libmygcc.a
+%endif
+%{_libdir}/mysql/libmyisam.a
+%{_libdir}/mysql/libmyisammrg.a
+%{_libdir}/mysql/libmysqlclient.a
+%{_libdir}/mysql/libmysqlclient.la
+%{_libdir}/mysql/libmysqlclient_r.a
+%{_libdir}/mysql/libmysqlclient_r.la
+%{_libdir}/mysql/libmystrings.a
+%{_libdir}/mysql/libmysys.a
+%{_libdir}/mysql/libvio.a
+%{_libdir}/mysql/libz.a
+%{_libdir}/mysql/libz.la
+%if %{CLUSTER_BUILD}
+%{_libdir}/mysql/libndbclient.a
+%{_libdir}/mysql/libndbclient.la
+%endif
+
+%files -n Percona-XtraDB-shared%{package_suffix}
+%defattr(-, root, root, 0755)
+# Shared libraries (omit for architectures that don't support them)
+%{_libdir}/*.so*
+
+%files -n Percona-XtraDB-test%{package_suffix}
+%defattr(-, root, root, 0755)
+%{_datadir}/mysql-test
+%attr(755, root, root) %{_bindir}/mysql_client_test
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_client_test.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql-stress-test.pl.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql-test-run.pl.1*
+
+%files -n Percona-XtraDB-%{pluginversion}-%{xtradbversion}
+%defattr(-, root, root, 0755)
+%attr(644, root, root) %{_libdir}/mysql/plugin/ha_innodb.so*
+%attr(644, root, root) %{_libdir}/mysql/plugin/install_innodb_plugins.sql
+
+##############################################################################
+# The spec file changelog only includes changes made to the spec file
+# itself - note that they must be ordered by date (important when
+# merging BK trees)
+##############################################################################
+%changelog
+* Mon Mar 22 2010 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+XtraDB Release 10
+
+* Thu Feb 11 2010 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+Package name changed to Percona-XtraDB
+
+* Tue Jan 05 2010 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+- Corrected emails
+- -m64 is removed from CFLAGS
+
+* Tue Apr 21 2009 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+- Adoption for XtraDB Storage Engine
+
+* Fri Nov 07 2008 Joerg Bruehe <joerg@mysql.com>
+
+- Modify CFLAGS and CXXFLAGS such that a debug build is not optimized.
+ This should cover both gcc and icc flags. Fixes bug#40546.
+
+* Mon Aug 18 2008 Joerg Bruehe <joerg@mysql.com>
+
+- Get rid of the "warning: Installed (but unpackaged) file(s) found:"
+ Some generated files aren't needed in RPMs:
+ - the "sql-bench/" subdirectory
+ Some files were missing:
+ - /usr/share/aclocal/mysql.m4 ("devel" subpackage)
+ - Manuals for embedded tests ("test" subpackage)
+ - Manual "mysqlbug" ("server" subpackage)
+ - Manual "mysql_find_rows" ("client" subpackage)
+
+* Wed Jun 11 2008 Kent Boortz <kent@mysql.com>
+
+- Removed the Example storage engine, it is not to be in products
+
+* Fri Apr 04 2008 Daniel Fischer <df@mysql.com>
+
+- Added Cluster+InnoDB product
+
+* Mon Mar 31 2008 Kent Boortz <kent@mysql.com>
+
+- Made the "Federated" storage engine an option
+
+* Tue Mar 11 2008 Joerg Bruehe <joerg@mysql.com>
+
+- Cleanup: Remove manual file "mysql_tableinfo.1".
+
+* Mon Feb 18 2008 Timothy Smith <tim@mysql.com>
+
+- Require a manual upgrade if the already-installed mysql-server is
+ from another vendor, or is of a different major version.
+
+* Fri Dec 14 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add the "%doc" directive for all man pages and other documentation;
+ also, some re-ordering to reduce differences between spec files.
+
+* Fri Dec 14 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Added "client/mysqlslap" (bug#32077)
+
+* Wed Oct 31 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Explicitly handle InnoDB using its own variable and "--with"/"--without"
+ options, because the "configure" default is "yes".
+ Also, fix the specification of "community" to include "partitioning".
+
+* Mon Sep 03 2007 Kent Boortz <kent@mysql.com>
+
+- Let libmygcc be included unless "--without libgcc" is given.
+
+* Sun Sep 02 2007 Kent Boortz <kent@mysql.com>
+
+- Changed SSL flag given to configure to "--with-ssl"
+- Removed symbolic link "safe_mysqld"
+- Removed script and man page for "mysql_explain_log"
+- Removed scripts "mysql_tableinfo" and "mysql_upgrade_shell"
+- Removed "comp_err" from list to install
+- Removed duplicates of "libndbclient.a" and "libndbclient.la"
+
+* Tue Jul 17 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add the man page for "mysql-stress-test.pl" to the "test" RPM
+ (consistency in fixing bug#21023, the script is handled by "Makefile.am")
+
+* Wed Jul 11 2007 Daniel Fischer <df@mysql.com>
+
+- Change the way broken SELinux policies on RHEL4 and CentOS 4
+ are handled to be more likely to actually work
+
+* Thu Jun 05 2007 kent Boortz <kent@mysql.com>
+
+- Enabled the CSV engine in all builds
+
+* Thu May 3 2007 Mads Martin Joergensen <mmj@mysql.com>
+
+- Spring cleanup
+
+* Thu Apr 19 2007 Mads Martin Joergensen <mmj@mysql.com>
+
+- If sbin/restorecon exists then run it
+
+* Wed Apr 18 2007 Kent Boortz <kent@mysql.com>
+
+- Packed unpacked files
+
+ /usr/sbin/ndb_cpcd
+ /usr/bin/mysql_upgrade_shell
+ /usr/bin/innochecksum
+ /usr/share/man/man1/ndb_cpcd.1.gz
+ /usr/share/man/man1/innochecksum.1.gz
+ /usr/share/man/man1/mysql_fix_extensions.1.gz
+ /usr/share/man/man1/mysql_secure_installation.1.gz
+ /usr/share/man/man1/mysql_tableinfo.1.gz
+ /usr/share/man/man1/mysql_waitpid.1.gz
+
+- Commands currently not installed but that have man pages
+
+ /usr/share/man/man1/make_win_bin_dist.1.gz
+ /usr/share/man/man1/make_win_src_distribution.1.gz
+ /usr/share/man/man1/mysql-stress-test.pl.1.gz
+ /usr/share/man/man1/ndb_print_backup_file.1.gz
+ /usr/share/man/man1/ndb_print_schema_file.1.gz
+ /usr/share/man/man1/ndb_print_sys_file.1.gz
+
+* Thu Mar 22 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add "comment" options to the test runs, for better log analysis.
+
+* Wed Mar 21 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add even more man pages.
+
+* Fri Mar 16 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Build the server twice, once as "mysqld-debug" and once as "mysqld";
+ test them both, and include them in the resulting file.
+- Consequences of the fix for bug#20166:
+ Remove "mysql_create_system_tables",
+ new "mysql_fix_privilege_tables.sql" is included implicitly.
+
+* Wed Mar 14 2007 Daniel Fischer <df@mysql.com>
+
+- Adjust compile options some more and change naming of community
+ cluster RPMs to explicitly say 'cluster'.
+
+* Mon Mar 12 2007 Daniel Fischer <df@mysql.com>
+
+- Adjust compile options and other settings for 5.0 community builds.
+
+* Fri Mar 02 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add several man pages which are now created.
+
+* Mon Jan 29 2007 Mads Martin Joergensen <mmj@mysql.com>
+
+- Make sure SELinux works correctly. Files from Colin Charles.
+
+* Fri Jan 05 2007 Kent Boortz <kent@mysql.com>
+
+- Add CFLAGS to gcc call with --print-libgcc-file, to make sure the
+ correct "libgcc.a" path is returned for the 32/64 bit architecture.
+
+* Tue Dec 19 2006 Joerg Bruehe <joerg@mysql.com>
+
+- The man page for "mysqld" is now in section 8.
+
+* Thu Dec 14 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Include the new man pages for "my_print_defaults" and "mysql_tzinfo_to_sql"
+ in the server RPM.
+- The "mysqlmanager" man page was relocated to section 8, reflect that.
+
+* Fri Nov 17 2006 Mads Martin Joergensen <mmj@mysql.com>
+
+- Really fix obsoletes/provides for community -> this
+- Make it possible to not run the test suite by setting
+ MYSQL_RPMBUILD_TEST to "no"
+
+* Wed Nov 15 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Switch from "make test*" to explicit calls of the test suite,
+ so that "report features" can be used.
+
+* Wed Nov 15 2006 Kent Boortz <kent@mysql.com>
+
+- Added "--with cluster" and "--define cluster{_gpl}"
+
+* Tue Oct 24 2006 Mads Martin Joergensen <mmj@mysql.com>
+
+- The "shared" subpackage needs to Provide/Obsolete mysql-shared
+
+* Mon Oct 23 2006 Mads Martin Joergensen <mmj@mysql.com>
+
+- Run sbin/restorecon after db init (Bug#12676)
+
+* Thu Jul 06 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Correct a typing error in my previous change.
+
+* Tue Jul 04 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Use the Perl script to run the tests, because it will automatically check
+ whether the server is configured with SSL.
+
+* Wed Jun 28 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Revert all previous attempts to call "mysql_upgrade" during RPM upgrade,
+ there are some more aspects which need to be solved before this is possible.
+ For now, just ensure the binary "mysql_upgrade" is delivered and installed.
+
+* Wed Jun 28 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Move "mysqldumpslow" from the client RPM to the server RPM (bug#20216).
+
+* Wed Jun 21 2006 Joerg Bruehe <joerg@mysql.com>
+
+- To run "mysql_upgrade", we need a running server;
+ start it in isolation and skip password checks.
+
+* Sat May 23 2006 Kent Boortz <kent@mysql.com>
+
+- Always compile for PIC, position independent code.
+
+* Fri Apr 28 2006 Kent Boortz <kent@mysql.com>
+
+- Install and run "mysql_upgrade"
+
+* Sat Apr 01 2006 Kent Boortz <kent@mysql.com>
+
+- Allow to override $LDFLAGS
+
+* Fri Jan 06 2006 Lenz Grimmer <lenz@mysql.com>
+
+- added a MySQL-test subpackage (BUG#16070)
+
+* Tue Dec 27 2005 Joerg Bruehe <joerg@mysql.com>
+
+- Some minor alignment with the 4.1 version
+
+* Wed Dec 14 2005 Rodrigo Novo <rodrigo@mysql.com>
+
+- Cosmetic changes: source code location & rpm packager
+- Protect "nm -D" against libtool weirdness
+- Add libz.a & libz.la to the list of files for subpackage -devel
+- moved --with-zlib-dir=bundled out of BuildMySQL, as it doesn't make
+ sense for the shared package
+
+* Tue Nov 22 2005 Joerg Bruehe <joerg@mysql.com>
+
+- Extend the file existence check for "init.d/mysql" on un-install
+ to also guard the call to "insserv"/"chkconfig".
+
+* Wed Nov 16 2005 Lenz Grimmer <lenz@mysql.com>
+
+- added mysql_client_test to the "client" subpackage (BUG#14546)
+
+* Tue Nov 15 2005 Lenz Grimmer <lenz@mysql.com>
+
+- changed default definitions to build a standard GPL release when not
+ defining anything else
+- install the shared libs more elegantly by using "make install"
+
+* Wed Oct 19 2005 Kent Boortz <kent@mysql.com>
+
+- Made yaSSL support an option (off by default)
+
+* Wed Oct 19 2005 Kent Boortz <kent@mysql.com>
+
+- Enabled yaSSL support
+
+* Thu Oct 13 2005 Lenz Grimmer <lenz@mysql.com>
+
+- added a usermod call to assign a potentially existing mysql user to the
+  correct user group (BUG#12823)
+- added a separate macro "mysqld_group" to be able to define the
+  user group of the mysql user separately, if desired.
+
+* Fri Oct 1 2005 Kent Boortz <kent@mysql.com>
+
+- Copy the config.log file to location outside
+ the build tree
+
+* Fri Sep 30 2005 Lenz Grimmer <lenz@mysql.com>
+
+- don't use install-strip to install the binaries (strip segfaults on
+ icc-compiled binaries on IA64)
+
+* Thu Sep 22 2005 Lenz Grimmer <lenz@mysql.com>
+
+- allow overriding the CFLAGS (needed for Intel icc compiles)
+- replace the CPPFLAGS=-DBIG_TABLES with "--with-big-tables" configure option
+
+* Fri Aug 19 2005 Joerg Bruehe <joerg@mysql.com>
+
+- Protect against failing tests.
+
+* Thu Aug 04 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed the creation of the mysql user group account in the postinstall
+ section (BUG 12348)
+
+* Fri Jul 29 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed external RPM Requirements to better suit the target distribution
+ (BUG 12233)
+
+* Fri Jul 15 2005 Lenz Grimmer <lenz@mysql.com>
+
+- create a "mysql" user group and assign the mysql user account to that group
+ in the server postinstall section. (BUG 10984)
+
+* Wed Jun 01 2005 Lenz Grimmer <lenz@mysql.com>
+
+- use "mysqldatadir" variable instead of hard-coding the path multiple times
+- use the "mysqld_user" variable on all occasions a user name is referenced
+- removed (incomplete) Brazilian translations
+- removed redundant release tags from the subpackage descriptions
+
+* Fri May 27 2005 Lenz Grimmer <lenz@mysql.com>
+
+- fixed file list (removed libnisam.a and libmerge.a from the devel subpackage)
+- force running the test suite
+
+* Wed Apr 20 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Enabled the "blackhole" storage engine for the Max RPM
+
+* Wed Apr 13 2005 Lenz Grimmer <lenz@mysql.com>
+
+- removed the MySQL manual files (html/ps/texi) - they have been removed
+  from the MySQL sources and are now available separately.
+
+* Mon Apr 4 2005 Petr Chardin <petr@mysql.com>
+
+- old mysqlmanager, mysqlmanagerc and mysqlmanager-pwgen renamed into
+  mysqltestmanager, mysqltestmanagerc and mysqltestmanager-pwgen respectively
+
+* Fri Mar 18 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Disabled RAID in the Max binaries once and for all (it has finally been
+ removed from the source tree)
+
+* Sun Feb 20 2005 Petr Chardin <petr@mysql.com>
+
+- Install MySQL Instance Manager together with mysqld, touch mysqlmanager
+ password file
+
+* Mon Feb 14 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed the compilation comments and moved them into the separate build sections
+ for Max and Standard
+
+* Mon Feb 7 2005 Tomas Ulin <tomas@mysql.com>
+
+- enabled the "Ndbcluster" storage engine for the max binary
+- added extra make install in ndb subdir after Max build to get ndb binaries
+- added packages for ndbcluster storage engine
+
+* Fri Jan 14 2005 Lenz Grimmer <lenz@mysql.com>
+
+- replaced obsoleted "BuildPrereq" with "BuildRequires" instead
+
+* Thu Jan 13 2005 Lenz Grimmer <lenz@mysql.com>
+
+- enabled the "Federated" storage engine for the max binary
+
+* Tue Jan 04 2005 Petr Chardin <petr@mysql.com>
+
+- ISAM and merge storage engines were purged, as well as the corresponding
+  tools and man pages (isamchk and isamlog)
+
+* Thu Dec 31 2004 Lenz Grimmer <lenz@mysql.com>
+
+- enabled the "Archive" storage engine for the max binary
+- enabled the "CSV" storage engine for the max binary
+- enabled the "Example" storage engine for the max binary
+
+* Thu Aug 26 2004 Lenz Grimmer <lenz@mysql.com>
+
+- MySQL-Max now requires MySQL-server instead of MySQL (BUG 3860)
+
+* Fri Aug 20 2004 Lenz Grimmer <lenz@mysql.com>
+
+- do not link statically on IA64/AMD64 as these systems do not have
+ a patched glibc installed
+
+* Tue Aug 10 2004 Lenz Grimmer <lenz@mysql.com>
+
+- Added libmygcc.a to the devel subpackage (required to link applications
+  against the embedded server libmysqld.a) (BUG 4921)
+
+* Mon Aug 09 2004 Lenz Grimmer <lenz@mysql.com>
+
+- Added EXCEPTIONS-CLIENT to the "devel" package
+
+* Thu Jul 29 2004 Lenz Grimmer <lenz@mysql.com>
+
+- disabled OpenSSL in the Max binaries again (the RPM packages were the
+ only exception to this anyway) (BUG 1043)
+
+* Wed Jun 30 2004 Lenz Grimmer <lenz@mysql.com>
+
+- fixed server postinstall (mysql_install_db was called with the wrong
+ parameter)
+
+* Thu Jun 24 2004 Lenz Grimmer <lenz@mysql.com>
+
+- added mysql_tzinfo_to_sql to the server subpackage
+- run "make clean" instead of "make distclean"
+
+* Mon Apr 05 2004 Lenz Grimmer <lenz@mysql.com>
+
+- added ncurses-devel to the build prerequisites (BUG 3377)
+
+* Thu Feb 12 2004 Lenz Grimmer <lenz@mysql.com>
+
+- when using gcc, _always_ use CXX=gcc
+- replaced Copyright with License field (Copyright is obsolete)
+
+* Tue Feb 03 2004 Lenz Grimmer <lenz@mysql.com>
+
+- added myisam_ftdump to the Server package
+
+* Tue Jan 13 2004 Lenz Grimmer <lenz@mysql.com>
+
+- link the mysql client against libreadline instead of libedit (BUG 2289)
+
+* Mon Dec 22 2003 Lenz Grimmer <lenz@mysql.com>
+
+- marked /etc/logrotate.d/mysql as a config file (BUG 2156)
+
+* Fri Dec 13 2003 Lenz Grimmer <lenz@mysql.com>
+
+- fixed file permissions (BUG 1672)
+
+* Thu Dec 11 2003 Lenz Grimmer <lenz@mysql.com>
+
+- made testing for gcc3 a bit more robust
+
+* Fri Dec 05 2003 Lenz Grimmer <lenz@mysql.com>
+
+- added missing file mysql_create_system_tables to the server subpackage
+
+* Fri Nov 21 2003 Lenz Grimmer <lenz@mysql.com>
+
+- removed dependency on MySQL-client from the MySQL-devel subpackage
+ as it is not really required. (BUG 1610)
+
+* Fri Aug 29 2003 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed BUG 1162 (removed macro names from the changelog)
+- Really fixed BUG 998 (disable the checking for installed but
+ unpackaged files)
+
+* Tue Aug 05 2003 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed BUG 959 (libmysqld not being compiled properly)
+- Fixed BUG 998 (RPM build errors): added missing files to the
+ distribution (mysql_fix_extensions, mysql_tableinfo, mysqldumpslow,
+ mysql_fix_privilege_tables.1), removed "-n" from install section.
+
+* Wed Jul 09 2003 Lenz Grimmer <lenz@mysql.com>
+
+- removed the GIF Icon (file was not included in the sources anyway)
+- removed unused variable shared_lib_version
+- do not run automake before building the standard binary
+ (should not be necessary)
+- add server suffix '-standard' to standard binary (to be in line
+ with the binary tarball distributions)
+- Use more RPM macros (_exec_prefix, _sbindir, _libdir, _sysconfdir,
+ _datadir, _includedir) throughout the spec file.
+- allow overriding CC and CXX (required when building with other compilers)
+
+* Fri May 16 2003 Lenz Grimmer <lenz@mysql.com>
+
+- re-enabled RAID again
+
+* Wed Apr 30 2003 Lenz Grimmer <lenz@mysql.com>
+
+- disabled MyISAM RAID (--with-raid) - it throws an assertion which
+ needs to be investigated first.
+
+* Mon Mar 10 2003 Lenz Grimmer <lenz@mysql.com>
+
+- added missing file mysql_secure_installation to server subpackage
+ (BUG 141)
+
+* Tue Feb 11 2003 Lenz Grimmer <lenz@mysql.com>
+
+- re-added missing pre- and post(un)install scripts to server subpackage
+- added config file /etc/my.cnf to the file list (just for completeness)
+- make sure to create the datadir with 755 permissions
+
+* Mon Jan 27 2003 Lenz Grimmer <lenz@mysql.com>
+
+- removed unused CC and CXX variables
+- CFLAGS and CXXFLAGS should honor RPM_OPT_FLAGS
+
+* Fri Jan 24 2003 Lenz Grimmer <lenz@mysql.com>
+
+- renamed package "MySQL" to "MySQL-server"
+- fixed Copyright tag
+- added mysql_waitpid to client subpackage (required for mysql-test-run)
+
+* Wed Nov 27 2002 Lenz Grimmer <lenz@mysql.com>
+
+- moved init script from /etc/rc.d/init.d to /etc/init.d (the majority of
+ Linux distributions now support this scheme as proposed by the LSB either
+ directly or via a compatibility symlink)
+- Use new "restart" init script action instead of starting and stopping
+ separately
+- Be more flexible in activating the automatic bootup - use insserv (on
+ older SuSE versions) or chkconfig (Red Hat, newer SuSE versions and
+ others) to create the respective symlinks
+
+* Wed Sep 25 2002 Lenz Grimmer <lenz@mysql.com>
+
+- MySQL-Max now requires MySQL >= 4.0 to avoid version mismatches
+ (mixing 3.23 and 4.0 packages)
+
+* Fri Aug 09 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Turn off OpenSSL in MySQL-Max for now until it works properly again
+- enable RAID for the Max binary instead
+- added compatibility link: safe_mysqld -> mysqld_safe to ease the
+ transition from 3.23
+
+* Thu Jul 18 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Reworked the build steps a little bit: the Max binary is supposed
+ to include OpenSSL, which cannot be linked statically, thus trying
+ to statically link against a special glibc is futile anyway
+- because of this, it is not required to make yet another build run
+ just to compile the shared libs (saves a lot of time)
+- updated package description of the Max subpackage
+- clean up the BuildRoot directory afterwards
+
+* Mon Jul 15 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Updated Packager information
+- Fixed the build options: the regular package is supposed to
+ include InnoDB and linked statically, while the Max package
+ should include BDB and SSL support
+
+* Fri May 03 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Use more RPM macros (e.g. infodir, mandir) to make the spec
+ file more portable
+- reorganized the installation of documentation files: let RPM
+ take care of this
+- reorganized the file list: actually install man pages along
+ with the binaries of the respective subpackage
+- do not include libmysqld.a in the devel subpackage as well, if we
+ have a special "embedded" subpackage
+- reworked the package descriptions
+
+* Mon Oct 8 2001 Monty
+
+- Added embedded server as a separate RPM
+
+* Fri Apr 13 2001 Monty
+
+- Added mysqld-max to the distribution
+
+* Tue Jan 2 2001 Monty
+
+- Added mysql-test to the bench package
+
+* Fri Aug 18 2000 Tim Smith <tim@mysql.com>
+
+- Added separate libmysql_r directory; now both a threaded
+  and a non-threaded library are shipped.
+
+* Wed Sep 28 1999 David Axmark <davida@mysql.com>
+
+- Added the support-files/my-example.cnf to the docs directory.
+
+- Removed devel dependency on base since it is about client
+ development.
+
+* Wed Sep 8 1999 David Axmark <davida@mysql.com>
+
+- Cleaned up some for 3.23.
+
+* Thu Jul 1 1999 David Axmark <davida@mysql.com>
+
+- Added support for shared libraries in a separate sub
+ package. Original fix by David Fox (dsfox@cogsci.ucsd.edu)
+
+- The --enable-assembler switch is now automatically disabled on
+  platforms where assembler code is unavailable. This should allow
+  building this RPM on non-i386 systems.
+
+* Mon Feb 22 1999 David Axmark <david@detron.se>
+
+- Removed unportable cc switches from the spec file. The defaults can
+ now be overridden with environment variables. This feature is used
+ to compile the official RPM with optimal (but compiler version
+ specific) switches.
+
+- Removed the repetitive description parts for the sub rpms. Maybe add
+ again if RPM gets a multiline macro capability.
+
+- Added support for a pt_BR translation. Translation contributed by
+ Jorge Godoy <jorge@bestway.com.br>.
+
+* Wed Nov 4 1998 David Axmark <david@detron.se>
+
+- A lot of changes in all the rpm and install scripts. This may even
+ be a working RPM :-)
+
+* Sun Aug 16 1998 David Axmark <david@detron.se>
+
+- A developer's changelog for MySQL is available in the source RPM, and
+  there is a history of major user-visible changes in the Reference
+  Manual. Only RPM-specific changes will be documented here.
diff --git a/storage/xtradb/data/data0data.c b/storage/xtradb/data/data0data.c
new file mode 100644
index 00000000000..0715b49bf9c
--- /dev/null
+++ b/storage/xtradb/data/data0data.c
@@ -0,0 +1,779 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file data/data0data.c
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "data0data.h"
+
+#ifdef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+
+#include <ctype.h>
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+UNIV_INTERN byte data_error;
+
+# ifndef UNIV_DEBUG_VALGRIND
+/** this is used to fool the compiler in dtuple_validate */
+UNIV_INTERN ulint data_dummy;
+# endif /* !UNIV_DEBUG_VALGRIND */
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data) /*!< in: data */
+{
+ if (len != dfield_get_len(field)) {
+
+ return(FALSE);
+ }
+
+ if (len == UNIV_SQL_NULL) {
+
+ return(TRUE);
+ }
+
+ if (0 != memcmp(dfield_get_data(field), data, len)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0, or -1 if tuple1 is greater than, equal to, or less than
+tuple2, respectively */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+ const dtuple_t* tuple1, /*!< in: tuple 1 */
+ const dtuple_t* tuple2) /*!< in: tuple 2 */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(tuple1 && tuple2);
+ ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple1));
+ ut_ad(dtuple_check_typed(tuple2));
+
+ n_fields = dtuple_get_n_fields(tuple1);
+
+ if (n_fields != dtuple_get_n_fields(tuple2)) {
+
+ return(n_fields < dtuple_get_n_fields(tuple2) ? -1 : 1);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ int cmp;
+ const dfield_t* field1 = dtuple_get_nth_field(tuple1, i);
+ const dfield_t* field2 = dtuple_get_nth_field(tuple2, i);
+
+ cmp = cmp_dfield_dfield(field1, field2);
+
+ if (cmp) {
+ return(cmp);
+ }
+ }
+
+ return(0);
+}
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(tuple);
+
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+}
+
+/**********************************************************//**
+Checks that a data field is typed.
+@return TRUE if ok */
+static
+ibool
+dfield_check_typed_no_assert(
+/*=========================*/
+ const dfield_t* field) /*!< in: data field */
+{
+ if (dfield_get_type(field)->mtype > DATA_MYSQL
+ || dfield_get_type(field)->mtype < DATA_VARCHAR) {
+
+ fprintf(stderr,
+ "InnoDB: Error: data field type %lu, len %lu\n",
+ (ulong) dfield_get_type(field)->mtype,
+ (ulong) dfield_get_len(field));
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint i;
+
+ if (dtuple_get_n_fields(tuple) > REC_MAX_N_FIELDS) {
+ fprintf(stderr,
+ "InnoDB: Error: index entry has %lu fields\n",
+ (ulong) dtuple_get_n_fields(tuple));
+dump:
+ fputs("InnoDB: Tuple contents: ", stderr);
+ dtuple_print(stderr, tuple);
+ putc('\n', stderr);
+
+ return(FALSE);
+ }
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_check_typed_no_assert(field)) {
+ goto dump;
+ }
+ }
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field) /*!< in: data field */
+{
+ if (dfield_get_type(field)->mtype > DATA_MYSQL
+ || dfield_get_type(field)->mtype < DATA_VARCHAR) {
+
+ fprintf(stderr,
+ "InnoDB: Error: data field type %lu, len %lu\n",
+ (ulong) dfield_get_type(field)->mtype,
+ (ulong) dfield_get_len(field));
+
+ ut_error;
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ ut_a(dfield_check_typed(field));
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = dtuple_get_n_fields(tuple);
+
+ /* We dereference all the data of each field to test
+ for memory traps */
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (!dfield_is_null(field)) {
+
+ const byte* data = dfield_get_data(field);
+#ifndef UNIV_DEBUG_VALGRIND
+ ulint j;
+
+ for (j = 0; j < len; j++) {
+
+ data_dummy += *data; /* fool the compiler not
+ to optimize out this
+ code */
+ data++;
+ }
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+ UNIV_MEM_ASSERT_RW(data, len);
+ }
+ }
+
+ ut_a(dtuple_check_typed(tuple));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ const byte* data;
+ ulint len;
+ ulint i;
+
+ len = dfield_get_len(dfield);
+ data = dfield_get_data(dfield);
+
+ if (dfield_is_null(dfield)) {
+ fputs("NULL", stderr);
+
+ return;
+ }
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ for (i = 0; i < len; i++) {
+ int c = *data++;
+ putc(isprint(c) ? c : ' ', stderr);
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+ break;
+ case DATA_INT:
+ ut_a(len == 4); /* only works for 32-bit integers */
+ fprintf(stderr, "%d", (int)mach_read_from_4(data));
+ break;
+ default:
+ ut_error;
+ }
+}
+
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. The hex string is
+also printed if the string contains non-printable characters. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ const byte* data;
+ ulint len;
+ ulint prtype;
+ ulint i;
+ ibool print_also_hex;
+
+ len = dfield_get_len(dfield);
+ data = dfield_get_data(dfield);
+
+ if (dfield_is_null(dfield)) {
+ fputs("NULL", stderr);
+
+ return;
+ }
+
+ prtype = dtype_get_prtype(dfield_get_type(dfield));
+
+ switch (dtype_get_mtype(dfield_get_type(dfield))) {
+ dulint id;
+ case DATA_INT:
+ switch (len) {
+ ulint val;
+ case 1:
+ val = mach_read_from_1(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x80;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 2:
+ val = mach_read_from_2(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x8000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 3:
+ val = mach_read_from_3(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x800000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 4:
+ val = mach_read_from_4(data);
+
+ if (!(prtype & DATA_UNSIGNED)) {
+ val &= ~0x80000000;
+ fprintf(stderr, "%ld", (long) val);
+ } else {
+ fprintf(stderr, "%lu", (ulong) val);
+ }
+ break;
+
+ case 6:
+ id = mach_read_from_6(data);
+ fprintf(stderr, "{%lu %lu}",
+ ut_dulint_get_high(id),
+ ut_dulint_get_low(id));
+ break;
+
+ case 7:
+ id = mach_read_from_7(data);
+ fprintf(stderr, "{%lu %lu}",
+ ut_dulint_get_high(id),
+ ut_dulint_get_low(id));
+ break;
+ case 8:
+ id = mach_read_from_8(data);
+ fprintf(stderr, "{%lu %lu}",
+ ut_dulint_get_high(id),
+ ut_dulint_get_low(id));
+ break;
+ default:
+ goto print_hex;
+ }
+ break;
+
+ case DATA_SYS:
+ switch (prtype & DATA_SYS_PRTYPE_MASK) {
+ case DATA_TRX_ID:
+ id = mach_read_from_6(data);
+
+ fprintf(stderr, "trx_id " TRX_ID_FMT,
+ TRX_ID_PREP_PRINTF(id));
+ break;
+
+ case DATA_ROLL_PTR:
+ id = mach_read_from_7(data);
+
+ fprintf(stderr, "roll_ptr {%lu %lu}",
+ ut_dulint_get_high(id), ut_dulint_get_low(id));
+ break;
+
+ case DATA_ROW_ID:
+ id = mach_read_from_6(data);
+
+ fprintf(stderr, "row_id {%lu %lu}",
+ ut_dulint_get_high(id), ut_dulint_get_low(id));
+ break;
+
+ default:
+ id = mach_dulint_read_compressed(data);
+
+ fprintf(stderr, "mix_id {%lu %lu}",
+ ut_dulint_get_high(id), ut_dulint_get_low(id));
+ }
+ break;
+
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ print_also_hex = FALSE;
+
+ for (i = 0; i < len; i++) {
+ int c = *data++;
+
+ if (!isprint(c)) {
+ print_also_hex = TRUE;
+
+ fprintf(stderr, "\\x%02x", (unsigned char) c);
+ } else {
+ putc(c, stderr);
+ }
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+
+ if (!print_also_hex) {
+ break;
+ }
+
+ data = dfield_get_data(dfield);
+ /* fall through */
+
+ case DATA_BINARY:
+ default:
+print_hex:
+ fputs(" Hex: ",stderr);
+
+ for (i = 0; i < len; i++) {
+ fprintf(stderr, "%02lx", (ulint) *data++);
+ }
+
+ if (dfield_is_ext(dfield)) {
+ fputs("(external)", stderr);
+ }
+ }
+}
+
+/*************************************************************//**
+Print a dfield value using ut_print_buf. */
+static
+void
+dfield_print_raw(
+/*=============*/
+ FILE* f, /*!< in: output stream */
+ const dfield_t* dfield) /*!< in: dfield */
+{
+ ulint len = dfield_get_len(dfield);
+ if (!dfield_is_null(dfield)) {
+ ulint print_len = ut_min(len, 1000);
+ ut_print_buf(f, dfield_get_data(dfield), print_len);
+ if (len != print_len) {
+ fprintf(f, "(total %lu bytes%s)",
+ (ulong) len,
+ dfield_is_ext(dfield) ? ", external" : "");
+ }
+ } else {
+ fputs(" SQL NULL", f);
+ }
+}
+
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_fields;
+ ulint i;
+
+ n_fields = dtuple_get_n_fields(tuple);
+
+ fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+ fprintf(f, " %lu:", (ulong) i);
+
+ dfield_print_raw(f, dtuple_get_nth_field(tuple, i));
+
+ putc(';', f);
+ putc('\n', f);
+ }
+
+ ut_ad(dtuple_validate(tuple));
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of the tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is not clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext) /*!< in/out: number of
+ externally stored columns */
+{
+ mem_heap_t* heap;
+ big_rec_t* vector;
+ dfield_t* dfield;
+ dict_field_t* ifield;
+ ulint size;
+ ulint n_fields;
+ ulint local_len;
+ ulint local_prefix_len;
+
+ if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+ return(NULL);
+ }
+
+ if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
+ /* up to MySQL 5.1: store a 768-byte prefix locally */
+ local_len = BTR_EXTERN_FIELD_REF_SIZE + DICT_MAX_INDEX_COL_LEN;
+ } else {
+ /* new-format table: do not store any BLOB prefix locally */
+ local_len = BTR_EXTERN_FIELD_REF_SIZE;
+ }
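+	/* Note: assuming the usual InnoDB constants (BTR_EXTERN_FIELD_REF_SIZE
+	= 20 and DICT_MAX_INDEX_COL_LEN = 768), the old-format local_len above
+	is 788 bytes, which matches the comment further below. */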
+
+ ut_a(dtuple_check_typed_no_assert(entry));
+
+ size = rec_get_converted_size(index, entry, *n_ext);
+
+ if (UNIV_UNLIKELY(size > 1000000000)) {
+ fprintf(stderr,
+ "InnoDB: Warning: tuple size very big: %lu\n",
+ (ulong) size);
+ fputs("InnoDB: Tuple contents: ", stderr);
+ dtuple_print(stderr, entry);
+ putc('\n', stderr);
+ }
+
+ heap = mem_heap_create(size + dtuple_get_n_fields(entry)
+ * sizeof(big_rec_field_t) + 1000);
+
+ vector = mem_heap_alloc(heap, sizeof(big_rec_t));
+
+ vector->heap = heap;
+ vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
+ * sizeof(big_rec_field_t));
+
+ /* Decide which fields to shorten: the algorithm is to look for
+ a variable-length field that yields the biggest savings when
+ stored externally */
+
+ n_fields = 0;
+
+ while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
+ *n_ext),
+ dict_table_is_comp(index->table),
+ dict_index_get_n_fields(index),
+ dict_table_zip_size(index->table))) {
+ ulint i;
+ ulint longest = 0;
+ ulint longest_i = ULINT_MAX;
+ byte* data;
+ big_rec_field_t* b;
+
+ for (i = dict_index_get_n_unique_in_tree(index);
+ i < dtuple_get_n_fields(entry); i++) {
+ ulint savings;
+
+ dfield = dtuple_get_nth_field(entry, i);
+ ifield = dict_index_get_nth_field(index, i);
+
+ /* Skip fixed-length, NULL, externally stored,
+ or short columns */
+
+ if (ifield->fixed_len
+ || dfield_is_null(dfield)
+ || dfield_is_ext(dfield)
+ || dfield_get_len(dfield) <= local_len
+ || dfield_get_len(dfield)
+ <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+ goto skip_field;
+ }
+
+ savings = dfield_get_len(dfield) - local_len;
+
+ /* Check that there would be savings */
+ if (longest >= savings) {
+ goto skip_field;
+ }
+
+ /* In DYNAMIC and COMPRESSED format, store
+ locally any non-BLOB columns whose maximum
+ length does not exceed 256 bytes. This is
+ because there is no room for the "external
+ storage" flag when the maximum length is 255
+ bytes or less. This restriction trivially
+ holds in REDUNDANT and COMPACT format, because
+ there we always store locally columns whose
+ length is up to local_len == 788 bytes.
+ @see rec_init_offsets_comp_ordinary */
+ if (ifield->col->mtype != DATA_BLOB
+ && ifield->col->len < 256) {
+ goto skip_field;
+ }
+
+ longest_i = i;
+ longest = savings;
+
+skip_field:
+ continue;
+ }
+
+ if (!longest) {
+ /* Cannot shorten more */
+
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ /* Move data from field longest_i to big rec vector.
+
+ We store the first bytes locally to the record. Then
+ we can calculate all ordering fields in all indexes
+ from locally stored data. */
+
+ dfield = dtuple_get_nth_field(entry, longest_i);
+ ifield = dict_index_get_nth_field(index, longest_i);
+ local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ b = &vector->fields[n_fields];
+ b->field_no = longest_i;
+ b->len = dfield_get_len(dfield) - local_prefix_len;
+ b->data = (char*) dfield_get_data(dfield) + local_prefix_len;
+
+ /* Allocate the locally stored part of the column. */
+ data = mem_heap_alloc(heap, local_len);
+
+ /* Copy the local prefix. */
+ memcpy(data, dfield_get_data(dfield), local_prefix_len);
+ /* Clear the extern field reference (BLOB pointer). */
+ memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
+#if 0
+ /* The following would fail the Valgrind checks in
+ page_cur_insert_rec_low() and page_cur_insert_rec_zip().
+ The BLOB pointers in the record will be initialized after
+ the record and the BLOBs have been written. */
+ UNIV_MEM_ALLOC(data + local_prefix_len,
+ BTR_EXTERN_FIELD_REF_SIZE);
+#endif
+
+ dfield_set_data(dfield, data, local_len);
+ dfield_set_ext(dfield);
+
+ n_fields++;
+ (*n_ext)++;
+ ut_ad(n_fields < dtuple_get_n_fields(entry));
+ }
+
+ vector->n_fields = n_fields;
+ return(vector);
+}
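+/* Worked example of the conversion above (a sketch, assuming the
+Antelope limits DICT_MAX_INDEX_COL_LEN == 768 and
+BTR_EXTERN_FIELD_REF_SIZE == 20): for a 10000-byte value in a
+REDUNDANT or COMPACT clustered index entry, local_len == 788 and
+local_prefix_len == 768, so b->len == 10000 - 768 == 9232 bytes move
+to the big record vector while a 768-byte prefix plus a zero-filled
+20-byte BLOB pointer (788 bytes) stay in the in-page record. In a
+DYNAMIC or COMPRESSED table local_len == 20, so only the BLOB pointer
+stays locally and all 10000 bytes go to the vector. */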
+
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index __attribute__((unused)), /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ big_rec_field_t* b = vector->fields;
+ const big_rec_field_t* const end = b + vector->n_fields;
+
+ for (; b < end; b++) {
+ dfield_t* dfield;
+ ulint local_len;
+
+ dfield = dtuple_get_nth_field(entry, b->field_no);
+ local_len = dfield_get_len(dfield);
+
+ ut_ad(dfield_is_ext(dfield));
+ ut_ad(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ ut_ad(local_len <= DICT_MAX_INDEX_COL_LEN);
+
+ dfield_set_data(dfield,
+ (char*) b->data - local_len,
+ b->len + local_len);
+ }
+
+ mem_heap_free(vector->heap);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/data/data0type.c b/storage/xtradb/data/data0type.c
new file mode 100644
index 00000000000..e834fd2ec55
--- /dev/null
+++ b/storage/xtradb/data/data0type.c
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file data/data0type.c
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+
+#ifdef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/* At the database startup we store the default-charset collation number of
+this MySQL installation to this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+UNIV_INTERN ulint data_mysql_default_charset_coll;
+
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of a
+ multi-byte character */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multi-byte character */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str) /*!< in: the string whose prefix
+ length is being determined */
+{
+ ut_a(data_len != UNIV_SQL_NULL);
+ ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen));
+
+ if (mbminlen != mbmaxlen) {
+ ut_a(!(prefix_len % mbmaxlen));
+ return(innobase_get_at_most_n_mbchars(
+ dtype_get_charset_coll(prtype),
+ prefix_len, data_len, str));
+ }
+
+ if (prefix_len < data_len) {
+
+ return(prefix_len);
+
+ }
+
+ return(data_len);
+}
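+/* For example (a sketch): for a latin1 column (mbminlen == mbmaxlen
+== 1) the multi-byte branch is skipped and the result is simply the
+smaller of prefix_len and data_len. For a utf8 column (mbminlen == 1,
+mbmaxlen == 3), a 10-character prefix is requested as prefix_len
+== 30, and innobase_get_at_most_n_mbchars() trims that 30-byte budget
+to the byte length of the first 10 characters actually present in
+str. */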
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type.
+@return TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+ ulint mtype) /*!< in: InnoDB main data type code: DATA_CHAR, ... */
+{
+ if (mtype <= DATA_BLOB
+ || mtype == DATA_MYSQL
+ || mtype == DATA_VARMYSQL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE.
+@return TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype) /*!< in: precise type */
+{
+ if ((mtype == DATA_FIXBINARY)
+ || (mtype == DATA_BINARY)
+ || (mtype == DATA_BLOB && (prtype & DATA_BINARY_TYPE))) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE.
+@return TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype) /*!< in: precise type */
+{
+ if (dtype_is_string_type(mtype) == TRUE
+ && dtype_is_binary_string_type(mtype, prtype) == FALSE) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+ ulint old_prtype, /*!< in: the MySQL type code and the flags
+ DATA_BINARY_TYPE etc. */
+ ulint charset_coll) /*!< in: MySQL charset-collation code */
+{
+ ut_a(old_prtype < 256 * 256);
+ ut_a(charset_coll < 256);
+
+ return(old_prtype + (charset_coll << 16));
+}
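+/* For example (a sketch): an old-format CHAR column whose precise
+type is just the MySQL type code 254, combined with charset-collation
+code 33, yields dtype_form_prtype(254, 33) == 254 + (33 << 16)
+== 2162942, so the collation ends up in bits 16..23 of the new-format
+precise type. */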
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type) /*!< in: type struct to validate */
+{
+ ut_a(type);
+ ut_a(type->mtype >= DATA_VARCHAR);
+ ut_a(type->mtype <= DATA_MYSQL);
+
+ if (type->mtype == DATA_SYS) {
+ ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ ut_a(type->mbminlen <= type->mbmaxlen);
+#endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Prints a data type structure. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+ const dtype_t* type) /*!< in: type */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint len;
+
+ ut_a(type);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+
+ switch (mtype) {
+ case DATA_VARCHAR:
+ fputs("DATA_VARCHAR", stderr);
+ break;
+
+ case DATA_CHAR:
+ fputs("DATA_CHAR", stderr);
+ break;
+
+ case DATA_BINARY:
+ fputs("DATA_BINARY", stderr);
+ break;
+
+ case DATA_FIXBINARY:
+ fputs("DATA_FIXBINARY", stderr);
+ break;
+
+ case DATA_BLOB:
+ fputs("DATA_BLOB", stderr);
+ break;
+
+ case DATA_INT:
+ fputs("DATA_INT", stderr);
+ break;
+
+ case DATA_MYSQL:
+ fputs("DATA_MYSQL", stderr);
+ break;
+
+ case DATA_SYS:
+ fputs("DATA_SYS", stderr);
+ break;
+
+ case DATA_FLOAT:
+ fputs("DATA_FLOAT", stderr);
+ break;
+
+ case DATA_DOUBLE:
+ fputs("DATA_DOUBLE", stderr);
+ break;
+
+ case DATA_DECIMAL:
+ fputs("DATA_DECIMAL", stderr);
+ break;
+
+ case DATA_VARMYSQL:
+ fputs("DATA_VARMYSQL", stderr);
+ break;
+
+ default:
+ fprintf(stderr, "type %lu", (ulong) mtype);
+ break;
+ }
+
+ len = type->len;
+
+ if ((type->mtype == DATA_SYS)
+ || (type->mtype == DATA_VARCHAR)
+ || (type->mtype == DATA_CHAR)) {
+ putc(' ', stderr);
+ if (prtype == DATA_ROW_ID) {
+ fputs("DATA_ROW_ID", stderr);
+ len = DATA_ROW_ID_LEN;
+ } else if (prtype == DATA_ROLL_PTR) {
+ fputs("DATA_ROLL_PTR", stderr);
+ len = DATA_ROLL_PTR_LEN;
+ } else if (prtype == DATA_TRX_ID) {
+ fputs("DATA_TRX_ID", stderr);
+ len = DATA_TRX_ID_LEN;
+ } else if (prtype == DATA_ENGLISH) {
+ fputs("DATA_ENGLISH", stderr);
+ } else {
+ fprintf(stderr, "prtype %lu", (ulong) prtype);
+ }
+ } else {
+ if (prtype & DATA_UNSIGNED) {
+ fputs(" DATA_UNSIGNED", stderr);
+ }
+
+ if (prtype & DATA_BINARY_TYPE) {
+ fputs(" DATA_BINARY_TYPE", stderr);
+ }
+
+ if (prtype & DATA_NOT_NULL) {
+ fputs(" DATA_NOT_NULL", stderr);
+ }
+ }
+
+ fprintf(stderr, " len %lu", (ulong) len);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0boot.c b/storage/xtradb/dict/dict0boot.c
new file mode 100644
index 00000000000..43cfced65a0
--- /dev/null
+++ b/storage/xtradb/dict/dict0boot.c
@@ -0,0 +1,549 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0boot.c
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0boot.h"
+
+#ifdef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#include "dict0crea.h"
+#include "btr0btr.h"
+#include "dict0load.h"
+#include "dict0load.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/**********************************************************************//**
+Gets a pointer to the dictionary header and x-latches its page.
+@return pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ dict_hdr_t* header;
+
+ block = buf_page_get(DICT_HDR_SPACE, 0, DICT_HDR_PAGE_NO,
+ RW_X_LATCH, mtr);
+ header = DICT_HDR + buf_block_get_frame(block);
+
+ buf_block_dbg_add_level(block, SYNC_DICT_HEADER);
+
+ return(header);
+}
+
+/**********************************************************************//**
+Returns a new table id, index id, or space id. */
+UNIV_INTERN
+void
+dict_hdr_get_new_id(
+/*================*/
+ dulint* table_id, /*!< out: table id (not assigned if NULL) */
+ dulint* index_id, /*!< out: index id (not assigned if NULL) */
+ ulint* space_id) /*!< out: space id (not assigned if NULL) */
+{
+ dict_hdr_t* dict_hdr;
+ dulint id;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ dict_hdr = dict_hdr_get(&mtr);
+
+ if (table_id) {
+ id = mtr_read_dulint(dict_hdr + DICT_HDR_TABLE_ID, &mtr);
+ id = ut_dulint_add(id, 1);
+ mlog_write_dulint(dict_hdr + DICT_HDR_TABLE_ID, id, &mtr);
+ *table_id = id;
+ }
+
+ if (index_id) {
+ id = mtr_read_dulint(dict_hdr + DICT_HDR_INDEX_ID, &mtr);
+ id = ut_dulint_add(id, 1);
+ mlog_write_dulint(dict_hdr + DICT_HDR_INDEX_ID, id, &mtr);
+ *index_id = id;
+ }
+
+ if (space_id) {
+ *space_id = mtr_read_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
+ MLOG_4BYTES, &mtr);
+ if (fil_assign_new_space_id(space_id)) {
+ mlog_write_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
+ *space_id, MLOG_4BYTES, &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+}
+
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void)
+/*=======================*/
+{
+ dict_hdr_t* dict_hdr;
+ dulint id;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ id = dict_sys->row_id;
+
+ mtr_start(&mtr);
+
+ dict_hdr = dict_hdr_get(&mtr);
+
+ mlog_write_dulint(dict_hdr + DICT_HDR_ROW_ID, id, &mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Creates the file page for the dictionary header. This function is
+called only at the database creation.
+@return TRUE on success */
+static
+ibool
+dict_hdr_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ dict_hdr_t* dict_header;
+ ulint root_page_no;
+
+ ut_ad(mtr);
+
+ /* Create the dictionary header file block in a new, allocated file
+ segment in the system tablespace */
+ block = fseg_create(DICT_HDR_SPACE, 0,
+ DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
+
+ ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block));
+
+ dict_header = dict_hdr_get(mtr);
+
+ /* Start counting row, table, index, and tree ids from
+ DICT_HDR_FIRST_ID */
+ mlog_write_dulint(dict_header + DICT_HDR_ROW_ID,
+ ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);
+
+ mlog_write_dulint(dict_header + DICT_HDR_TABLE_ID,
+ ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);
+
+ mlog_write_dulint(dict_header + DICT_HDR_INDEX_ID,
+ ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);
+
+ mlog_write_ulint(dict_header + DICT_HDR_MAX_SPACE_ID,
+ 0, MLOG_4BYTES, mtr);
+
+ /* Obsolete, but we must initialize it anyway. */
+ mlog_write_ulint(dict_header + DICT_HDR_MIX_ID_LOW,
+ DICT_HDR_FIRST_ID, MLOG_4BYTES, mtr);
+
+ /* Create the B-tree roots for the clustered indexes of the basic
+ system tables */
+
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_TABLES_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0,
+ DICT_TABLE_IDS_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_COLUMNS_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_INDEXES_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_FIELDS_ID,
+ dict_ind_redundant, mtr);
+ if (root_page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no,
+ MLOG_4BYTES, mtr);
+ /*--------------------------*/
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created. */
+UNIV_INTERN
+void
+dict_boot(void)
+/*===========*/
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ dict_hdr_t* dict_hdr;
+ mem_heap_t* heap;
+ mtr_t mtr;
+ ulint error;
+
+ mtr_start(&mtr);
+
+ /* Create the hash tables etc. */
+ dict_init();
+
+ heap = mem_heap_create(450);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ /* Get the dictionary header */
+ dict_hdr = dict_hdr_get(&mtr);
+
+ if (ut_dulint_cmp(mtr_read_dulint(dict_hdr + DICT_HDR_XTRADB_MARK, &mtr),
+ DICT_HDR_XTRADB_FLAG) != 0) {
+		/* not yet extended by XtraDB: extend the header now */
+ ulint root_page_no;
+
+ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+ DICT_HDR_SPACE, 0, DICT_STATS_ID,
+ dict_ind_redundant, &mtr);
+ if (root_page_no == FIL_NULL) {
+ fprintf(stderr, "InnoDB: Warning: failed to create SYS_STATS btr.\n");
+ srv_use_sys_stats_table = FALSE;
+ } else {
+ mlog_write_ulint(dict_hdr + DICT_HDR_STATS, root_page_no,
+ MLOG_4BYTES, &mtr);
+ mlog_write_dulint(dict_hdr + DICT_HDR_XTRADB_MARK,
+ DICT_HDR_XTRADB_FLAG, &mtr);
+ }
+ mtr_commit(&mtr);
+ /* restart mtr */
+ mtr_start(&mtr);
+ dict_hdr = dict_hdr_get(&mtr);
+ }
+
+ /* Because we only write new row ids to disk-based data structure
+ (dictionary header) when it is divisible by
+ DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover
+ the latest value of the row id counter. Therefore we advance
+ the counter at the database startup to avoid overlapping values.
+ Note that when a user after database startup first time asks for
+ a new row id, then because the counter is now divisible by
+ ..._MARGIN, it will immediately be updated to the disk-based
+ header. */
+
+ dict_sys->row_id = ut_dulint_add(
+ ut_dulint_align_up(mtr_read_dulint(dict_hdr + DICT_HDR_ROW_ID,
+ &mtr),
+ DICT_HDR_ROW_ID_WRITE_MARGIN),
+ DICT_HDR_ROW_ID_WRITE_MARGIN);
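+	/* For example, assuming DICT_HDR_ROW_ID_WRITE_MARGIN == 256: if
+	the row id stored in the header is 250128, the in-memory counter
+	restarts at ut_dulint_align_up(250128, 256) + 256
+	== 250368 + 256 == 250624, safely above any row id that was
+	handed out but not yet flushed before shutdown or a crash. */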
+
+ /* Insert into the dictionary cache the descriptions of the basic
+ system tables */
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0);
+ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+ /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
+ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
+ /* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT)
+ and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+ /* MIX_LEN may contain additional table flags when
+ ROW_FORMAT!=REDUNDANT. Currently, these flags include
+ DICT_TF2_TEMPORARY. */
+ dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+
+ table->id = DICT_TABLES_ID;
+
+ dict_table_add_to_cache(table, heap);
+ dict_sys->sys_tables = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_TABLES", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 1);
+
+ dict_mem_index_add_field(index, "NAME", 0);
+
+ index->id = DICT_TABLES_ID;
+
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_TABLES,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ index = dict_mem_index_create("SYS_TABLES", "ID_IND",
+ DICT_HDR_SPACE, DICT_UNIQUE, 1);
+ dict_mem_index_add_field(index, "ID", 0);
+
+ index->id = DICT_TABLE_IDS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_TABLE_IDS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0);
+ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4);
+
+ table->id = DICT_COLUMNS_ID;
+
+ dict_table_add_to_cache(table, heap);
+ dict_sys->sys_columns = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "TABLE_ID", 0);
+ dict_mem_index_add_field(index, "POS", 0);
+
+ index->id = DICT_COLUMNS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_COLUMNS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0);
+ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
+
+ /* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
+#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
+#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
+#endif
+#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
+#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
+#endif
+#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
+#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
+#endif
+#if DICT_SYS_INDEXES_NAME_FIELD != 2 + 2
+#error "DICT_SYS_INDEXES_NAME_FIELD != 2 + 2"
+#endif
+
+ table->id = DICT_INDEXES_ID;
+ dict_table_add_to_cache(table, heap);
+ dict_sys->sys_indexes = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "TABLE_ID", 0);
+ dict_mem_index_add_field(index, "ID", 0);
+
+ index->id = DICT_INDEXES_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_INDEXES,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0);
+ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
+
+ table->id = DICT_FIELDS_ID;
+ dict_table_add_to_cache(table, heap);
+ dict_sys->sys_fields = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "INDEX_ID", 0);
+ dict_mem_index_add_field(index, "POS", 0);
+
+ index->id = DICT_FIELDS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_FIELDS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_STATS", DICT_HDR_SPACE, 3, 0);
+ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
+ dict_mem_table_add_col(table, heap, "KEY_COLS", DATA_INT, 0, 4);
+ dict_mem_table_add_col(table, heap, "DIFF_VALS", DATA_BINARY, 0, 0);
+
+ /* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
+#if DICT_SYS_STATS_DIFF_VALS_FIELD != 2 + 2
+#error "DICT_SYS_STATS_DIFF_VALS_FIELD != 2 + 2"
+#endif
+
+ table->id = DICT_STATS_ID;
+ dict_table_add_to_cache(table, heap);
+ dict_sys->sys_stats = table;
+ mem_heap_empty(heap);
+
+ index = dict_mem_index_create("SYS_STATS", "CLUST_IND",
+ DICT_HDR_SPACE,
+ DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+ dict_mem_index_add_field(index, "INDEX_ID", 0);
+ dict_mem_index_add_field(index, "KEY_COLS", 0);
+
+ index->id = DICT_STATS_ID;
+ error = dict_index_add_to_cache(table, index,
+ mtr_read_ulint(dict_hdr
+ + DICT_HDR_STATS,
+ MLOG_4BYTES, &mtr),
+ FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ mem_heap_free(heap);
+
+ mtr_commit(&mtr);
+ /*-------------------------*/
+
+ /* Initialize the insert buffer table and index for each tablespace */
+
+ ibuf_init_at_db_start();
+
+ /* Load definitions of other indexes on system tables */
+
+ dict_load_sys_table(dict_sys->sys_tables);
+ dict_load_sys_table(dict_sys->sys_columns);
+ dict_load_sys_table(dict_sys->sys_indexes);
+ dict_load_sys_table(dict_sys->sys_fields);
+ dict_load_sys_table(dict_sys->sys_stats);
+
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/*****************************************************************//**
+Inserts the basic system table data into the system tables themselves
+at database creation. */
+static
+void
+dict_insert_initial_data(void)
+/*==========================*/
+{
+ /* Does nothing yet */
+}
+
+/*****************************************************************//**
+Creates and initializes the data dictionary at the database creation. */
+UNIV_INTERN
+void
+dict_create(void)
+/*=============*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ dict_hdr_create(&mtr);
+
+ mtr_commit(&mtr);
+
+ dict_boot();
+
+ dict_insert_initial_data();
+}
diff --git a/storage/xtradb/dict/dict0crea.c b/storage/xtradb/dict/dict0crea.c
new file mode 100644
index 00000000000..a6d0e11740a
--- /dev/null
+++ b/storage/xtradb/dict/dict0crea.c
@@ -0,0 +1,1733 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0crea.c
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0crea.h"
+
+#ifdef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "pars0pars.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "ut0vec.h"
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_TABLES system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_tables_tuple(
+/*=========================*/
+ const dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_tables;
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+
+ ut_ad(table);
+ ut_ad(heap);
+
+ sys_tables = dict_sys->sys_tables;
+
+ entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_tables);
+
+ /* 0: NAME -----------------------------*/
+ dfield = dtuple_get_nth_field(entry, 0/*NAME*/);
+
+ dfield_set_data(dfield, table->name, ut_strlen(table->name));
+ /* 3: ID -------------------------------*/
+ dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+ /* 4: N_COLS ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);
+
+#if DICT_TF_COMPACT != 1
+#error "DICT_TF_COMPACT != 1"
+#endif
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, table->n_def
+ | ((table->flags & DICT_TF_COMPACT) << 31));
+ dfield_set_data(dfield, ptr, 4);
+ /* 5: TYPE -----------------------------*/
+ dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) {
+ ut_a(table->flags & DICT_TF_COMPACT);
+ ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+ ut_a(((ulonglong) table->flags & DICT_TF_ZSSIZE_MASK)
+ <= (ulonglong) (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
+ ut_a(!(table->flags & (~0 << DICT_TF2_BITS)));
+ mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS));
+ } else {
+ mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
+ }
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 6: MIX_ID (obsolete) ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);
+
+ ptr = mem_heap_zalloc(heap, 8);
+
+ dfield_set_data(dfield, ptr, 8);
+ /* 7: MIX_LEN (additional flags) --------------------------*/
+
+ dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 8: CLUSTER_NAME ---------------------*/
+ dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
+ dfield_set_null(dfield); /* not supported */
+
+ /* 9: SPACE ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, table->space);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*----------------------------------*/
+
+ return(entry);
+}
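+/* Encoding example for the tuple built above (a sketch): a
+ROW_FORMAT=COMPACT table with 5 user columns stores
+N_COLS == 5 | (DICT_TF_COMPACT << 31) == 0x80000005; the same table in
+ROW_FORMAT=REDUNDANT stores plain 5. A table that uses no features
+beyond COMPACT stores TYPE == DICT_TABLE_ORDINARY, while tables with
+additional flags (compression etc.) store table->flags masked to
+DICT_TF_BITS. */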
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_COLUMNS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_columns_tuple(
+/*==========================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint i, /*!< in: column number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_columns;
+ dtuple_t* entry;
+ const dict_col_t* column;
+ dfield_t* dfield;
+ byte* ptr;
+ const char* col_name;
+
+ ut_ad(table);
+ ut_ad(heap);
+
+ column = dict_table_get_nth_col(table, i);
+
+ sys_columns = dict_sys->sys_columns;
+
+ entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_columns);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
+
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+ /* 1: POS ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, 1/*POS*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, i);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 4: NAME ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
+
+ col_name = dict_table_get_col_name(table, i);
+ dfield_set_data(dfield, col_name, ut_strlen(col_name));
+ /* 5: MTYPE --------------------------*/
+ dfield = dtuple_get_nth_field(entry, 3/*MTYPE*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, column->mtype);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 6: PRTYPE -------------------------*/
+ dfield = dtuple_get_nth_field(entry, 4/*PRTYPE*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, column->prtype);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 7: LEN ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, 5/*LEN*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, column->len);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 8: PREC ---------------------------*/
+ dfield = dtuple_get_nth_field(entry, 6/*PREC*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, 0/* unused */);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*---------------------------------*/
+
+ return(entry);
+}
+
+/***************************************************************//**
+Builds a table definition to insert.
+@return DB_SUCCESS or error code */
+static
+ulint
+dict_build_table_def_step(
+/*======================*/
+ que_thr_t* thr, /*!< in: query thread */
+ tab_node_t* node) /*!< in: table create node */
+{
+ dict_table_t* table;
+ dtuple_t* row;
+ ulint error;
+ ulint flags;
+ const char* path_or_name;
+ ibool is_path;
+ mtr_t mtr;
+ ulint space = 0;
+ ibool file_per_table;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = node->table;
+
+ /* Cache the global variable "srv_file_per_table" to
+ a local variable before using it. Please note
+ "srv_file_per_table" is not under dict_sys mutex
+ protection, and could be changed while executing
+ this function. So better to cache the current value
+ to a local variable, and all future reference to
+ "srv_file_per_table" should use this local variable. */
+ file_per_table = srv_file_per_table;
+
+ dict_hdr_get_new_id(&table->id, NULL, NULL);
+
+ thr_get_trx(thr)->table_id = table->id;
+
+ if (file_per_table) {
+ /* Get a new space id if srv_file_per_table is set */
+ dict_hdr_get_new_id(NULL, NULL, &space);
+
+ if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) {
+ return(DB_ERROR);
+ }
+
+ /* We create a new single-table tablespace for the table.
+ We initially let it be 4 pages:
+ - page 0 is the fsp header and an extent descriptor page,
+ - page 1 is an ibuf bitmap page,
+ - page 2 is the first inode page,
+ - page 3 will contain the root of the clustered index of the
+ table we create here. */
+
+ if (table->dir_path_of_temp_table) {
+ /* We place tables created with CREATE TEMPORARY
+ TABLE in the tmp dir of mysqld server */
+
+ path_or_name = table->dir_path_of_temp_table;
+ is_path = TRUE;
+ } else {
+ path_or_name = table->name;
+ is_path = FALSE;
+ }
+
+ ut_ad(dict_table_get_format(table) <= DICT_TF_FORMAT_MAX);
+ ut_ad(!dict_table_zip_size(table)
+ || dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+
+ flags = table->flags & ~(~0 << DICT_TF_BITS);
+ error = fil_create_new_single_table_tablespace(
+ space, path_or_name, is_path,
+ flags == DICT_TF_COMPACT ? 0 : flags,
+ FIL_IBD_FILE_INITIAL_SIZE);
+ table->space = (unsigned int) space;
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+
+ mtr_start(&mtr);
+
+ fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+
+ mtr_commit(&mtr);
+ } else {
+ /* Create in the system tablespace: disallow new features */
+ table->flags &= (~0 << DICT_TF_BITS) | DICT_TF_COMPACT;
+ }
+
+ row = dict_create_sys_tables_tuple(table, node->heap);
+
+ ins_node_set_new_row(node->tab_def, row);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds a column definition to insert.
+@return DB_SUCCESS */
+static
+ulint
+dict_build_col_def_step(
+/*====================*/
+ tab_node_t* node) /*!< in: table create node */
+{
+ dtuple_t* row;
+
+ row = dict_create_sys_columns_tuple(node->table, node->col_no,
+ node->heap);
+ ins_node_set_new_row(node->col_def, row);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_INDEXES system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_indexes_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_indexes;
+ dict_table_t* table;
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(index);
+ ut_ad(heap);
+
+ sys_indexes = dict_sys->sys_indexes;
+
+ table = dict_table_get_low(index->table_name);
+
+ entry = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_indexes);
+
+ /* 0: TABLE_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, 0/*TABLE_ID*/);
+
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, table->id);
+
+ dfield_set_data(dfield, ptr, 8);
+ /* 1: ID ----------------------------*/
+ dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, index->id);
+
+ dfield_set_data(dfield, ptr, 8);
+ /* 4: NAME --------------------------*/
+ dfield = dtuple_get_nth_field(entry, 2/*NAME*/);
+
+ dfield_set_data(dfield, index->name, ut_strlen(index->name));
+ /* 5: N_FIELDS ----------------------*/
+ dfield = dtuple_get_nth_field(entry, 3/*N_FIELDS*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, index->n_fields);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 6: TYPE --------------------------*/
+ dfield = dtuple_get_nth_field(entry, 4/*TYPE*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, index->type);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 7: SPACE --------------------------*/
+
+#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 7
+#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7"
+#endif
+
+ dfield = dtuple_get_nth_field(entry, 5/*SPACE*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, index->space);
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 8: PAGE_NO --------------------------*/
+
+#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 8
+#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8"
+#endif
+
+ dfield = dtuple_get_nth_field(entry, 6/*PAGE_NO*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, FIL_NULL);
+
+ dfield_set_data(dfield, ptr, 4);
+ /*--------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_FIELDS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_fields_tuple(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint i, /*!< in: field number */
+ mem_heap_t* heap) /*!< in: memory heap from
+ which the memory for the built
+ tuple is allocated */
+{
+ dict_table_t* sys_fields;
+ dtuple_t* entry;
+ dict_field_t* field;
+ dfield_t* dfield;
+ byte* ptr;
+ ibool index_contains_column_prefix_field = FALSE;
+ ulint j;
+
+ ut_ad(index);
+ ut_ad(heap);
+
+ for (j = 0; j < index->n_fields; j++) {
+ if (dict_index_get_nth_field(index, j)->prefix_len > 0) {
+ index_contains_column_prefix_field = TRUE;
+ break;
+ }
+ }
+
+ field = dict_index_get_nth_field(index, i);
+
+ sys_fields = dict_sys->sys_fields;
+
+ entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_fields);
+
+ /* 0: INDEX_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
+
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, index->id);
+
+ dfield_set_data(dfield, ptr, 8);
+ /* 1: POS + PREFIX LENGTH ----------------------------*/
+
+ dfield = dtuple_get_nth_field(entry, 1/*POS*/);
+
+ ptr = mem_heap_alloc(heap, 4);
+
+ if (index_contains_column_prefix_field) {
+ /* If there are column prefix fields in the index, then
+ we store the number of the field to the 2 HIGH bytes
+		and the prefix length to the 2 LOW bytes. */
+
+ mach_write_to_4(ptr, (i << 16) + field->prefix_len);
+ } else {
+ /* Else we store the number of the field to the 2 LOW bytes.
+ This is to keep the storage format compatible with
+ InnoDB versions < 4.0.14. */
+
+ mach_write_to_4(ptr, i);
+ }
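+	/* For example (a sketch): field number 3 with a 10-byte prefix
+	is stored as (3 << 16) + 10 == 0x0003000A == 196618, while an
+	index without any prefix fields stores plain 3, which servers
+	older than 4.0.14 can still read. */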
+
+ dfield_set_data(dfield, ptr, 4);
+ /* 4: COL_NAME -------------------------*/
+ dfield = dtuple_get_nth_field(entry, 2/*COL_NAME*/);
+
+ dfield_set_data(dfield, field->name,
+ ut_strlen(field->name));
+ /*---------------------------------*/
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Based on an index object, this function builds the entry to be inserted
+in the SYS_STATS system table.
+@return the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_stats_tuple(
+/*========================*/
+ const dict_index_t* index,
+ ulint i,
+ mem_heap_t* heap)
+{
+ dict_table_t* sys_stats;
+ dtuple_t* entry;
+ dfield_t* dfield;
+ byte* ptr;
+
+ ut_ad(index);
+ ut_ad(heap);
+
+ sys_stats = dict_sys->sys_stats;
+
+ entry = dtuple_create(heap, 3 + DATA_N_SYS_COLS);
+
+ dict_table_copy_types(entry, sys_stats);
+
+ /* 0: INDEX_ID -----------------------*/
+ dfield = dtuple_get_nth_field(entry, 0/*INDEX_ID*/);
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, index->id);
+ dfield_set_data(dfield, ptr, 8);
+ /* 1: KEY_COLS -----------------------*/
+ dfield = dtuple_get_nth_field(entry, 1/*KEY_COLS*/);
+ ptr = mem_heap_alloc(heap, 4);
+ mach_write_to_4(ptr, i);
+ dfield_set_data(dfield, ptr, 4);
+ /* 4: DIFF_VALS ----------------------*/
+ dfield = dtuple_get_nth_field(entry, 2/*DIFF_VALS*/);
+ ptr = mem_heap_alloc(heap, 8);
+ mach_write_to_8(ptr, ut_dulint_zero); /* initial value is 0 */
+ dfield_set_data(dfield, ptr, 8);
+
+ return(entry);
+}
+
+/*****************************************************************//**
+Creates the tuple with which the index entry is searched for writing the index
+tree root page number, if such a tree is created.
+@return the tuple for search */
+static
+dtuple_t*
+dict_create_search_tuple(
+/*=====================*/
+ const dtuple_t* tuple, /*!< in: the tuple inserted in the SYS_INDEXES
+ table */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory for
+ the built tuple is allocated */
+{
+ dtuple_t* search_tuple;
+ const dfield_t* field1;
+ dfield_t* field2;
+
+ ut_ad(tuple && heap);
+
+ search_tuple = dtuple_create(heap, 2);
+
+ field1 = dtuple_get_nth_field(tuple, 0);
+ field2 = dtuple_get_nth_field(search_tuple, 0);
+
+ dfield_copy(field2, field1);
+
+ field1 = dtuple_get_nth_field(tuple, 1);
+ field2 = dtuple_get_nth_field(search_tuple, 1);
+
+ dfield_copy(field2, field1);
+
+ ut_ad(dtuple_validate(search_tuple));
+
+ return(search_tuple);
+}
+
+/***************************************************************//**
+Builds an index definition row to insert.
+@return DB_SUCCESS or error code */
+static
+ulint
+dict_build_index_def_step(
+/*======================*/
+ que_thr_t* thr, /*!< in: query thread */
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+ dtuple_t* row;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = thr_get_trx(thr);
+
+ index = node->index;
+
+ table = dict_table_get_low(index->table_name);
+
+ if (table == NULL) {
+ return(DB_TABLE_NOT_FOUND);
+ }
+
+ trx->table_id = table->id;
+
+ node->table = table;
+
+ ut_ad((UT_LIST_GET_LEN(table->indexes) > 0)
+ || dict_index_is_clust(index));
+
+ dict_hdr_get_new_id(NULL, &index->id, NULL);
+
+ /* Inherit the space id from the table; we store all indexes of a
+ table in the same tablespace */
+
+ index->space = table->space;
+ node->page_no = FIL_NULL;
+ row = dict_create_sys_indexes_tuple(index, node->heap);
+ node->ind_row = row;
+
+ ins_node_set_new_row(node->ind_def, row);
+
+ /* Note that the index was created by this transaction. */
+ index->trx_id = (ib_uint64_t) ut_conv_dulint_to_longlong(trx->id);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds a field definition row to insert.
+@return DB_SUCCESS */
+static
+ulint
+dict_build_field_def_step(
+/*======================*/
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_index_t* index;
+ dtuple_t* row;
+
+ index = node->index;
+
+ row = dict_create_sys_fields_tuple(index, node->field_no, node->heap);
+
+ ins_node_set_new_row(node->field_def, row);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds a row for storing stats to insert.
+@return DB_SUCCESS */
+static
+ulint
+dict_build_stats_def_step(
+/*======================*/
+ ind_node_t* node)
+{
+ dict_index_t* index;
+ dtuple_t* row;
+
+ index = node->index;
+
+ row = dict_create_sys_stats_tuple(index, node->stats_no, node->heap);
+
+ ins_node_set_new_row(node->stats_def, row);
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+dict_create_index_tree_step(
+/*========================*/
+ ind_node_t* node) /*!< in: index create node */
+{
+ dict_index_t* index;
+ dict_table_t* sys_indexes;
+ dict_table_t* table;
+ dtuple_t* search_tuple;
+ ulint zip_size;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ index = node->index;
+ table = node->table;
+
+ sys_indexes = dict_sys->sys_indexes;
+
+ /* Run a mini-transaction in which the index tree is allocated for
+ the index and its root address is written to the index entry in
+ sys_indexes */
+
+ mtr_start(&mtr);
+
+ search_tuple = dict_create_search_tuple(node->ind_row, node->heap);
+
+ btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes),
+ search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ zip_size = dict_table_zip_size(index->table);
+
+ node->page_no = btr_create(index->type, index->space, zip_size,
+ index->id, index, &mtr);
+ /* printf("Created a new index tree in space %lu root page %lu\n",
+ index->space, index->page_no); */
+
+ page_rec_write_index_page_no(btr_pcur_get_rec(&pcur),
+ DICT_SYS_INDEXES_PAGE_NO_FIELD,
+ node->page_no, &mtr);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->page_no == FIL_NULL) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /*!< in/out: record in the clustered index
+ of SYS_INDEXES table */
+ mtr_t* mtr) /*!< in: mtr having the latch on the record page */
+{
+ ulint root_page_no;
+ ulint space;
+ ulint zip_size;
+ const byte* ptr;
+ ulint len;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+ ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+
+ ut_ad(len == 4);
+
+ root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+
+ if (root_page_no == FIL_NULL) {
+ /* The tree has already been freed */
+
+ return;
+ }
+
+ ptr = rec_get_nth_field_old(rec,
+ DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+
+ ut_ad(len == 4);
+
+ space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+ zip_size = fil_space_get_zip_size(space);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* It is a single table tablespace and the .ibd file is
+ missing: do nothing */
+
+ return;
+ }
+
+ /* We free all the pages but the root page first; this operation
+ may span several mini-transactions */
+
+ btr_free_but_not_root(space, zip_size, root_page_no);
+
+ /* Then we free the root page in the same mini-transaction where
+ we write FIL_NULL to the appropriate field in the SYS_INDEXES
+ record: this mini-transaction marks the B-tree totally freed */
+
+ /* printf("Dropping index tree in space %lu root page %lu\n", space,
+ root_page_no); */
+ btr_free_root(space, zip_size, root_page_no, mtr);
+
+ page_rec_write_index_page_no(rec,
+ DICT_SYS_INDEXES_PAGE_NO_FIELD,
+ FIL_NULL, mtr);
+}
+
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+@return new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+ dict_table_t* table, /*!< in: the table the index belongs to */
+ ulint space, /*!< in: 0=truncate,
+ nonzero=create the index tree in the
+ given tablespace */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
+ mtr_t* mtr) /*!< in: mtr having the latch
+ on the record page. The mtr may be
+ committed and restarted in this call. */
+{
+ ulint root_page_no;
+ ibool drop = !space;
+ ulint zip_size;
+ ulint type;
+ dulint index_id;
+ rec_t* rec;
+ const byte* ptr;
+ ulint len;
+ dict_index_t* index;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+ rec = btr_pcur_get_rec(pcur);
+ ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+
+ ut_ad(len == 4);
+
+ root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+
+ if (drop && root_page_no == FIL_NULL) {
+ /* The tree has been freed. */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Trying to TRUNCATE"
+ " a missing index of table %s!\n", table->name);
+ drop = FALSE;
+ }
+
+ ptr = rec_get_nth_field_old(rec,
+ DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+
+ ut_ad(len == 4);
+
+ if (drop) {
+ space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+ }
+
+ zip_size = fil_space_get_zip_size(space);
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+ /* It is a single table tablespace and the .ibd file is
+ missing: do nothing */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Trying to TRUNCATE"
+ " a missing .ibd file of table %s!\n", table->name);
+ return(FIL_NULL);
+ }
+
+ ptr = rec_get_nth_field_old(rec,
+ DICT_SYS_INDEXES_TYPE_FIELD, &len);
+ ut_ad(len == 4);
+ type = mach_read_from_4(ptr);
+
+ ptr = rec_get_nth_field_old(rec, 1, &len);
+ ut_ad(len == 8);
+ index_id = mach_read_from_8(ptr);
+
+ if (!drop) {
+
+ goto create;
+ }
+
+ /* We free all the pages but the root page first; this operation
+ may span several mini-transactions */
+
+ btr_free_but_not_root(space, zip_size, root_page_no);
+
+ /* Then we free the root page in the same mini-transaction where
+ we create the b-tree and write its new root page number to the
+ appropriate field in the SYS_INDEXES record: this mini-transaction
+ marks the B-tree totally truncated */
+
+ btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+
+ btr_free_root(space, zip_size, root_page_no, mtr);
+create:
+ /* We will temporarily write FIL_NULL to the PAGE_NO field
+ in SYS_INDEXES, so that the database will not get into an
+ inconsistent state in case it crashes between the mtr_commit()
+ below and the following mtr_commit() call. */
+ page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+ FIL_NULL, mtr);
+
+ /* We will need to commit the mini-transaction in order to avoid
+ deadlocks in the btr_create() call, because otherwise we would
+ be freeing and allocating pages in the same mini-transaction. */
+ btr_pcur_store_position(pcur, mtr);
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ /* Find the index corresponding to this SYS_INDEXES record. */
+ for (index = UT_LIST_GET_FIRST(table->indexes);
+ index;
+ index = UT_LIST_GET_NEXT(indexes, index)) {
+ if (!ut_dulint_cmp(index->id, index_id)) {
+ root_page_no = btr_create(type, space, zip_size,
+ index_id, index, mtr);
+ index->page = (unsigned int) root_page_no;
+ return(root_page_no);
+ }
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Index %lu %lu of table %s is missing\n"
+ "InnoDB: from the data dictionary during TRUNCATE!\n",
+ ut_dulint_get_high(index_id),
+ ut_dulint_get_low(index_id),
+ table->name);
+
+ return(FIL_NULL);
+}
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ tab_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(tab_node_t));
+
+ node->common.type = QUE_NODE_CREATE_TABLE;
+
+ node->table = table;
+
+ node->state = TABLE_BUILD_TABLE_DEF;
+ node->heap = mem_heap_create(256);
+
+ node->tab_def = ins_node_create(INS_DIRECT, dict_sys->sys_tables,
+ heap);
+ node->tab_def->common.parent = node;
+
+ node->col_def = ins_node_create(INS_DIRECT, dict_sys->sys_columns,
+ heap);
+ node->col_def->common.parent = node;
+
+ node->commit_node = commit_node_create(heap);
+ node->commit_node->common.parent = node;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Creates an index create graph.
+@return own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+ dict_index_t* index, /*!< in: index to create, built as a memory data
+ structure */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ ind_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ind_node_t));
+
+ node->common.type = QUE_NODE_CREATE_INDEX;
+
+ node->index = index;
+
+ node->state = INDEX_BUILD_INDEX_DEF;
+ node->page_no = FIL_NULL;
+ node->heap = mem_heap_create(256);
+
+ node->ind_def = ins_node_create(INS_DIRECT,
+ dict_sys->sys_indexes, heap);
+ node->ind_def->common.parent = node;
+
+ node->field_def = ins_node_create(INS_DIRECT,
+ dict_sys->sys_fields, heap);
+ node->field_def->common.parent = node;
+
+ if (srv_use_sys_stats_table) {
+ node->stats_def = ins_node_create(INS_DIRECT,
+ dict_sys->sys_stats, heap);
+ node->stats_def->common.parent = node;
+ } else {
+ node->stats_def = NULL;
+ }
+
+ node->commit_node = commit_node_create(heap);
+ node->commit_node->common.parent = node;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Creates an insert-stats graph for an index, used to insert rows for
+the index into the SYS_STATS system table.
+@return own: index insert stats node */
+UNIV_INTERN
+ind_node_t*
+ind_insert_stats_graph_create(
+/*==========================*/
+ dict_index_t* index,
+ mem_heap_t* heap)
+{
+ ind_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ind_node_t));
+
+ node->common.type = QUE_NODE_INSERT_STATS;
+
+ node->index = index;
+
+ node->state = INDEX_BUILD_STATS_COLS;
+ node->page_no = FIL_NULL;
+ node->heap = mem_heap_create(256);
+
+ node->ind_def = NULL;
+ node->field_def = NULL;
+
+ node->stats_def = ins_node_create(INS_DIRECT,
+ dict_sys->sys_stats, heap);
+ node->stats_def->common.parent = node;
+ node->stats_no = 0;
+
+ node->commit_node = commit_node_create(heap);
+ node->commit_node->common.parent = node;
+
+ return(node);
+}
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ tab_node_t* node;
+ ulint err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = TABLE_BUILD_TABLE_DEF;
+ }
+
+ if (node->state == TABLE_BUILD_TABLE_DEF) {
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = dict_build_table_def_step(thr, node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = TABLE_BUILD_COL_DEF;
+ node->col_no = 0;
+
+ thr->run_node = node->tab_def;
+
+ return(thr);
+ }
+
+ if (node->state == TABLE_BUILD_COL_DEF) {
+
+ if (node->col_no < (node->table)->n_def) {
+
+ err = dict_build_col_def_step(node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->col_no++;
+
+ thr->run_node = node->col_def;
+
+ return(thr);
+ } else {
+ node->state = TABLE_COMMIT_WORK;
+ }
+ }
+
+ if (node->state == TABLE_COMMIT_WORK) {
+
+ /* Table was correctly defined: do NOT commit the transaction
+ (CREATE TABLE does NOT do an implicit commit of the current
+ transaction) */
+
+ node->state = TABLE_ADD_TO_CACHE;
+
+ /* thr->run_node = node->commit_node;
+
+ return(thr); */
+ }
+
+ if (node->state == TABLE_ADD_TO_CACHE) {
+
+ dict_table_add_to_cache(node->table, node->heap);
+
+ err = DB_SUCCESS;
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ind_node_t* node;
+ ulint err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = INDEX_BUILD_INDEX_DEF;
+ }
+
+ if (node->state == INDEX_BUILD_INDEX_DEF) {
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+ err = dict_build_index_def_step(thr, node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->state = INDEX_BUILD_FIELD_DEF;
+ node->field_no = 0;
+ node->stats_no = 0;
+
+ thr->run_node = node->ind_def;
+
+ return(thr);
+ }
+
+ if (node->state == INDEX_BUILD_FIELD_DEF) {
+
+ if (node->field_no < (node->index)->n_fields) {
+
+ err = dict_build_field_def_step(node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->field_no++;
+
+ thr->run_node = node->field_def;
+
+ return(thr);
+ } else {
+ node->state = INDEX_ADD_TO_CACHE;
+ }
+ }
+
+ if (node->state == INDEX_ADD_TO_CACHE) {
+
+ dulint index_id = node->index->id;
+
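+		/* dict_index_add_to_cache() builds its own internal copy of
+		the index and frees the object passed in, so remember the id
+		and look the cached copy up again afterwards; on failure the
+		lookup returns NULL. */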
+ err = dict_index_add_to_cache(
+ node->table, node->index, FIL_NULL,
+ trx_is_strict(trx)
+ || dict_table_get_format(node->table)
+ >= DICT_TF_FORMAT_ZIP);
+
+ node->index = dict_index_get_if_in_cache_low(index_id);
+ ut_a(!node->index == (err != DB_SUCCESS));
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ if (srv_use_sys_stats_table
+ && !((node->table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) {
+ node->state = INDEX_BUILD_STATS_COLS;
+ } else {
+ node->state = INDEX_CREATE_INDEX_TREE;
+ }
+ }
+ if (node->state == INDEX_BUILD_STATS_COLS) {
+ if (node->stats_no <= dict_index_get_n_unique(node->index)) {
+
+ err = dict_build_stats_def_step(node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->stats_no++;
+
+ thr->run_node = node->stats_def;
+
+ return(thr);
+ } else {
+ node->state = INDEX_CREATE_INDEX_TREE;
+ }
+ }
+
+ if (node->state == INDEX_CREATE_INDEX_TREE) {
+
+ err = dict_create_index_tree_step(node);
+
+ if (err != DB_SUCCESS) {
+ dict_index_remove_from_cache(node->table, node->index);
+ node->index = NULL;
+
+ goto function_exit;
+ }
+
+ node->index->page = node->page_no;
+ node->state = INDEX_COMMIT_WORK;
+ }
+
+ if (node->state == INDEX_COMMIT_WORK) {
+
+ /* Index was correctly defined: do NOT commit the transaction
+ (CREATE INDEX does NOT currently do an implicit commit of
+ the current transaction) */
+
+ node->state = INDEX_CREATE_INDEX_TREE;
+
+ /* thr->run_node = node->commit_node;
+
+ return(thr); */
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ /* Ok: do nothing */
+
+ } else if (err == DB_LOCK_WAIT) {
+
+ return(NULL);
+ } else {
+ /* SQL error detected */
+
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/****************************************************************//**
+Inserts the per-index statistics rows into SYS_STATS. This is a high-level
+function used in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_insert_stats_step(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ind_node_t* node;
+ ulint err = DB_ERROR;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = INDEX_BUILD_STATS_COLS;
+ }
+
+ if (node->state == INDEX_BUILD_STATS_COLS) {
+ if (node->stats_no <= dict_index_get_n_unique(node->index)) {
+
+ err = dict_build_stats_def_step(node);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->stats_no++;
+
+ thr->run_node = node->stats_def;
+
+ return(thr);
+ } else {
+ node->state = INDEX_COMMIT_WORK;
+ }
+ }
+
+ if (node->state == INDEX_COMMIT_WORK) {
+
+ /* do not commit transaction here for now */
+ }
+
+function_exit:
+ trx->error_state = err;
+
+ if (err == DB_SUCCESS) {
+ } else {
+ return(NULL);
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at database creation or database start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_create_or_check_foreign_constraint_tables(void)
+/*================================================*/
+{
+ dict_table_t* table1;
+ dict_table_t* table2;
+ ulint error;
+ trx_t* trx;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table1 = dict_table_get_low("SYS_FOREIGN");
+ table2 = dict_table_get_low("SYS_FOREIGN_COLS");
+
+ if (table1 && table2
+ && UT_LIST_GET_LEN(table1->indexes) == 3
+ && UT_LIST_GET_LEN(table2->indexes) == 1) {
+
+ /* Foreign constraint system tables have already been
+ created, and they are ok */
+
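+		/* Pin the tables by faking an open MySQL handle, so that
+		dict_table_LRU_trim() never evicts them from the cache. */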
+ table1->n_mysql_handles_opened = 1; /* for pin */
+ table2->n_mysql_handles_opened = 1; /* for pin */
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(DB_SUCCESS);
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ trx = trx_allocate_for_mysql();
+
+ trx->op_info = "creating foreign key sys tables";
+
+ row_mysql_lock_data_dictionary(trx);
+
+ if (table1) {
+ fprintf(stderr,
+ "InnoDB: dropping incompletely created"
+ " SYS_FOREIGN table\n");
+ row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+ }
+
+ if (table2) {
+ fprintf(stderr,
+ "InnoDB: dropping incompletely created"
+ " SYS_FOREIGN_COLS table\n");
+ row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+ }
+
+ fprintf(stderr,
+ "InnoDB: Creating foreign key constraint system tables\n");
+
+ /* NOTE: in dict_load_foreigns we use the fact that
+ there are 2 secondary indexes on SYS_FOREIGN, and they
+ are defined just like below */
+
+ /* NOTE: when designing InnoDB's foreign key support in 2001, we made
+ an error and made the table names and the foreign key id of type
+ 'CHAR' (internally, really a VARCHAR). We should have made the type
+ VARBINARY, like in other InnoDB system tables, to get a clean
+ design. */
+
+ error = que_eval_sql(NULL,
+ "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
+ "BEGIN\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
+ " REF_NAME CHAR, N_COLS INT);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN (ID);\n"
+ "CREATE INDEX FOR_IND"
+ " ON SYS_FOREIGN (FOR_NAME);\n"
+ "CREATE INDEX REF_IND"
+ " ON SYS_FOREIGN (REF_NAME);\n"
+ "CREATE TABLE\n"
+ "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
+ " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
+ "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+ " ON SYS_FOREIGN_COLS (ID, POS);\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr, "InnoDB: error %lu in creation\n",
+ (ulong) error);
+
+ ut_a(error == DB_OUT_OF_FILE_SPACE
+ || error == DB_TOO_MANY_CONCURRENT_TRXS);
+
+ fprintf(stderr,
+ "InnoDB: creation failed\n"
+ "InnoDB: tablespace is full\n"
+ "InnoDB: dropping incompletely created"
+ " SYS_FOREIGN tables\n");
+
+ row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+ row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+
+ error = DB_MUST_GET_MORE_FILE_SPACE;
+ }
+
+ trx_commit_for_mysql(trx);
+
+ table1 = dict_table_get_low("SYS_FOREIGN");
+ table2 = dict_table_get_low("SYS_FOREIGN_COLS");
+ table1->n_mysql_handles_opened = 1; /* for pin */
+ table2->n_mysql_handles_opened = 1; /* for pin */
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+
+ if (error == DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: Foreign key constraint system tables"
+ " created\n");
+ }
+
+ return(error);
+}
+
+/****************************************************************//**
+Evaluate the given foreign key SQL statement.
+@return error code or DB_SUCCESS */
+static
+ulint
+dict_foreign_eval_sql(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string to evaluate */
+ dict_table_t* table, /*!< in: table */
+ dict_foreign_t* foreign,/*!< in: foreign */
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint error;
+ FILE* ef = dict_foreign_err_file;
+
+ error = que_eval_sql(info, sql, FALSE, trx);
+
+ if (error == DB_DUPLICATE_KEY) {
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Error in foreign key constraint creation for table ",
+ ef);
+ ut_print_name(ef, trx, TRUE, table->name);
+ fputs(".\nA foreign key constraint of name ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->id);
+ fputs("\nalready exists."
+ " (Note that internally InnoDB adds 'databasename'\n"
+ "in front of the user-defined constraint name.)\n"
+ "Note that InnoDB's FOREIGN KEY system tables store\n"
+ "constraint names as case-insensitive, with the\n"
+ "MySQL standard latin1_swedish_ci collation. If you\n"
+ "create tables or databases whose names differ only in\n"
+ "the character case, then collisions in constraint\n"
+ "names can occur. Workaround: name your constraints\n"
+ "explicitly with unique names.\n",
+ ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(error);
+ }
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: Foreign key constraint creation failed:\n"
+ "InnoDB: internal error number %lu\n", (ulong) error);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ ut_print_timestamp(ef);
+ fputs(" Internal error in foreign key constraint creation"
+ " for table ", ef);
+ ut_print_name(ef, trx, TRUE, table->name);
+ fputs(".\n"
+ "See the MySQL .err log in the datadir"
+ " for more information.\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(error);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Add a single foreign key field definition to the data dictionary tables in
+the database.
+@return error code or DB_SUCCESS */
+static
+ulint
+dict_create_add_foreign_field_to_dictionary(
+/*========================================*/
+ ulint field_nr, /*!< in: foreign field number */
+ dict_table_t* table, /*!< in: table */
+ dict_foreign_t* foreign, /*!< in: foreign */
+ trx_t* trx) /*!< in: transaction */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_int4_literal(info, "pos", field_nr);
+
+ pars_info_add_str_literal(info, "for_col_name",
+ foreign->foreign_col_names[field_nr]);
+
+ pars_info_add_str_literal(info, "ref_col_name",
+ foreign->referenced_col_names[field_nr]);
+
+ return(dict_foreign_eval_sql(
+ info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN_COLS VALUES"
+ "(:id, :pos, :for_col_name, :ref_col_name);\n"
+ "END;\n",
+ table, foreign, trx));
+}
+
+/********************************************************************//**
+Add a single foreign key definition to the data dictionary tables in the
+database. We also generate names for constraints that were not named by the
+user. A generated constraint has a name of the format
+databasename/tablename_ibfk_NUMBER, where the numbers start from 1 and are
+assigned per table; that is, the number is not global, as it used to be in
+the old-format (< 4.0.18) constraints.
+@return error code or DB_SUCCESS */
+static
+ulint
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+ ulint* id_nr, /*!< in/out: number to use in id generation;
+ incremented if used */
+ dict_table_t* table, /*!< in: table */
+ dict_foreign_t* foreign,/*!< in: foreign */
+ trx_t* trx) /*!< in: transaction */
+{
+ ulint error;
+ ulint i;
+
+ pars_info_t* info = pars_info_create();
+
+ if (foreign->id == NULL) {
+ /* Generate a new constraint id */
+ ulint namelen = strlen(table->name);
+ char* id = mem_heap_alloc(foreign->heap, namelen + 20);
+ /* no overflow if number < 1e13 */
+ sprintf(id, "%s_ibfk_%lu", table->name, (ulong) (*id_nr)++);
+ foreign->id = id;
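+		/* e.g. for table "test/child" the first generated id is
+		"test/child_ibfk_1" */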
+ }
+
+ pars_info_add_str_literal(info, "id", foreign->id);
+
+ pars_info_add_str_literal(info, "for_name", table->name);
+
+ pars_info_add_str_literal(info, "ref_name",
+ foreign->referenced_table_name);
+
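+	/* SYS_FOREIGN.N_COLS doubles as a flags field: the constraint type
+	(the ON DELETE/ON UPDATE flags) is packed into the high byte, i.e.
+	n_cols = n_fields + (type << 24); the dictionary loader is expected
+	to unpack it the same way. */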
+ pars_info_add_int4_literal(info, "n_cols",
+ foreign->n_fields + (foreign->type << 24));
+
+ error = dict_foreign_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "INSERT INTO SYS_FOREIGN VALUES"
+ "(:id, :for_name, :ref_name, :n_cols);\n"
+ "END;\n"
+ , table, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ error = dict_create_add_foreign_field_to_dictionary(
+ i, table, foreign, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+ }
+
+ error = dict_foreign_eval_sql(NULL,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "COMMIT WORK;\n"
+ "END;\n"
+ , table, foreign, trx);
+
+ return(error);
+}
+
+/********************************************************************//**
+Adds foreign key definitions to data dictionary tables in the database.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ ulint start_id,/*!< in: if we are actually doing ALTER TABLE
+ ADD CONSTRAINT, we want to generate constraint
+ numbers which are bigger than in the table so
+ far; we number the constraints from
+ start_id + 1 up; start_id should be set to 0 if
+ we are creating a new table, or if the table
+ so far has no constraints for which the name
+ was generated here */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_foreign_t* foreign;
+ ulint number = start_id + 1;
+ ulint error;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ if (NULL == dict_table_get_low("SYS_FOREIGN")) {
+ fprintf(stderr,
+ "InnoDB: table SYS_FOREIGN not found"
+ " in internal data dictionary\n");
+
+ return(DB_ERROR);
+ }
+
+ for (foreign = UT_LIST_GET_FIRST(table->foreign_list);
+ foreign;
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+
+ error = dict_create_add_foreign_to_dictionary(&number, table,
+ foreign, trx);
+
+ if (error != DB_SUCCESS) {
+
+ return(error);
+ }
+ }
+
+ return(DB_SUCCESS);
+}
diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.c
new file mode 100644
index 00000000000..1d0517f5cc7
--- /dev/null
+++ b/storage/xtradb/dict/dict0dict.c
@@ -0,0 +1,5347 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0dict.c
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0dict.h"
+
+#ifdef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+UNIV_INTERN dict_index_t* dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+UNIV_INTERN dict_index_t* dict_ind_compact;
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "page0zip.h"
+#include "page0page.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0merge.h"
+#include "m_ctype.h" /* my_isspace() */
+#include "ha_prototypes.h" /* innobase_strcasecmp() */
+#include "srv0start.h" /* SRV_LOG_SPACE_FIRST_ID */
+
+#include <ctype.h>
+
+/** the dictionary system */
+UNIV_INTERN dict_sys_t* dict_sys = NULL;
+
+/** @brief the data dictionary rw-latch protecting dict_sys
+
+table create, drop, etc. reserve this in X-mode; implicit or
+background operations (purge, rollback, foreign key checks) reserve this
+in S-mode; we cannot trust that MySQL protects implicit or background
+operations from a table drop, since MySQL does not know of them; therefore
+we need this; NOTE: a transaction which reserves this must keep track
+of the mode in trx_struct::dict_operation_lock_mode */
+UNIV_INTERN rw_lock_t dict_operation_lock;
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512 /*!< buffer pool max size per table
+ hash table fixed size in bytes */
+#define DICT_POOL_PER_VARYING 4 /*!< buffer pool max size per data
+ dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names */
+static char dict_ibfk[] = "_ibfk_";
+
+/** array of mutexes protecting dict_index_t::stat_n_diff_key_vals[] */
+#define DICT_INDEX_STAT_MUTEX_SIZE 32
+static mutex_t dict_index_stat_mutex[DICT_INDEX_STAT_MUTEX_SIZE];
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@return TRUE if the column names were found */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: user representation of
+ a clustered index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index); /*!< in: user representation of
+ a non-clustered index */
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+static
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign); /*!< in, own: foreign constraint */
+/**********************************************************************//**
+Prints column data. */
+static
+void
+dict_col_print_low(
+/*===============*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_col_t* col); /*!< in: column */
+/**********************************************************************//**
+Prints index data. */
+static
+void
+dict_index_print_low(
+/*=================*/
+ dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Prints field data. */
+static
+void
+dict_field_print_low(
+/*=================*/
+ const dict_field_t* field); /*!< in: field */
+/*********************************************************************//**
+Frees a foreign key struct. */
+static
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign); /*!< in, own: foreign key struct */
+
+/* Stream for storing detailed information about the latest foreign key
+and unique key errors */
+UNIV_INTERN FILE* dict_foreign_err_file = NULL;
+/* mutex protecting the foreign and unique error buffers */
+UNIV_INTERN mutex_t dict_foreign_err_mutex;
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ innobase_casedn_str(a);
+}
+
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2) /*!< in: table name in the form
+ dbname '/' tablename */
+{
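+	/* e.g. TRUE for "test/t1" vs. "test/t2", FALSE for "test/t1"
+	vs. "other/t1" */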
+ for (; *name1 == *name2; name1++, name2++) {
+ if (*name1 == '/') {
+ return(TRUE);
+ }
+ ut_a(*name1); /* the names must contain '/' */
+ }
+ return(FALSE);
+}
+
+/********************************************************************//**
+Return the table name, skipping the database name and the '/' separator.
+@return table name */
+UNIV_INTERN
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s = strchr(name, '/');
+ ut_a(s);
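+	/* e.g. "test/t1" -> "t1" */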
+
+ return(s + 1);
+}
+
+/********************************************************************//**
+Get the database name length in a table name.
+@return database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+ const char* name) /*!< in: table name in the form
+ dbname '/' tablename */
+{
+ const char* s;
+ s = strchr(name, '/');
+ ut_a(s);
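+	/* e.g. "test/t1" -> 4 */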
+ return(s - name);
+}
+
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void)
+/*============================*/
+{
+ mutex_enter(&(dict_sys->mutex));
+}
+
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void)
+/*===========================*/
+{
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/** Get the mutex that protects index->stat_n_diff_key_vals[] */
+#define GET_INDEX_STAT_MUTEX(index) \
+ (&dict_index_stat_mutex[ut_fold_dulint(index->id) \
+ % DICT_INDEX_STAT_MUTEX_SIZE])
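+
+/* index->id is folded and hashed into a fixed array of mutexes, so that
+statistics updates for unrelated indexes do not all serialize on a single
+global mutex. */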
+
+/**********************************************************************//**
+Lock the appropriate mutex to protect index->stat_n_diff_key_vals[].
+index->id is used to pick the right mutex and it should not change
+before dict_index_stat_mutex_exit() is called on this index. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_enter(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index != NULL);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ ut_ad(!index->to_be_dropped);
+
+ mutex_enter(GET_INDEX_STAT_MUTEX(index));
+}
+
+/**********************************************************************//**
+Unlock the appropriate mutex that protects index->stat_n_diff_key_vals[]. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_exit(
+/*=======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index != NULL);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+ ut_ad(!index->to_be_dropped);
+
+ mutex_exit(GET_INDEX_STAT_MUTEX(index));
+}
+
+/********************************************************************//**
+Decrements the count of open MySQL handles to a table. */
+UNIV_INTERN
+void
+dict_table_decrement_handle_count(
+/*==============================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool dict_locked) /*!< in: TRUE=data dictionary locked */
+{
+ if (!dict_locked) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+ ut_a(table->n_mysql_handles_opened > 0);
+
+ table->n_mysql_handles_opened--;
+
+ if (!dict_locked) {
+ mutex_exit(&dict_sys->mutex);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_nr) /*!< in: column number */
+{
+ ulint i;
+ const char* s;
+
+ ut_ad(table);
+ ut_ad(col_nr < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ s = table->col_names;
+ if (s) {
+ for (i = 0; i < col_nr; i++) {
+ s += strlen(s) + 1;
+ }
+ }
+
+ return(s);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Acquire the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ mutex_enter(&table->autoinc_mutex);
+}
+
+/********************************************************************//**
+Unconditionally set the autoinc counter. */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value) /*!< in: next value to assign to a row */
+{
+ ut_ad(mutex_own(&table->autoinc_mutex));
+
+ table->autoinc = value;
+}
+
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+@return value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(mutex_own(&table->autoinc_mutex));
+
+ return(table->autoinc);
+}
+
+/********************************************************************//**
+Updates the autoinc counter if the value supplied is greater than the
+current value. */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value) /*!< in: value which was assigned to a row */
+{
+ ut_ad(mutex_own(&table->autoinc_mutex));
+
+ if (value > table->autoinc) {
+
+ table->autoinc = value;
+ }
+}
+
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+ dict_table_t* table) /*!< in/out: table */
+{
+ mutex_exit(&table->autoinc_mutex);
+}
+
+/**********************************************************************//**
+Looks for an index with the given table and index id.
+NOTE that we do not reserve the dictionary mutex.
+@return index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_on_id_low(
+/*=====================*/
+ dict_table_t* table, /*!< in: table */
+ dulint id) /*!< in: index id */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index) {
+ if (0 == ut_dulint_cmp(id, index->id)) {
+ /* Found */
+
+ return(index);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n) /*!< in: column number */
+{
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ col = dict_table_get_nth_col(index->table, n);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_col_get_clust_pos(col, index));
+ }
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col && field->prefix_len == 0) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+@return TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n) /*!< in: column number */
+{
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ if (dict_index_is_clust(index)) {
+
+ return(TRUE);
+ }
+
+ col = dict_table_get_nth_col(index->table, n);
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix at least as long
+as the prefix of the column in index2. That is, we must be able to construct
+the prefix in index2 from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n) /*!< in: field number in index2 */
+{
+ const dict_field_t* field;
+ const dict_field_t* field2;
+ ulint n_fields;
+ ulint pos;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ field2 = dict_index_get_nth_field(index2, n);
+
+ n_fields = dict_index_get_n_fields(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (field->col == field2->col
+ && (field->prefix_len == 0
+ || (field->prefix_len >= field2->prefix_len
+ && field2->prefix_len != 0))) {
+
+ return(pos);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get_on_id(
+/*=================*/
+ dulint table_id, /*!< in: table id */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_table_t* table;
+
+ if (trx->dict_operation_lock_mode == RW_X_LATCH) {
+
+ /* Note: An X latch implies that the transaction
+ already owns the dictionary mutex. */
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ return(dict_table_get_on_id_low(table_id));
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = dict_table_get_on_id_low(table_id);
+
+ dict_table_LRU_trim(table);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(table);
+}
+
+/********************************************************************//**
+Looks for the position of column n in the clustered index.
+@return position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ return(dict_index_get_nth_col_pos(dict_table_get_first_index(table),
+ n));
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n) /*!< in: column number */
+{
+ const dict_index_t* index;
+ const dict_field_t* field;
+ const dict_col_t* col;
+ ulint pos;
+ ulint n_fields;
+
+ ut_ad(table);
+
+ col = dict_table_get_nth_col(table, n);
+
+ index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ for (pos = 0; pos < n_fields; pos++) {
+ field = dict_index_get_nth_field(index, pos);
+
+ if (col == field->col) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Inits the data dictionary module. */
+UNIV_INTERN
+void
+dict_init(void)
+/*===========*/
+{
+ int i;
+
+ dict_sys = mem_alloc(sizeof(dict_sys_t));
+
+ mutex_create(&dict_sys->mutex, SYNC_DICT);
+
+ dict_sys->table_hash = hash_create(buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH
+ * UNIV_WORD_SIZE));
+ dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size()
+ / (DICT_POOL_PER_TABLE_HASH
+ * UNIV_WORD_SIZE));
+ dict_sys->size = 0;
+
+ UT_LIST_INIT(dict_sys->table_LRU);
+
+ rw_lock_create(&dict_operation_lock, SYNC_DICT_OPERATION);
+
+ dict_foreign_err_file = os_file_create_tmpfile();
+ ut_a(dict_foreign_err_file);
+
+ mutex_create(&dict_foreign_err_mutex, SYNC_ANY_LATCH);
+
+ for (i = 0; i < DICT_INDEX_STAT_MUTEX_SIZE; i++) {
+ mutex_create(&dict_index_stat_mutex[i], SYNC_INDEX_TREE);
+ }
+}
+
+/**********************************************************************//**
+Returns a table object and optionally increments its MySQL open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low is usually the
+appropriate function.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get(
+/*===========*/
+ const char* table_name, /*!< in: table name */
+ ibool inc_mysql_count)/*!< in: whether to increment the open
+ handle count on the table */
+{
+ dict_table_t* table;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = dict_table_get_low(table_name);
+
+ if (inc_mysql_count && table) {
+ table->n_mysql_handles_opened++;
+ }
+
+ dict_table_LRU_trim(table);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ if (table != NULL) {
+ if (!table->stat_initialized && !table->is_corrupt) {
+ /* If table->ibd_file_missing == TRUE, this will
+ print an error message and return without doing
+ anything. */
+ dict_update_statistics(table, FALSE);
+ }
+ }
+
+ return(table);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+{
+ ut_ad(table);
+ ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!table->cached);
+
+ /* NOTE: the system columns MUST be added in the following order
+ (so that they can be indexed by the numerical value of DATA_ROW_ID,
+ etc.) and as the last columns of the table memory object.
+ The clustered index will not always physically contain all
+ system columns. */
+
+ dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS,
+ DATA_ROW_ID | DATA_NOT_NULL,
+ DATA_ROW_ID_LEN);
+#if DATA_ROW_ID != 0
+#error "DATA_ROW_ID != 0"
+#endif
+ dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS,
+ DATA_TRX_ID | DATA_NOT_NULL,
+ DATA_TRX_ID_LEN);
+#if DATA_TRX_ID != 1
+#error "DATA_TRX_ID != 1"
+#endif
+ dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS,
+ DATA_ROLL_PTR | DATA_NOT_NULL,
+ DATA_ROLL_PTR_LEN);
+#if DATA_ROLL_PTR != 2
+#error "DATA_ROLL_PTR != 2"
+#endif
+
+ /* This check reminds that if a new system column is added to
+ the program, it should be dealt with here */
+#if DATA_N_SYS_COLS != 3
+#error "DATA_N_SYS_COLS != 3"
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: temporary heap */
+{
+ ulint fold;
+ ulint id_fold;
+ ulint i;
+ ulint row_len;
+
+ /* The lower limit for what we consider a "big" row */
+#define BIG_ROW_SIZE 1024
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ dict_table_add_system_columns(table, heap);
+
+ table->cached = TRUE;
+
+ fold = ut_fold_string(table->name);
+ id_fold = ut_fold_dulint(table->id);
+
+ row_len = 0;
+ for (i = 0; i < table->n_def; i++) {
+ ulint col_len = dict_col_get_max_size(
+ dict_table_get_nth_col(table, i));
+
+ row_len += col_len;
+
+ /* If we have a single unbounded field, or several gigantic
+ fields, mark the maximum row size as BIG_ROW_SIZE. */
+ if (row_len >= BIG_ROW_SIZE || col_len >= BIG_ROW_SIZE) {
+ row_len = BIG_ROW_SIZE;
+
+ break;
+ }
+ }
+
+ table->big_rows = row_len >= BIG_ROW_SIZE;
+
+ /* Look for a table with the same name: error if such exists */
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ ut_strcmp(table2->name, table->name) == 0);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different name */
+ HASH_SEARCH_ALL(name_hash, dict_sys->table_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Look for a table with the same id: error if such exists */
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ ut_dulint_cmp(table2->id, table->id) == 0);
+ ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+ /* Look for the same table pointer with a different id */
+ HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ table2 == table);
+ ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Add table to hash table of tables */
+ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
+ table);
+
+ /* Add table to hash table of tables based on table id */
+ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold,
+ table);
+ /* Add table to LRU list of tables */
+ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+ dict_sys->size += mem_heap_get_size(table->heap)
+ + strlen(table->name) + 1;
+}
+
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ dulint id) /*!< in: index id */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+
+ while (table) {
+ index = dict_table_get_first_index(table);
+
+ while (index) {
+ if (0 == ut_dulint_cmp(id, index->id)) {
+ /* Found */
+
+ return(index);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Renames a table object.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ ibool rename_also_foreigns)/*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+{
+ dict_foreign_t* foreign;
+ dict_index_t* index;
+ ulint fold;
+ char old_name[MAX_TABLE_NAME_LEN + 1];
+
+ ut_ad(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* store the old/current name to an automatic variable */
+ if (strlen(table->name) + 1 <= sizeof(old_name)) {
+ memcpy(old_name, table->name, strlen(table->name) + 1);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, "InnoDB: too long table name: '%s', "
+ "max length is %d\n", table->name,
+ MAX_TABLE_NAME_LEN);
+ ut_error;
+ }
+
+ fold = ut_fold_string(new_name);
+
+ /* Look for a table with the same name: error if such exists */
+ {
+ dict_table_t* table2;
+ HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+ dict_table_t*, table2, ut_ad(table2->cached),
+ (ut_strcmp(table2->name, new_name) == 0));
+ if (UNIV_LIKELY_NULL(table2)) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: dictionary cache"
+ " already contains a table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: cannot rename table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, old_name);
+ putc('\n', stderr);
+ return(FALSE);
+ }
+ }
+
+ /* If the table is stored in a single-table tablespace, rename the
+ .ibd file */
+
+ if (table->space != 0) {
+ if (table->dir_path_of_temp_table != NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: trying to rename a"
+ " TEMPORARY TABLE ", stderr);
+ ut_print_name(stderr, NULL, TRUE, old_name);
+ fputs(" (", stderr);
+ ut_print_filename(stderr,
+ table->dir_path_of_temp_table);
+ fputs(" )\n", stderr);
+ return(FALSE);
+ } else if (!fil_rename_tablespace(old_name, table->space,
+ new_name)) {
+ return(FALSE);
+ }
+ }
+
+ /* Remove table from the hash tables of tables */
+ HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
+ ut_fold_string(old_name), table);
+
+ if (strlen(new_name) > strlen(table->name)) {
+		/* We allocate MAX_TABLE_NAME_LEN + 1 bytes here to avoid
+		memory fragmentation; we assume that repeated calls of
+		ut_realloc() with the same size do not cause fragmentation */
+ ut_a(strlen(new_name) <= MAX_TABLE_NAME_LEN);
+ table->name = ut_realloc(table->name, MAX_TABLE_NAME_LEN + 1);
+ }
+ memcpy(table->name, new_name, strlen(new_name) + 1);
+
+ /* Add table to hash table of tables */
+ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
+ table);
+
+ dict_sys->size += strlen(new_name) - strlen(old_name);
+ ut_a(dict_sys->size > 0);
+
+ /* Update the table_name field in indexes */
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ index->table_name = table->name;
+
+ index = dict_table_get_next_index(index);
+ }
+
+ if (!rename_also_foreigns) {
+ /* In ALTER TABLE we think of the rename table operation
+ in the direction table -> temporary table (#sql...)
+		as dropping the table with the old name and creating
+		a new one with the new name. Thus we kind of drop the
+		constraints from the dictionary cache here. The foreign key
+		constraints will be inherited by the new table from the
+		system tables through a call of dict_load_foreigns(). */
+
+ /* Remove the foreign constraints from the cache */
+ foreign = UT_LIST_GET_LAST(table->foreign_list);
+
+ while (foreign != NULL) {
+ dict_foreign_remove_from_cache(foreign);
+ foreign = UT_LIST_GET_LAST(table->foreign_list);
+ }
+
+ /* Reset table field in referencing constraints */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign != NULL) {
+ foreign->referenced_table = NULL;
+ foreign->referenced_index = NULL;
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ /* Make the list of referencing constraints empty */
+
+ UT_LIST_INIT(table->referenced_list);
+
+ return(TRUE);
+ }
+
+ /* Update the table name fields in foreign constraints, and update also
+ the constraint id of new format >= 4.0.18 constraints. Note that at
+ this point we have already changed table->name to the new name. */
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign != NULL) {
+ if (ut_strlen(foreign->foreign_table_name)
+ < ut_strlen(table->name)) {
+ /* Allocate a longer name buffer;
+ TODO: store buf len to save memory */
+
+ foreign->foreign_table_name
+ = mem_heap_alloc(foreign->heap,
+ ut_strlen(table->name) + 1);
+ }
+
+ strcpy(foreign->foreign_table_name, table->name);
+
+ if (strchr(foreign->id, '/')) {
+ ulint db_len;
+ char* old_id;
+
+ /* This is a >= 4.0.18 format id */
+
+ old_id = mem_strdup(foreign->id);
+
+ if (ut_strlen(foreign->id) > ut_strlen(old_name)
+ + ((sizeof dict_ibfk) - 1)
+ && !memcmp(foreign->id, old_name,
+ ut_strlen(old_name))
+ && !memcmp(foreign->id + ut_strlen(old_name),
+ dict_ibfk, (sizeof dict_ibfk) - 1)) {
+
+ /* This is a generated >= 4.0.18 format id */
+
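+				/* e.g. renaming test/t1 to test/t2 turns a
+				generated id "test/t1_ibfk_3" into
+				"test/t2_ibfk_3" */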
+ if (strlen(table->name) > strlen(old_name)) {
+ foreign->id = mem_heap_alloc(
+ foreign->heap,
+ strlen(table->name)
+ + strlen(old_id) + 1);
+ }
+
+ /* Replace the prefix 'databasename/tablename'
+ with the new names */
+ strcpy(foreign->id, table->name);
+ strcat(foreign->id,
+ old_id + ut_strlen(old_name));
+ } else {
+ /* This is a >= 4.0.18 format id where the user
+ gave the id name */
+ db_len = dict_get_db_name_len(table->name) + 1;
+
+ if (dict_get_db_name_len(table->name)
+ > dict_get_db_name_len(foreign->id)) {
+
+ foreign->id = mem_heap_alloc(
+ foreign->heap,
+ db_len + strlen(old_id) + 1);
+ }
+
+ /* Replace the database prefix in id with the
+ one from table->name */
+
+ ut_memcpy(foreign->id, table->name, db_len);
+
+ strcpy(foreign->id + db_len,
+ dict_remove_db_name(old_id));
+ }
+
+ mem_free(old_id);
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign != NULL) {
+ if (ut_strlen(foreign->referenced_table_name)
+ < ut_strlen(table->name)) {
+ /* Allocate a longer name buffer;
+ TODO: store buf len to save memory */
+
+ foreign->referenced_table_name = mem_heap_alloc(
+ foreign->heap, strlen(table->name) + 1);
+ }
+
+ strcpy(foreign->referenced_table_name, table->name);
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ dulint new_id) /*!< in: new id to set */
+{
+ ut_ad(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Remove the table from the hash table of id's */
+
+ HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_dulint(table->id), table);
+ table->id = new_id;
+
+ /* Add the table back to the hash table */
+ HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_dulint(table->id), table);
+}
+
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+ dict_table_t* table) /*!< in, own: table */
+{
+ dict_foreign_t* foreign;
+ dict_index_t* index;
+ ulint size;
+
+ ut_ad(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+#if 0
+ fputs("Removing table ", stderr);
+ ut_print_name(stderr, table->name, ULINT_UNDEFINED);
+ fputs(" from dictionary cache\n", stderr);
+#endif
+
+ /* Remove the foreign constraints from the cache */
+ foreign = UT_LIST_GET_LAST(table->foreign_list);
+
+ while (foreign != NULL) {
+ dict_foreign_remove_from_cache(foreign);
+ foreign = UT_LIST_GET_LAST(table->foreign_list);
+ }
+
+ /* Reset table field in referencing constraints */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign != NULL) {
+ foreign->referenced_table = NULL;
+ foreign->referenced_index = NULL;
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ /* Remove the indexes from the cache */
+ index = UT_LIST_GET_LAST(table->indexes);
+
+ while (index != NULL) {
+ dict_index_remove_from_cache(table, index);
+ index = UT_LIST_GET_LAST(table->indexes);
+ }
+
+ /* Remove table from the hash tables of tables */
+ HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
+ ut_fold_string(table->name), table);
+ HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+ ut_fold_dulint(table->id), table);
+
+ /* Remove table from LRU list of tables */
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+ size = mem_heap_get_size(table->heap) + strlen(table->name) + 1;
+
+ ut_ad(dict_sys->size >= size);
+
+ dict_sys->size -= size;
+
+ dict_mem_table_free(table);
+}
+
+/**********************************************************************//**
+Frees tables from the end of table_LRU if the dictionary cache occupies
+too much space. */
+UNIV_INTERN
+void
+dict_table_LRU_trim(
+/*================*/
+ dict_table_t* self)
+{
+ dict_table_t* table;
+ dict_table_t* prev_table;
+ dict_foreign_t* foreign;
+ ulint n_removed;
+ ulint n_have_parent;
+ ulint cached_foreign_tables;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
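+	/* While srv_dict_size_limit is set and the dictionary cache
+	(dict_sys->size plus the cells of the two table hash tables)
+	exceeds it, evict unused tables from the tail of the LRU list.
+	Tables pinned by open MySQL handles, marked corrupt, or equal to
+	'self' are skipped, as are tables whose foreign keys still point
+	to cached referenced tables; the latter are retried after other
+	tables have been removed. */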
+retry:
+ n_removed = n_have_parent = 0;
+ table = UT_LIST_GET_LAST(dict_sys->table_LRU);
+
+ while ( srv_dict_size_limit && table
+ && ((dict_sys->table_hash->n_cells
+ + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t)
+ + dict_sys->size) > srv_dict_size_limit ) {
+ prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+ if (table == self || table->n_mysql_handles_opened || table->is_corrupt)
+ goto next_loop;
+
+ cached_foreign_tables = 0;
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+ while (foreign != NULL) {
+ if (foreign->referenced_table)
+ cached_foreign_tables++;
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ if (cached_foreign_tables == 0) {
+ dict_table_remove_from_cache(table);
+ n_removed++;
+ } else {
+ n_have_parent++;
+ }
+next_loop:
+ table = prev_table;
+ }
+
+ if ( srv_dict_size_limit && n_have_parent && n_removed
+ && ((dict_sys->table_hash->n_cells
+ + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t)
+ + dict_sys->size) > srv_dict_size_limit )
+ goto retry;
+}
+
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name) /*!< in: column name */
+{
+ /* This check reminds that if a new system column is added to
+ the program, it should be dealt with here. */
+#if DATA_N_SYS_COLS != 3
+#error "DATA_N_SYS_COLS != 3"
+#endif
+
+ static const char* reserved_names[] = {
+ "DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"
+ };
+
+ ulint i;
+
+ for (i = 0; i < UT_ARR_SIZE(reserved_names); i++) {
+ if (innobase_strcasecmp(name, reserved_names[i]) == 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+If an undo log record for this table might not fit on a single page,
+return TRUE.
+@return TRUE if the undo log record could become too big */
+static
+ibool
+dict_index_too_big_for_undo(
+/*========================*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* new_index) /*!< in: index */
+{
+ /* Make sure that all column prefixes will fit in the undo log record
+ in trx_undo_page_report_modify() right after trx_undo_page_init(). */
+
+ ulint i;
+ const dict_index_t* clust_index
+ = dict_table_get_first_index(table);
+ ulint undo_page_len
+ = TRX_UNDO_PAGE_HDR - TRX_UNDO_PAGE_HDR_SIZE
+ + 2 /* next record pointer */
+ + 1 /* type_cmpl */
+ + 11 /* trx->undo_no */ + 11 /* table->id */
+ + 1 /* rec_get_info_bits() */
+ + 11 /* DB_TRX_ID */
+ + 11 /* DB_ROLL_PTR */
+ + 10 + FIL_PAGE_DATA_END /* trx_undo_left() */
+ + 2/* pointer to previous undo log record */;
+
+ if (UNIV_UNLIKELY(!clust_index)) {
+ ut_a(dict_index_is_clust(new_index));
+ clust_index = new_index;
+ }
+
+ /* Add the size of the ordering columns in the
+ clustered index. */
+ for (i = 0; i < clust_index->n_uniq; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(clust_index, i);
+
+ /* Use the maximum output size of
+ mach_write_compressed(), although the encoded
+ length should always fit in 2 bytes. */
+ undo_page_len += 5 + dict_col_get_max_size(col);
+ }
+
+ /* Add the old values of the columns to be updated.
+ First, the amount and the numbers of the columns.
+ These are written by mach_write_compressed() whose
+ maximum output length is 5 bytes. However, given that
+ the quantities are below REC_MAX_N_FIELDS (10 bits),
+ the maximum length is 2 bytes per item. */
+ undo_page_len += 2 * (dict_table_get_n_cols(table) + 1);
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(clust_index, i);
+ ulint max_size
+ = dict_col_get_max_size(col);
+ ulint fixed_size
+ = dict_col_get_fixed_size(col,
+ dict_table_is_comp(table));
+
+ if (fixed_size) {
+ /* Fixed-size columns are stored locally. */
+ max_size = fixed_size;
+ } else if (max_size <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+ /* Short columns are stored locally. */
+ } else if (!col->ord_part) {
+ /* See if col->ord_part would be set
+ because of new_index. */
+ ulint j;
+
+ for (j = 0; j < new_index->n_uniq; j++) {
+ if (dict_index_get_nth_col(
+ new_index, j) == col) {
+
+ goto is_ord_part;
+ }
+ }
+
+ /* This is not an ordering column in any index.
+ Thus, it can be stored completely externally. */
+ max_size = BTR_EXTERN_FIELD_REF_SIZE;
+ } else {
+is_ord_part:
+ /* This is an ordering column in some index.
+ A long enough prefix must be written to the
+ undo log. See trx_undo_page_fetch_ext(). */
+
+ if (max_size > REC_MAX_INDEX_COL_LEN) {
+ max_size = REC_MAX_INDEX_COL_LEN;
+ }
+
+ max_size += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ undo_page_len += 5 + max_size;
+ }
+
+ return(undo_page_len >= UNIV_PAGE_SIZE);
+}
+
+/****************************************************************//**
+If a record of this index might not fit on a single B-tree page,
+return TRUE.
+@return TRUE if the index record could become too big */
+static
+ibool
+dict_index_too_big_for_tree(
+/*========================*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* new_index) /*!< in: index */
+{
+ ulint zip_size;
+ ulint comp;
+ ulint i;
+ /* maximum possible storage size of a record */
+ ulint rec_max_size;
+ /* maximum allowed size of a record on a leaf page */
+ ulint page_rec_max;
+ /* maximum allowed size of a node pointer record */
+ ulint page_ptr_max;
+
+ comp = dict_table_is_comp(table);
+ zip_size = dict_table_zip_size(table);
+
+ if (zip_size && zip_size < UNIV_PAGE_SIZE) {
+ /* On a compressed page, two records must fit in the
+ uncompressed page modification log. On compressed
+ pages with zip_size == UNIV_PAGE_SIZE, this limit will
+ never be reached. */
+ ut_ad(comp);
+ /* The maximum allowed record size is the size of
+		an empty page, minus a byte for recording the heap
+ number in the page modification log. The maximum
+ allowed node pointer size is half that. */
+ page_rec_max = page_zip_empty_size(new_index->n_fields,
+ zip_size) - 1;
+ page_ptr_max = page_rec_max / 2;
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. */
+ rec_max_size = 2;
+ } else {
+ /* The maximum allowed record size is half a B-tree
+ page. No additional sparse page directory entry will
+ be generated for the first few user records. */
+ page_rec_max = page_get_free_space_of_empty(comp) / 2;
+ page_ptr_max = page_rec_max;
+ /* Each record has a header. */
+ rec_max_size = comp
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES;
+ }
+
+ if (comp) {
+ /* Include the "null" flags in the
+ maximum possible record size. */
+ rec_max_size += UT_BITS_IN_BYTES(new_index->n_nullable);
+ } else {
+ /* For each column, include a 2-byte offset and a
+ "null" flag. The 1-byte format is only used in short
+ records that do not contain externally stored columns.
+ Such records could never exceed the page limit, even
+ when using the 2-byte format. */
+ rec_max_size += 2 * new_index->n_fields;
+ }
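+ /* At this point rec_max_size covers the per-record overhead:
+ the record header, the "null" flags and, in ROW_FORMAT=REDUNDANT,
+ the per-field offsets. The loop below adds the worst-case data
+ length of each field and checks the leaf page and node pointer
+ limits. */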
+
+ /* Compute the maximum possible record size. */
+ for (i = 0; i < new_index->n_fields; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ ulint field_max_size;
+ ulint field_ext_max_size;
+
+ /* In dtuple_convert_big_rec(), variable-length columns
+ that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+ may be chosen for external storage.
+
+ Fixed-length columns, and all columns of secondary
+ index records are always stored inline. */
+
+ /* Determine the maximum length of the index field.
+ The field_ext_max_size should be computed as the worst
+ case in rec_get_converted_size_comp() for
+ REC_STATUS_ORDINARY records. */
+
+ field_max_size = dict_col_get_fixed_size(col, comp);
+ if (field_max_size) {
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || field->fixed_len == field->prefix_len);
+ /* Fixed lengths are not encoded
+ in ROW_FORMAT=COMPACT. */
+ field_ext_max_size = 0;
+ goto add_field_size;
+ }
+
+ field_max_size = dict_col_get_max_size(col);
+ field_ext_max_size = field_max_size < 256 ? 1 : 2;
+
+ if (field->prefix_len) {
+ if (field->prefix_len < field_max_size) {
+ field_max_size = field->prefix_len;
+ }
+ } else if (field_max_size > BTR_EXTERN_FIELD_REF_SIZE * 2
+ && dict_index_is_clust(new_index)) {
+
+ /* In the worst case, we have a locally stored
+ column of BTR_EXTERN_FIELD_REF_SIZE * 2 bytes.
+ The length can be stored in one byte. If the
+ column were stored externally, the lengths in
+ the clustered index page would be
+ BTR_EXTERN_FIELD_REF_SIZE and 2. */
+ field_max_size = BTR_EXTERN_FIELD_REF_SIZE * 2;
+ field_ext_max_size = 1;
+ }
+
+ if (comp) {
+ /* Add the extra size for ROW_FORMAT=COMPACT.
+ For ROW_FORMAT=REDUNDANT, these bytes were
+ added to rec_max_size before this loop. */
+ rec_max_size += field_ext_max_size;
+ }
+add_field_size:
+ rec_max_size += field_max_size;
+
+ /* Check the size limit on leaf pages. */
+ if (UNIV_UNLIKELY(rec_max_size >= page_rec_max)) {
+
+ return(TRUE);
+ }
+
+ /* Check the size limit on non-leaf pages. Records
+ stored in non-leaf B-tree pages consist of the unique
+ columns of the record (the key columns of the B-tree)
+ and a node pointer field. When we have processed the
+ unique columns, rec_max_size equals the size of the
+ node pointer record minus the node pointer column. */
+ if (i + 1 == dict_index_get_n_unique_in_tree(new_index)
+ && rec_max_size + REC_NODE_PTR_SIZE >= page_ptr_max) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+ulint
+dict_index_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table on which the index is */
+ dict_index_t* index, /*!< in, own: index; NOTE! The index memory
+ object is freed in this function! */
+ ulint page_no,/*!< in: root page number of the index */
+ ibool strict) /*!< in: TRUE=refuse to create the index
+ if records could be too big to fit in
+ a B-tree page */
+{
+ dict_index_t* new_index;
+ ulint n_ord;
+ ulint i;
+
+ ut_ad(index);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(index->n_def == index->n_fields);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ ut_ad(mem_heap_validate(index->heap));
+ ut_a(!dict_index_is_clust(index)
+ || UT_LIST_GET_LEN(table->indexes) == 0);
+
+ if (!dict_index_find_cols(table, index)) {
+
+ dict_mem_index_free(index);
+ return(DB_CORRUPTION);
+ }
+
+ /* Build the cache internal representation of the index,
+ containing also the added system fields */
+
+ if (dict_index_is_clust(index)) {
+ new_index = dict_index_build_internal_clust(table, index);
+ } else {
+ new_index = dict_index_build_internal_non_clust(table, index);
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields in the cache internal representation */
+
+ new_index->n_fields = new_index->n_def;
+
+ if (strict && dict_index_too_big_for_tree(table, new_index)) {
+too_big:
+ dict_mem_index_free(new_index);
+ dict_mem_index_free(index);
+ return(DB_TOO_BIG_RECORD);
+ }
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ n_ord = new_index->n_fields;
+ } else {
+ n_ord = new_index->n_uniq;
+ }
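+ /* n_ord is the number of fields whose columns will be flagged
+ below as ordering columns (col->ord_part): every field for a
+ DICT_UNIVERSAL index, otherwise the unique prefix of the
+ index. */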
+
+ switch (dict_table_get_format(table)) {
+ case DICT_TF_FORMAT_51:
+ /* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
+ prefixes of externally stored columns locally within
+ the record. There are no special considerations for
+ the undo log record size. */
+ goto undo_size_ok;
+
+ case DICT_TF_FORMAT_ZIP:
+ /* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
+ column prefix indexes require that prefixes of
+ externally stored columns are written to the undo log.
+ This may make the undo log record bigger than the
+ record on the B-tree page. The maximum size of an
+ undo log record is the page size. That must be
+ checked for below. */
+ break;
+
+#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
+# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#endif
+ }
+
+ for (i = 0; i < n_ord; i++) {
+ const dict_field_t* field
+ = dict_index_get_nth_field(new_index, i);
+ const dict_col_t* col
+ = dict_field_get_col(field);
+
+ /* In dtuple_convert_big_rec(), variable-length columns
+ that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+ may be chosen for external storage. If the column appears
+ in an ordering column of an index, a longer prefix of
+ REC_MAX_INDEX_COL_LEN will be copied to the undo log
+ by trx_undo_page_report_modify() and
+ trx_undo_page_fetch_ext(). It suffices to check the
+ capacity of the undo log whenever new_index includes
+ a column prefix on a column that may be stored externally. */
+
+ if (field->prefix_len /* prefix index */
+ && !col->ord_part /* not yet ordering column */
+ && !dict_col_get_fixed_size(col, TRUE) /* variable-length */
+ && dict_col_get_max_size(col)
+ > BTR_EXTERN_FIELD_REF_SIZE * 2 /* long enough */) {
+
+ if (dict_index_too_big_for_undo(table, new_index)) {
+ /* An undo log record might not fit in
+ a single page. Refuse to create this index. */
+
+ goto too_big;
+ }
+
+ break;
+ }
+ }
+
+undo_size_ok:
+ /* Flag the ordering columns */
+
+ for (i = 0; i < n_ord; i++) {
+
+ dict_index_get_nth_field(new_index, i)->col->ord_part = 1;
+ }
+
+ /* Add the new index as the last index for the table */
+
+ UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
+ new_index->table = table;
+ new_index->table_name = table->name;
+
+ new_index->search_info = btr_search_info_create(new_index->heap);
+
+ new_index->stat_index_size = 1;
+ new_index->stat_n_leaf_pages = 1;
+
+ new_index->page = page_no;
+ rw_lock_create(&new_index->lock, SYNC_INDEX_TREE);
+
+ if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {
+
+ new_index->stat_n_diff_key_vals = mem_heap_alloc(
+ new_index->heap,
+ (1 + dict_index_get_n_unique(new_index))
+ * sizeof(ib_int64_t));
+ /* Give some sensible values to stat_n_... in case we do
+ not calculate statistics quickly enough */
+
+ for (i = 0; i <= dict_index_get_n_unique(new_index); i++) {
+
+ new_index->stat_n_diff_key_vals[i] = 100;
+ }
+ }
+
+ dict_sys->size += mem_heap_get_size(new_index->heap);
+
+ dict_mem_index_free(index);
+
+ return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in, own: index */
+{
+ ulint size;
+ ulint retries = 0;
+ btr_search_t* info;
+
+ ut_ad(table && index);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Remove all entries of the index from the adaptive hash index,
+ because removing them requires access to this dict_index_t */
+ if (btr_search_enabled && srv_dict_size_limit) {
+ btr_search_drop_page_hash_index_on_index(index);
+ }
+
+ /* We always create search info, whether or not the adaptive
+ hash index is enabled. */
+ info = index->search_info;
+ ut_ad(info);
+
+ /* We are not allowed to free the in-memory index struct
+ dict_index_t until all entries in the adaptive hash index
+ that point to any of the pages belonging to this B-tree index
+ are dropped, because dropping those entries requires access to
+ the dict_index_t struct. To avoid such a scenario, we keep a
+ count of the number of such pages in search_info and only free
+ the dict_index_t struct when this count drops to zero. */
+
+ for (;;) {
+ ulint ref_count = btr_search_info_get_ref_count(info);
+ if (ref_count == 0) {
+ break;
+ }
+
+ /* Sleep for 10ms before trying again. */
+ os_thread_sleep(10000);
+ ++retries;
+
+ if (retries % 500 == 0) {
+ /* No luck after 5 seconds of wait. */
+ fprintf(stderr, "InnoDB: Error: Waited for"
+ " %lu secs for hash index"
+ " ref_count (%lu) to drop"
+ " to 0.\n"
+ "index: \"%s\""
+ " table: \"%s\"\n",
+ retries/100,
+ ref_count,
+ index->name,
+ table->name);
+ }
+
+ /* To avoid a hang here we commit suicide if the
+ ref_count doesn't drop to zero in 600 seconds. */
+ if (retries >= 60000) {
+ ut_error;
+ }
+ }
+
+ rw_lock_free(&index->lock);
+
+ /* Remove the index from the list of indexes of the table */
+ UT_LIST_REMOVE(indexes, table->indexes, index);
+
+ size = mem_heap_get_size(index->heap);
+
+ ut_ad(dict_sys->size >= size);
+
+ dict_sys->size -= size;
+
+ dict_mem_index_free(index);
+}
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@return TRUE if the column names were found */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: index */
+{
+ ulint i;
+
+ ut_ad(table && index);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ for (i = 0; i < index->n_fields; i++) {
+ ulint j;
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+
+ for (j = 0; j < table->n_cols; j++) {
+ if (!strcmp(dict_table_get_col_name(table, j),
+ field->name)) {
+ field->col = dict_table_get_nth_col(table, j);
+
+ goto found;
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ /* It is an error not to find a matching column. */
+ fputs("InnoDB: Error: no matching column for ", stderr);
+ ut_print_name(stderr, NULL, FALSE, field->name);
+ fputs(" in ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fputs("!\n", stderr);
+#endif /* UNIV_DEBUG */
+ return(FALSE);
+
+found:
+ ;
+ }
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Adds a column to index. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len) /*!< in: column prefix length */
+{
+ dict_field_t* field;
+ const char* col_name;
+
+ col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+
+ dict_mem_index_add_field(index, col_name, prefix_len);
+
+ field = dict_index_get_nth_field(index, index->n_def - 1);
+
+ field->col = col;
+ field->fixed_len = (unsigned int) dict_col_get_fixed_size(
+ col, dict_table_is_comp(table));
+
+ if (prefix_len && field->fixed_len > prefix_len) {
+ field->fixed_len = (unsigned int) prefix_len;
+ }
+
+ /* Long fixed-length fields that need external storage are treated as
+ variable-length fields, so that the extern flag can be embedded in
+ the length word. */
+
+ if (field->fixed_len > DICT_MAX_INDEX_COL_LEN) {
+ field->fixed_len = 0;
+ }
+#if DICT_MAX_INDEX_COL_LEN != 768
+ /* The comparison limit above must be constant. If it were
+ changed, the disk format of some fixed-length columns would
+ change, which would be a disaster. */
+# error "DICT_MAX_INDEX_COL_LEN != 768"
+#endif
+
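+ /* Keep count of nullable fields: this determines the size of
+ the "null" flags bitmap in ROW_FORMAT=COMPACT records. */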
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ index->n_nullable++;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies fields contained in index2 to index1. */
+static
+void
+dict_index_copy(
+/*============*/
+ dict_index_t* index1, /*!< in: index to copy to */
+ dict_index_t* index2, /*!< in: index to copy from */
+ const dict_table_t* table, /*!< in: table */
+ ulint start, /*!< in: first position to copy */
+ ulint end) /*!< in: last position to copy */
+{
+ dict_field_t* field;
+ ulint i;
+
+ /* Copy fields contained in index2 */
+
+ for (i = start; i < end; i++) {
+
+ field = dict_index_get_nth_field(index2, i);
+ dict_index_add_col(index1, table, field->col,
+ field->prefix_len);
+ }
+}
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields) /*!< in: number of
+ field types to copy */
+{
+ ulint i;
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ dtuple_set_types_binary(tuple, n_fields);
+
+ return;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* ifield;
+ dtype_t* dfield_type;
+
+ ifield = dict_index_get_nth_field(index, i);
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dict_col_copy_type(dict_field_get_col(ifield), dfield_type);
+ }
+}
+
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table) /*!< in: table */
+{
+ ulint i;
+
+ for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+ dfield_t* dfield = dtuple_get_nth_field(tuple, i);
+ dtype_t* dtype = dfield_get_type(dfield);
+
+ dfield_set_null(dfield);
+ dict_col_copy_type(dict_table_get_nth_col(table, i), dtype);
+ }
+}
+
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: user representation of
+ a clustered index */
+{
+ dict_index_t* new_index;
+ dict_field_t* field;
+ ulint fixed_size;
+ ulint trx_id_pos;
+ ulint i;
+ ibool* indexed;
+
+ ut_ad(table && index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* Create a new index object with certainly enough fields */
+ new_index = dict_mem_index_create(table->name,
+ index->name, table->space,
+ index->type,
+ index->n_fields + table->n_cols);
+
+ /* Copy other relevant data from the old index struct to the new
+ struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy the fields of index */
+ dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ /* No fixed number of fields determines an entry uniquely */
+
+ new_index->n_uniq = REC_MAX_N_FIELDS;
+
+ } else if (dict_index_is_unique(index)) {
+ /* Only the fields defined so far are needed to identify
+ the index entry uniquely */
+
+ new_index->n_uniq = new_index->n_def;
+ } else {
+ /* Also the row id is needed to identify the entry */
+ new_index->n_uniq = 1 + new_index->n_def;
+ }
+
+ new_index->trx_id_offset = 0;
+
+ if (!dict_index_is_ibuf(index)) {
+ /* Add system columns, trx id first */
+
+ trx_id_pos = new_index->n_def;
+
+#if DATA_ROW_ID != 0
+# error "DATA_ROW_ID != 0"
+#endif
+#if DATA_TRX_ID != 1
+# error "DATA_TRX_ID != 1"
+#endif
+#if DATA_ROLL_PTR != 2
+# error "DATA_ROLL_PTR != 2"
+#endif
+
+ if (!dict_index_is_unique(index)) {
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(
+ table, DATA_ROW_ID),
+ 0);
+ trx_id_pos++;
+ }
+
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(table, DATA_TRX_ID),
+ 0);
+
+ dict_index_add_col(new_index, table,
+ dict_table_get_sys_col(table,
+ DATA_ROLL_PTR),
+ 0);
+
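+ /* Precompute the byte offset of DB_TRX_ID within a clustered
+ index record. This is only possible if every field before it
+ has a fixed size and no prefix; otherwise trx_id_offset stays 0
+ and the offset cannot be precomputed. */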
+ for (i = 0; i < trx_id_pos; i++) {
+
+ fixed_size = dict_col_get_fixed_size(
+ dict_index_get_nth_col(new_index, i),
+ dict_table_is_comp(table));
+
+ if (fixed_size == 0) {
+ new_index->trx_id_offset = 0;
+
+ break;
+ }
+
+ if (dict_index_get_nth_field(new_index, i)->prefix_len
+ > 0) {
+ new_index->trx_id_offset = 0;
+
+ break;
+ }
+
+ new_index->trx_id_offset += (unsigned int) fixed_size;
+ }
+
+ }
+
+ /* Remember the table columns already contained in new_index */
+ indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+
+ /* Mark the table columns already contained in new_index */
+ for (i = 0; i < new_index->n_def; i++) {
+
+ field = dict_index_get_nth_field(new_index, i);
+
+ /* If there is only a prefix of the column in the index
+ field, do not mark the column as contained in the index */
+
+ if (field->prefix_len == 0) {
+
+ indexed[field->col->ind] = TRUE;
+ }
+ }
+
+ /* Add to new_index non-system columns of table not yet included
+ there */
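+ /* The last DATA_N_SYS_COLS entries of table->cols are the system
+ columns (DATA_ROW_ID, DATA_TRX_ID, DATA_ROLL_PTR), which were
+ already added above as needed. */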
+ for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+
+ dict_col_t* col = dict_table_get_nth_col(table, i);
+ ut_ad(col->mtype != DATA_SYS);
+
+ if (!indexed[col->ind]) {
+ dict_index_add_col(new_index, table, col, 0);
+ }
+ }
+
+ mem_free(indexed);
+
+ ut_ad(dict_index_is_ibuf(index)
+ || (UT_LIST_GET_LEN(table->indexes) == 0));
+
+ new_index->cached = TRUE;
+
+ return(new_index);
+}
+
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+ const dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: user representation of
+ a non-clustered index */
+{
+ dict_field_t* field;
+ dict_index_t* new_index;
+ dict_index_t* clust_index;
+ ulint i;
+ ibool* indexed;
+
+ ut_ad(table && index);
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ /* The clustered index should be the first in the list of indexes */
+ clust_index = UT_LIST_GET_FIRST(table->indexes);
+
+ ut_ad(clust_index);
+ ut_ad(dict_index_is_clust(clust_index));
+ ut_ad(!(clust_index->type & DICT_UNIVERSAL));
+
+ /* Create a new index */
+ new_index = dict_mem_index_create(
+ table->name, index->name, index->space, index->type,
+ index->n_fields + 1 + clust_index->n_uniq);
+
+ /* Copy other relevant data from the old index
+ struct to the new struct: it inherits the values */
+
+ new_index->n_user_defined_cols = index->n_fields;
+
+ new_index->id = index->id;
+
+ /* Copy fields from index to new_index */
+ dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+ /* Remember the table columns already contained in new_index */
+ indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+
+ /* Mark the table columns already contained in new_index */
+ for (i = 0; i < new_index->n_def; i++) {
+
+ field = dict_index_get_nth_field(new_index, i);
+
+ /* If there is only a prefix of the column in the index
+ field, do not mark the column as contained in the index */
+
+ if (field->prefix_len == 0) {
+
+ indexed[field->col->ind] = TRUE;
+ }
+ }
+
+ /* Add to new_index the columns necessary to determine the clustered
+ index entry uniquely */
+
+ for (i = 0; i < clust_index->n_uniq; i++) {
+
+ field = dict_index_get_nth_field(clust_index, i);
+
+ if (!indexed[field->col->ind]) {
+ dict_index_add_col(new_index, table, field->col,
+ field->prefix_len);
+ }
+ }
+
+ mem_free(indexed);
+
+ if (dict_index_is_unique(index)) {
+ new_index->n_uniq = index->n_fields;
+ } else {
+ new_index->n_uniq = new_index->n_def;
+ }
+
+ /* Set the n_fields value in new_index to the actual defined
+ number of fields */
+
+ new_index->n_fields = new_index->n_def;
+
+ new_index->cached = TRUE;
+
+ return(new_index);
+}
+
+/*====================== FOREIGN KEY PROCESSING ========================*/
+
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table) /*!< in: InnoDB table */
+{
+ return(UT_LIST_GET_LEN(table->referenced_list) > 0);
+}
+
+/*********************************************************************//**
+Checks if the index is referenced by a foreign key; if so, returns the
+foreign key struct, else returns NULL.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index) /*!< in: InnoDB index */
+{
+ dict_foreign_t* foreign;
+
+ ut_ad(index != NULL);
+ ut_ad(table != NULL);
+
+ for (foreign = UT_LIST_GET_FIRST(table->referenced_list);
+ foreign;
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
+
+ if (foreign->referenced_index == index) {
+
+ return(foreign);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if an index is defined for a foreign key constraint. An index is
+part of a foreign key constraint if it is referenced by a foreign key
+or is a foreign key index.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index) /*!< in: InnoDB index */
+{
+ dict_foreign_t* foreign;
+
+ ut_ad(index != NULL);
+ ut_ad(table != NULL);
+
+ for (foreign = UT_LIST_GET_FIRST(table->foreign_list);
+ foreign;
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+
+ if (foreign->foreign_index == index
+ || foreign->referenced_index == index) {
+
+ return(foreign);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Frees a foreign key struct. */
+static
+void
+dict_foreign_free(
+/*==============*/
+ dict_foreign_t* foreign) /*!< in, own: foreign key struct */
+{
+ mem_heap_free(foreign->heap);
+}
+
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+static
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+ dict_foreign_t* foreign) /*!< in, own: foreign constraint */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_a(foreign);
+
+ if (foreign->referenced_table) {
+ UT_LIST_REMOVE(referenced_list,
+ foreign->referenced_table->referenced_list,
+ foreign);
+ }
+
+ if (foreign->foreign_table) {
+ UT_LIST_REMOVE(foreign_list,
+ foreign->foreign_table->foreign_list,
+ foreign);
+ }
+
+ dict_foreign_free(foreign);
+}
+
+/**********************************************************************//**
+Looks for the foreign constraint from the foreign and referenced lists
+of a table.
+@return foreign constraint */
+static
+dict_foreign_t*
+dict_foreign_find(
+/*==============*/
+ dict_table_t* table, /*!< in: table object */
+ const char* id) /*!< in: foreign constraint id */
+{
+ dict_foreign_t* foreign;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign) {
+ if (ut_strcmp(id, foreign->id) == 0) {
+
+ return(foreign);
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ if (ut_strcmp(id, foreign->id) == 0) {
+
+ return(foreign);
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Tries to find an index whose first fields are the columns in the array
+in the same order, which is not marked for deletion and is not the same
+as types_idx.
+@return matching index, NULL if not found */
+static
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols, /*!< in: number of columns */
+ dict_index_t* types_idx, /*!< in: NULL or an index to whose types the
+ column types must match */
+ ibool check_charsets,
+ /*!< in: whether to check charsets.
+ only has an effect if types_idx != NULL */
+ ulint check_null)
+ /*!< in: nonzero if none of the columns may
+ be declared NOT NULL */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ /* Ignore matches that refer to the same instance
+ or to an index that is to be dropped */
+ if (index->to_be_dropped || types_idx == index) {
+
+ goto next_rec;
+
+ } else if (dict_index_get_n_fields(index) >= n_cols) {
+ ulint i;
+
+ for (i = 0; i < n_cols; i++) {
+ dict_field_t* field;
+ const char* col_name;
+
+ field = dict_index_get_nth_field(index, i);
+
+ col_name = dict_table_get_col_name(
+ table, dict_col_get_no(field->col));
+
+ if (field->prefix_len != 0) {
+ /* We do not accept column prefix
+ indexes here */
+
+ break;
+ }
+
+ if (0 != innobase_strcasecmp(columns[i],
+ col_name)) {
+ break;
+ }
+
+ if (check_null
+ && (field->col->prtype & DATA_NOT_NULL)) {
+
+ return(NULL);
+ }
+
+ if (types_idx && !cmp_cols_are_equal(
+ dict_index_get_nth_col(index, i),
+ dict_index_get_nth_col(types_idx,
+ i),
+ check_charsets)) {
+
+ break;
+ }
+ }
+
+ if (i == n_cols) {
+ /* We found a matching index */
+
+ return(index);
+ }
+ }
+
+next_rec:
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Find an index that is equivalent to the one passed in and is not marked
+for deletion.
+@return index equivalent to foreign->foreign_index, or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_equiv_index(
+/*==========================*/
+ dict_foreign_t* foreign)/*!< in: foreign key */
+{
+ ut_a(foreign != NULL);
+
+ /* Try to find an index which contains the columns as the
+ first fields and in the right order, and whose column types
+ are the same as in foreign->foreign_index */
+
+ return(dict_foreign_find_index(
+ foreign->foreign_table,
+ foreign->foreign_col_names, foreign->n_fields,
+ foreign->foreign_index, TRUE, /* check types */
+ FALSE/* allow columns to be NULL */));
+}
+
+/**********************************************************************//**
+Returns an index object by matching on the name and column names; if
+more than one index matches, returns the index with the maximum id.
+@return matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_by_max_id(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name, /*!< in: the index name to find */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols) /*!< in: number of columns */
+{
+ dict_index_t* index;
+ dict_index_t* found;
+
+ found = NULL;
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (ut_strcmp(index->name, name) == 0
+ && dict_index_get_n_ordering_defined_by_user(index)
+ == n_cols) {
+
+ ulint i;
+
+ for (i = 0; i < n_cols; i++) {
+ dict_field_t* field;
+ const char* col_name;
+
+ field = dict_index_get_nth_field(index, i);
+
+ col_name = dict_table_get_col_name(
+ table, dict_col_get_no(field->col));
+
+ if (0 != innobase_strcasecmp(
+ columns[i], col_name)) {
+
+ break;
+ }
+ }
+
+ if (i == n_cols) {
+ /* We found a matching index; select
+ the one with the highest id */
+
+ if (!found
+ || ut_dulint_cmp(index->id, found->id) > 0) {
+
+ found = index;
+ }
+ }
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(found);
+}
+
+/**********************************************************************//**
+Report an error in a foreign key definition. */
+static
+void
+dict_foreign_error_report_low(
+/*==========================*/
+ FILE* file, /*!< in: output stream */
+ const char* name) /*!< in: table name */
+{
+ rewind(file);
+ ut_print_timestamp(file);
+ fprintf(file, " Error in foreign key constraint of table %s:\n",
+ name);
+}
+
+/**********************************************************************//**
+Report an error in a foreign key definition. */
+static
+void
+dict_foreign_error_report(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ dict_foreign_t* fk, /*!< in: foreign key constraint */
+ const char* msg) /*!< in: the error message */
+{
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(file, fk->foreign_table_name);
+ fputs(msg, file);
+ fputs(" Constraint:\n", file);
+ dict_print_info_on_foreign_key_in_create_format(file, NULL, fk, TRUE);
+ putc('\n', file);
+ if (fk->foreign_index) {
+ fputs("The index in the foreign key in table is ", file);
+ ut_print_name(file, NULL, FALSE, fk->foreign_index->name);
+ fputs("\n"
+ "See " REFMAN "innodb-foreign-key-constraints.html\n"
+ "for correct foreign key definition.\n",
+ file);
+ }
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in the cache.
+At least one of the foreign table and the referenced table must already
+be in the dictionary cache!
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign, /*!< in, own: foreign key constraint */
+ ibool check_charsets) /*!< in: TRUE=check charset
+ compatibility */
+{
+ dict_table_t* for_table;
+ dict_table_t* ref_table;
+ dict_foreign_t* for_in_cache = NULL;
+ dict_index_t* index;
+ ibool added_to_referenced_list = FALSE;
+ FILE* ef = dict_foreign_err_file;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ for_table = dict_table_check_if_in_cache_low(
+ foreign->foreign_table_name);
+
+ ref_table = dict_table_check_if_in_cache_low(
+ foreign->referenced_table_name);
+ ut_a(for_table || ref_table);
+
+ if (for_table) {
+ for_in_cache = dict_foreign_find(for_table, foreign->id);
+ }
+
+ if (!for_in_cache && ref_table) {
+ for_in_cache = dict_foreign_find(ref_table, foreign->id);
+ }
+
+ if (for_in_cache) {
+ /* Free the foreign object */
+ mem_heap_free(foreign->heap);
+ } else {
+ for_in_cache = foreign;
+ }
+
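+ /* Resolve the referenced side and the foreign side separately:
+ either table may still be missing from the cache, in which case
+ that side of the constraint is left unresolved here and can be
+ completed when the missing table is loaded. */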
+ if (for_in_cache->referenced_table == NULL && ref_table) {
+ index = dict_foreign_find_index(
+ ref_table,
+ for_in_cache->referenced_col_names,
+ for_in_cache->n_fields, for_in_cache->foreign_index,
+ check_charsets, FALSE);
+
+ if (index == NULL) {
+ dict_foreign_error_report(
+ ef, for_in_cache,
+ "there is no index in referenced table"
+ " which would contain\n"
+ "the columns as the first columns,"
+ " or the data types in the\n"
+ "referenced table do not match"
+ " the ones in table.");
+
+ if (for_in_cache == foreign) {
+ mem_heap_free(foreign->heap);
+ }
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for_in_cache->referenced_table = ref_table;
+ for_in_cache->referenced_index = index;
+ UT_LIST_ADD_LAST(referenced_list,
+ ref_table->referenced_list,
+ for_in_cache);
+ added_to_referenced_list = TRUE;
+ }
+
+ if (for_in_cache->foreign_table == NULL && for_table) {
+ index = dict_foreign_find_index(
+ for_table,
+ for_in_cache->foreign_col_names,
+ for_in_cache->n_fields,
+ for_in_cache->referenced_index, check_charsets,
+ for_in_cache->type
+ & (DICT_FOREIGN_ON_DELETE_SET_NULL
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+
+ if (index == NULL) {
+ dict_foreign_error_report(
+ ef, for_in_cache,
+ "there is no index in the table"
+ " which would contain\n"
+ "the columns as the first columns,"
+ " or the data types in the\n"
+ "table do not match"
+ " the ones in the referenced table\n"
+ "or one of the ON ... SET NULL columns"
+ " is declared NOT NULL.");
+
+ if (for_in_cache == foreign) {
+ if (added_to_referenced_list) {
+ UT_LIST_REMOVE(
+ referenced_list,
+ ref_table->referenced_list,
+ for_in_cache);
+ }
+
+ mem_heap_free(foreign->heap);
+ }
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for_in_cache->foreign_table = for_table;
+ for_in_cache->foreign_index = index;
+ UT_LIST_ADD_LAST(foreign_list,
+ for_table->foreign_list,
+ for_in_cache);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Scans from the pointer onwards. Stops when it is at the start of a copy
+of 'string'; characters are compared case-insensitively, and matches
+inside `` or "" quotes are ignored. Also stops at NUL.
+@return scanned up to this */
+static
+const char*
+dict_scan_to(
+/*=========*/
+ const char* ptr, /*!< in: scan from */
+ const char* string) /*!< in: look for this */
+{
+ char quote = '\0';
+
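+ /* For example, scanning "`FOREIGN` FOREIGN KEY" for "FOREIGN"
+ stops at the second FOREIGN, because the first occurrence is
+ inside backquotes. */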
+ for (; *ptr; ptr++) {
+ if (*ptr == quote) {
+ /* Closing quote character: do not look for
+ starting quote or the keyword. */
+ quote = '\0';
+ } else if (quote) {
+ /* Within quotes: do nothing. */
+ } else if (*ptr == '`' || *ptr == '"') {
+ /* Starting quote: remember the quote character. */
+ quote = *ptr;
+ } else {
+ /* Outside quotes: look for the keyword. */
+ ulint i;
+ for (i = 0; string[i]; i++) {
+ if (toupper((int)(unsigned char)(ptr[i]))
+ != toupper((int)(unsigned char)
+ (string[i]))) {
+ goto nomatch;
+ }
+ }
+ break;
+nomatch:
+ ;
+ }
+ }
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Accepts a specified string. Comparisons are case-insensitive.
+@return if the string was accepted, the pointer moved past it; else
+ptr is returned unchanged
+static
+const char*
+dict_accept(
+/*========*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scan from this */
+ const char* string, /*!< in: accept only this string as the next
+ non-whitespace string */
+ ibool* success)/*!< out: TRUE if accepted */
+{
+ const char* old_ptr = ptr;
+ const char* old_ptr2;
+
+ *success = FALSE;
+
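+ /* Skip leading whitespace; the very next token must then match
+ "string" (case-insensitively, via dict_scan_to()). On failure
+ the original ptr, including the leading whitespace, is
+ returned. */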
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ old_ptr2 = ptr;
+
+ ptr = dict_scan_to(ptr, string);
+
+ if (*ptr == '\0' || old_ptr2 != ptr) {
+ return(old_ptr);
+ }
+
+ *success = TRUE;
+
+ return(ptr + ut_strlen(string));
+}
+
+/*********************************************************************//**
+Scans an id. For the lexical definition of an 'id', see the code below.
+Strips backquotes or double quotes from around the id.
+@return scanned to */
+static
+const char*
+dict_scan_id(
+/*=========*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ mem_heap_t* heap, /*!< in: heap where to allocate the id
+ (NULL=id will not be allocated, but it
+ will point to string near ptr) */
+ const char** id, /*!< out,own: the id; NULL if no id was
+ scannable */
+ ibool table_id,/*!< in: TRUE=convert the allocated id
+ as a table name; FALSE=convert to UTF-8 */
+ ibool accept_also_dot)
+ /*!< in: TRUE if also a dot can appear in a
+ non-quoted id; in a quoted id it can appear
+ always */
+{
+ char quote = '\0';
+ ulint len = 0;
+ const char* s;
+ char* str;
+ char* dst;
+
+ *id = NULL;
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ if (*ptr == '\0') {
+
+ return(ptr);
+ }
+
+ if (*ptr == '`' || *ptr == '"') {
+ quote = *ptr++;
+ }
+
+ s = ptr;
+
+ if (quote) {
+ for (;;) {
+ if (!*ptr) {
+ /* Syntax error */
+ return(ptr);
+ }
+ if (*ptr == quote) {
+ ptr++;
+ if (*ptr != quote) {
+ break;
+ }
+ }
+ ptr++;
+ len++;
+ }
+ } else {
+ while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')'
+ && (accept_also_dot || *ptr != '.')
+ && *ptr != ',' && *ptr != '\0') {
+
+ ptr++;
+ }
+
+ len = ptr - s;
+ }
+
+ if (UNIV_UNLIKELY(!heap)) {
+ /* no heap given: id will point to source string */
+ *id = s;
+ return(ptr);
+ }
+
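+ /* In a quoted id, a doubled quote character stands for a single
+ literal quote: for example `a``b` is scanned as a`b. */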
+ if (quote) {
+ char* d;
+ str = d = mem_heap_alloc(heap, len + 1);
+ while (len--) {
+ if ((*d++ = *s++) == quote) {
+ s++;
+ }
+ }
+ *d++ = 0;
+ len = d - str;
+ ut_ad(*s == quote);
+ ut_ad(s + 1 == ptr);
+ } else {
+ str = mem_heap_strdupl(heap, s, len);
+ }
+
+ if (!table_id) {
+convert_id:
+ /* Convert the identifier from connection character set
+ to UTF-8. */
+ len = 3 * len + 1;
+ *id = dst = mem_heap_alloc(heap, len);
+
+ innobase_convert_from_id(cs, dst, str, len);
+ } else if (!strncmp(str, srv_mysql50_table_name_prefix,
+ sizeof srv_mysql50_table_name_prefix)) {
+ /* This is a pre-5.1 table name
+ containing chars other than [A-Za-z0-9].
+ Discard the prefix and use raw UTF-8 encoding. */
+ str += sizeof srv_mysql50_table_name_prefix;
+ len -= sizeof srv_mysql50_table_name_prefix;
+ goto convert_id;
+ } else {
+ /* Encode using filename-safe characters. */
+ len = 5 * len + 1;
+ *id = dst = mem_heap_alloc(heap, len);
+
+ innobase_convert_from_table_id(cs, dst, str, len);
+ }
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Tries to scan a column name.
+@return scanned to */
+static
+const char*
+dict_scan_col(
+/*==========*/
+ struct charset_info_st* cs, /*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ ibool* success,/*!< out: TRUE if success */
+ dict_table_t* table, /*!< in: table in which the column is */
+ const dict_col_t** column, /*!< out: pointer to column if success */
+ mem_heap_t* heap, /*!< in: heap where to allocate */
+ const char** name) /*!< out,own: the column name;
+ NULL if no name was scannable */
+{
+ ulint i;
+
+ *success = FALSE;
+
+ ptr = dict_scan_id(cs, ptr, heap, name, FALSE, TRUE);
+
+ if (*name == NULL) {
+
+ return(ptr); /* Syntax error */
+ }
+
+ if (table == NULL) {
+ *success = TRUE;
+ *column = NULL;
+ } else {
+ for (i = 0; i < dict_table_get_n_cols(table); i++) {
+
+ const char* col_name = dict_table_get_col_name(
+ table, i);
+
+ if (0 == innobase_strcasecmp(col_name, *name)) {
+ /* Found */
+
+ *success = TRUE;
+ *column = dict_table_get_nth_col(table, i);
+ strcpy((char*) *name, col_name);
+
+ break;
+ }
+ }
+ }
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Scans a table name from an SQL string.
+@return scanned to */
+static
+const char*
+dict_scan_table_name(
+/*=================*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ dict_table_t** table, /*!< out: table object or NULL */
+ const char* name, /*!< in: foreign key table name */
+ ibool* success,/*!< out: TRUE if ok name found */
+ mem_heap_t* heap, /*!< in: heap where to allocate the id */
+ const char** ref_name)/*!< out,own: the table name;
+ NULL if no name was scannable */
+{
+ const char* database_name = NULL;
+ ulint database_name_len = 0;
+ const char* table_name = NULL;
+ ulint table_name_len;
+ const char* scan_name;
+ char* ref;
+
+ *success = FALSE;
+ *table = NULL;
+
+ ptr = dict_scan_id(cs, ptr, heap, &scan_name, TRUE, FALSE);
+
+ if (scan_name == NULL) {
+
+ return(ptr); /* Syntax error */
+ }
+
+ if (*ptr == '.') {
+ /* We scanned the database name; scan also the table name */
+
+ ptr++;
+
+ database_name = scan_name;
+ database_name_len = strlen(database_name);
+
+ ptr = dict_scan_id(cs, ptr, heap, &table_name, TRUE, FALSE);
+
+ if (table_name == NULL) {
+
+ return(ptr); /* Syntax error */
+ }
+ } else {
+ /* To be able to read table dumps made with InnoDB-4.0.17 or
+ earlier, we must allow the dot separator between the database
+ name and the table name also to appear within a quoted
+ identifier! InnoDB used to print a constraint as:
+ ... REFERENCES `databasename.tablename` ...
+ starting from 4.0.18 it is
+ ... REFERENCES `databasename`.`tablename` ... */
+ const char* s;
+
+ for (s = scan_name; *s; s++) {
+ if (*s == '.') {
+ database_name = scan_name;
+ database_name_len = s - scan_name;
+ scan_name = ++s;
+ break;/* to do: multiple dots? */
+ }
+ }
+
+ table_name = scan_name;
+ }
+
+ if (database_name == NULL) {
+ /* Use the database name of the foreign key table */
+
+ database_name = name;
+ database_name_len = dict_get_db_name_len(name);
+ }
+
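+ /* Build the table reference in the internal
+ "databasename/tablename" form that dict_table_get_low()
+ expects. */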
+ table_name_len = strlen(table_name);
+
+ /* Copy database_name, '/', table_name, '\0' */
+ ref = mem_heap_alloc(heap, database_name_len + table_name_len + 2);
+ memcpy(ref, database_name, database_name_len);
+ ref[database_name_len] = '/';
+ memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+#ifndef __WIN__
+ if (srv_lower_case_table_names) {
+#endif /* !__WIN__ */
+ /* The table name is always put to lower case on Windows. */
+ innobase_casedn_str(ref);
+#ifndef __WIN__
+ }
+#endif /* !__WIN__ */
+
+ *success = TRUE;
+ *ref_name = ref;
+ *table = dict_table_get_low(ref);
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Skips one id. The id is allowed to also contain '.'.
+@return scanned to */
+static
+const char*
+dict_skip_word(
+/*===========*/
+ struct charset_info_st* cs,/*!< in: the character set of ptr */
+ const char* ptr, /*!< in: scanned to */
+ ibool* success)/*!< out: TRUE if success, FALSE if just spaces
+ left in string or a syntax error */
+{
+ const char* start;
+
+ *success = FALSE;
+
+ ptr = dict_scan_id(cs, ptr, NULL, &start, FALSE, TRUE);
+
+ if (start) {
+ *success = TRUE;
+ }
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Removes MySQL comments from an SQL string. A comment is either
+(a) '#' to the end of the line,
+(b) '--[space]' to the end of the line, or
+(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like the familiar
+C comment syntax).
+@return own: SQL string stripped of comments; the caller must free
+this with mem_free()! */
+static
+char*
+dict_strip_comments(
+/*================*/
+ const char* sql_string, /*!< in: SQL string */
+ size_t sql_length) /*!< in: length of sql_string */
+{
+ char* str;
+ const char* sptr;
+ const char* eptr = sql_string + sql_length;
+ char* ptr;
+ /* unclosed quote character (0 if none) */
+ char quote = 0;
+
+ str = mem_alloc(sql_length + 1);
+
+ sptr = sql_string;
+ ptr = str;
+
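+ /* Copy the statement byte by byte, skipping comment text.
+ Quoted strings and identifiers ('...', "...", `...`) are copied
+ verbatim, so comment markers inside them are preserved. */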
+ for (;;) {
+scan_more:
+ if (sptr >= eptr || *sptr == '\0') {
+end_of_string:
+ *ptr = '\0';
+
+ ut_a(ptr <= str + sql_length);
+
+ return(str);
+ }
+
+ if (*sptr == quote) {
+ /* Closing quote character: do not look for
+ starting quote or comments. */
+ quote = 0;
+ } else if (quote) {
+ /* Within quotes: do not look for
+ starting quotes or comments. */
+ } else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') {
+ /* Starting quote: remember the quote character. */
+ quote = *sptr;
+ } else if (*sptr == '#'
+ || (sptr[0] == '-' && sptr[1] == '-'
+ && sptr[2] == ' ')) {
+ for (;;) {
+ if (++sptr >= eptr) {
+ goto end_of_string;
+ }
+
+ /* In Unix a newline is 0x0A while in Windows
+ it is 0x0D followed by 0x0A */
+
+ switch (*sptr) {
+ case (char) 0x0A:
+ case (char) 0x0D:
+ case '\0':
+ goto scan_more;
+ }
+ }
+ } else if (!quote && *sptr == '/' && *(sptr + 1) == '*') {
+ sptr += 2;
+ for (;;) {
+ if (sptr >= eptr) {
+ goto end_of_string;
+ }
+
+ switch (*sptr) {
+ case '\0':
+ goto scan_more;
+ case '*':
+ if (sptr[1] == '/') {
+ sptr += 2;
+ goto scan_more;
+ }
+ }
+
+ sptr++;
+ }
+ }
+
+ *ptr = *sptr;
+
+ ptr++;
+ sptr++;
+ }
+}
+
+/*********************************************************************//**
+Finds the highest [number] for foreign key constraints of the table. Looks
+only at the >= 4.0.18-format id's, which are of the form
+databasename/tablename_ibfk_[number].
+@return highest number, 0 if table has no new format foreign key constraints */
+static
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+ dict_table_t* table) /*!< in: table in the dictionary memory cache */
+{
+ dict_foreign_t* foreign;
+ char* endp;
+ ulint biggest_id = 0;
+ ulint id;
+ ulint len;
+
+ ut_a(table);
+
+ len = ut_strlen(table->name);
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
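+ /* For example, for table "test/tbl" with constraints
+ "test/tbl_ibfk_3" and "test/tbl_ibfk_12" this returns 12;
+ ids in any other format are ignored. */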
+ while (foreign) {
+ if (ut_strlen(foreign->id) > ((sizeof dict_ibfk) - 1) + len
+ && 0 == ut_memcmp(foreign->id, table->name, len)
+ && 0 == ut_memcmp(foreign->id + len,
+ dict_ibfk, (sizeof dict_ibfk) - 1)
+ && foreign->id[len + ((sizeof dict_ibfk) - 1)] != '0') {
+ /* It is of the >= 4.0.18 format */
+
+ id = strtoul(foreign->id + len
+ + ((sizeof dict_ibfk) - 1),
+ &endp, 10);
+ if (*endp == '\0') {
+ ut_a(id != biggest_id);
+
+ if (id > biggest_id) {
+ biggest_id = id;
+ }
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ return(biggest_id);
+}
+
+/*********************************************************************//**
+Reports a simple foreign key create clause syntax error. */
+static
+void
+dict_foreign_report_syntax_err(
+/*===========================*/
+ const char* name, /*!< in: table name */
+ const char* start_of_latest_foreign,
+ /*!< in: start of the foreign key clause
+ in the SQL string */
+ const char* ptr) /*!< in: place of the syntax error */
+{
+ FILE* ef = dict_foreign_err_file;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nSyntax error close to:\n%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary the foreign
+key constraints declared in the string. This function should be called after
+the indexes for a table have been created. Each foreign key constraint must
+be accompanied with indexes in both participating tables. The indexes are
+allowed to contain more fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+static
+ulint
+dict_create_foreign_constraints_low(
+/*================================*/
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap */
+ struct charset_info_st* cs,/*!< in: the character set of sql_string */
+ const char* sql_string,
+ /*!< in: CREATE TABLE or ALTER TABLE statement
+ where foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the database
+ name before it: test.table2; the default
+ database is the database of parameter name */
+ const char* name, /*!< in: table full name in the normalized form
+ database_name/table_name */
+ ibool reject_fks)
+ /*!< in: if TRUE, fail with error code
+ DB_CANNOT_ADD_CONSTRAINT if any foreign
+ keys are found. */
+{
+ dict_table_t* table;
+ dict_table_t* referenced_table;
+ dict_table_t* table_to_alter;
+ ulint highest_id_so_far = 0;
+ dict_index_t* index;
+ dict_foreign_t* foreign;
+ const char* ptr = sql_string;
+ const char* start_of_latest_foreign = sql_string;
+ FILE* ef = dict_foreign_err_file;
+ const char* constraint_name;
+ ibool success;
+ ulint error;
+ const char* ptr1;
+ const char* ptr2;
+ ulint i;
+ ulint j;
+ ibool is_on_delete;
+ ulint n_on_deletes;
+ ulint n_on_updates;
+ const dict_col_t* columns[500];
+ const char* column_names[500];
+ const char* referenced_table_name;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_get_low(name);
+
+ if (table == NULL) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef,
+ "Cannot find the table in the internal"
+ " data dictionary of InnoDB.\n"
+ "Create table statement:\n%s\n", sql_string);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_ERROR);
+ }
+
+ /* First check if we are actually doing an ALTER TABLE, and in that
+ case look for the table being altered */
+
+ ptr = dict_accept(cs, ptr, "ALTER", &success);
+
+ if (!success) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "TABLE", &success);
+
+ if (!success) {
+
+ goto loop;
+ }
+
+ /* We are doing an ALTER TABLE: scan the table name we are altering */
+
+ ptr = dict_scan_table_name(cs, ptr, &table_to_alter, name,
+ &success, heap, &referenced_table_name);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: could not find"
+ " the table being ALTERED in:\n%s\n",
+ sql_string);
+
+ return(DB_ERROR);
+ }
+
+ /* Starting from 4.0.18 and 4.1.2, we generate foreign key id's in the
+ format databasename/tablename_ibfk_[number], where [number] is local
+ to the table; look for the highest [number] for table_to_alter, so
+ that we can assign to new constraints higher numbers. */
+
+ /* If we are altering a temporary table, the table name after ALTER
+ TABLE does not correspond to the internal table name, and
+ table_to_alter is NULL. TODO: should we fix this somehow? */
+
+ if (table_to_alter == NULL) {
+ highest_id_so_far = 0;
+ } else {
+ highest_id_so_far = dict_table_get_highest_foreign_id(
+ table_to_alter);
+ }
+
+ /* Scan for foreign key declarations in a loop */
+loop:
+ /* Scan either to "CONSTRAINT" or "FOREIGN", whichever is closer */
+
+ ptr1 = dict_scan_to(ptr, "CONSTRAINT");
+ ptr2 = dict_scan_to(ptr, "FOREIGN");
+
+ constraint_name = NULL;
+
+ if (ptr1 < ptr2) {
+ /* The user may have specified a constraint name. Pick it so
+ that we can store 'databasename/constraintname' as the id of
+ the constraint in the system tables. */
+ ptr = ptr1;
+
+ ptr = dict_accept(cs, ptr, "CONSTRAINT", &success);
+
+ ut_a(success);
+
+ if (!my_isspace(cs, *ptr) && *ptr != '"' && *ptr != '`') {
+ goto loop;
+ }
+
+ while (my_isspace(cs, *ptr)) {
+ ptr++;
+ }
+
+ /* Read the constraint name, unless we got "CONSTRAINT FOREIGN" */
+ if (ptr != ptr2) {
+ ptr = dict_scan_id(cs, ptr, heap,
+ &constraint_name, FALSE, FALSE);
+ }
+ } else {
+ ptr = ptr2;
+ }
+
+ if (*ptr == '\0') {
+ /* The proper way to reject foreign keys for temporary
+ tables would be to split the lexing and syntactical
+ analysis of foreign key clauses from the actual adding
+ of them, so that ha_innodb.cc could first parse the SQL
+ command, determine if there are any foreign keys, and
+ if so, immediately reject the command if the table is a
+ temporary one. For now, this kludge will work. */
+ if (reject_fks && (UT_LIST_GET_LEN(table->foreign_list) > 0)) {
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /**********************************************************/
+ /* The following call adds the foreign key constraints
+ to the data dictionary system tables on disk */
+
+ error = dict_create_add_foreigns_to_dictionary(
+ highest_id_so_far, table, trx);
+ return(error);
+ }
+
+ start_of_latest_foreign = ptr;
+
+ ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+ if (!success) {
+ goto loop;
+ }
+
+ if (!my_isspace(cs, *ptr)) {
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "KEY", &success);
+
+ if (!success) {
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "(", &success);
+
+ if (!success) {
+ /* MySQL also allows an index id before the '('; we
+ skip it */
+ ptr = dict_skip_word(cs, ptr, &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, "(", &success);
+
+ if (!success) {
+ /* We do not flag a syntax error here because in an
+ ALTER TABLE we may also have DROP FOREIGN KEY abc */
+
+ goto loop;
+ }
+ }
+
+ i = 0;
+
+ /* Scan the columns in the first list */
+col_loop1:
+ ut_a(i < (sizeof column_names) / sizeof *column_names);
+ ptr = dict_scan_col(cs, ptr, &success, table, columns + i,
+ heap, column_names + i);
+ if (!success) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nCannot resolve column name close to:\n%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ i++;
+
+ ptr = dict_accept(cs, ptr, ",", &success);
+
+ if (success) {
+ goto col_loop1;
+ }
+
+ ptr = dict_accept(cs, ptr, ")", &success);
+
+ if (!success) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Try to find an index which contains the columns
+ as the first fields and in the right order */
+
+ index = dict_foreign_find_index(table, column_names, i,
+ NULL, TRUE, FALSE);
+
+ if (!index) {
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fputs("There is no index in table ", ef);
+ ut_print_name(ef, NULL, TRUE, name);
+ fprintf(ef, " where the columns appear\n"
+ "as the first columns. Constraint:\n%s\n"
+ "See " REFMAN "innodb-foreign-key-constraints.html\n"
+ "for correct foreign key definition.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+ ptr = dict_accept(cs, ptr, "REFERENCES", &success);
+
+ if (!success || !my_isspace(cs, *ptr)) {
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Let us create a constraint struct */
+
+ foreign = dict_mem_foreign_create();
+
+ if (constraint_name) {
+ ulint db_len;
+
+ /* Catenate 'databasename/' to the constraint name specified
+ by the user: we conceive the constraint as belonging to the
+ same MySQL 'database' as the table itself. We store the name
+ to foreign->id. */
+
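+ /* For example, CONSTRAINT `fk_parent` on the table
+ "test/child" gets the id "test/fk_parent". */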
+ db_len = dict_get_db_name_len(table->name);
+
+ foreign->id = mem_heap_alloc(
+ foreign->heap, db_len + strlen(constraint_name) + 2);
+
+ ut_memcpy(foreign->id, table->name, db_len);
+ foreign->id[db_len] = '/';
+ strcpy(foreign->id + db_len + 1, constraint_name);
+ }
+
+ foreign->foreign_table = table;
+ foreign->foreign_table_name = mem_heap_strdup(foreign->heap,
+ table->name);
+ foreign->foreign_index = index;
+ foreign->n_fields = (unsigned int) i;
+ foreign->foreign_col_names = mem_heap_alloc(foreign->heap,
+ i * sizeof(void*));
+ for (i = 0; i < foreign->n_fields; i++) {
+ foreign->foreign_col_names[i] = mem_heap_strdup(
+ foreign->heap,
+ dict_table_get_col_name(table,
+ dict_col_get_no(columns[i])));
+ }
+
+ ptr = dict_scan_table_name(cs, ptr, &referenced_table, name,
+ &success, heap, &referenced_table_name);
+
+ /* Note that referenced_table can be NULL if the user has suppressed
+ checking of foreign key constraints! */
+
+ if (!success || (!referenced_table && trx->check_foreigns)) {
+ dict_foreign_free(foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nCannot resolve table name close to:\n"
+ "%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, "(", &success);
+
+ if (!success) {
+ dict_foreign_free(foreign);
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Scan the columns in the second list */
+ i = 0;
+
+col_loop2:
+ ptr = dict_scan_col(cs, ptr, &success, referenced_table, columns + i,
+ heap, column_names + i);
+ i++;
+
+ if (!success) {
+ dict_foreign_free(foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\nCannot resolve column name close to:\n"
+ "%s\n",
+ start_of_latest_foreign, ptr);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, ",", &success);
+
+ if (success) {
+ goto col_loop2;
+ }
+
+ ptr = dict_accept(cs, ptr, ")", &success);
+
+ if (!success || foreign->n_fields != i) {
+ dict_foreign_free(foreign);
+
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ n_on_deletes = 0;
+ n_on_updates = 0;
+
+scan_on_conditions:
+ /* Loop here as long as we can find ON ... conditions */
+
+ ptr = dict_accept(cs, ptr, "ON", &success);
+
+ if (!success) {
+
+ goto try_find_index;
+ }
+
+ ptr = dict_accept(cs, ptr, "DELETE", &success);
+
+ if (!success) {
+ ptr = dict_accept(cs, ptr, "UPDATE", &success);
+
+ if (!success) {
+ dict_foreign_free(foreign);
+
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ is_on_delete = FALSE;
+ n_on_updates++;
+ } else {
+ is_on_delete = TRUE;
+ n_on_deletes++;
+ }
+
+ ptr = dict_accept(cs, ptr, "RESTRICT", &success);
+
+ if (success) {
+ goto scan_on_conditions;
+ }
+
+ ptr = dict_accept(cs, ptr, "CASCADE", &success);
+
+ if (success) {
+ if (is_on_delete) {
+ foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE;
+ } else {
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+ }
+
+ goto scan_on_conditions;
+ }
+
+ ptr = dict_accept(cs, ptr, "NO", &success);
+
+ if (success) {
+ ptr = dict_accept(cs, ptr, "ACTION", &success);
+
+ if (!success) {
+ dict_foreign_free(foreign);
+ dict_foreign_report_syntax_err(
+ name, start_of_latest_foreign, ptr);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ if (is_on_delete) {
+ foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION;
+ } else {
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+ }
+
+ goto scan_on_conditions;
+ }
+
+ ptr = dict_accept(cs, ptr, "SET", &success);
+
+ if (!success) {
+ dict_foreign_free(foreign);
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ ptr = dict_accept(cs, ptr, "NULL", &success);
+
+ if (!success) {
+ dict_foreign_free(foreign);
+ dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+ ptr);
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ for (j = 0; j < foreign->n_fields; j++) {
+ if ((dict_index_get_nth_col(foreign->foreign_index, j)->prtype)
+ & DATA_NOT_NULL) {
+
+ /* It is not sensible to define SET NULL
+ if the column is not allowed to be NULL! */
+
+ dict_foreign_free(foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\n"
+ "You have defined a SET NULL condition"
+ " though some of the\n"
+ "columns are defined as NOT NULL.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+ }
+
+ if (is_on_delete) {
+ foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL;
+ } else {
+ foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+ }
+
+ goto scan_on_conditions;
+
+try_find_index:
+ if (n_on_deletes > 1 || n_on_updates > 1) {
+ /* It is an error to define more than 1 action */
+
+ dict_foreign_free(foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\n"
+ "You have twice an ON DELETE clause"
+ " or twice an ON UPDATE clause.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+
+ /* Try to find an index which contains the columns as the first fields
+ and in the right order, and the types are the same as in
+ foreign->foreign_index */
+
+ if (referenced_table) {
+ index = dict_foreign_find_index(referenced_table,
+ column_names, i,
+ foreign->foreign_index,
+ TRUE, FALSE);
+ if (!index) {
+ dict_foreign_free(foreign);
+ mutex_enter(&dict_foreign_err_mutex);
+ dict_foreign_error_report_low(ef, name);
+ fprintf(ef, "%s:\n"
+ "Cannot find an index in the"
+ " referenced table where the\n"
+ "referenced columns appear as the"
+ " first columns, or column types\n"
+ "in the table and the referenced table"
+ " do not match for constraint.\n"
+ "Note that the internal storage type of"
+ " ENUM and SET changed in\n"
+ "tables created with >= InnoDB-4.1.12,"
+ " and such columns in old tables\n"
+ "cannot be referenced by such columns"
+ " in new tables.\n"
+ "See " REFMAN
+ "innodb-foreign-key-constraints.html\n"
+ "for correct foreign key definition.\n",
+ start_of_latest_foreign);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ return(DB_CANNOT_ADD_CONSTRAINT);
+ }
+ } else {
+ ut_a(trx->check_foreigns == FALSE);
+ index = NULL;
+ }
+
+ foreign->referenced_index = index;
+ foreign->referenced_table = referenced_table;
+
+ foreign->referenced_table_name
+ = mem_heap_strdup(foreign->heap, referenced_table_name);
+
+ foreign->referenced_col_names = mem_heap_alloc(foreign->heap,
+ i * sizeof(void*));
+ for (i = 0; i < foreign->n_fields; i++) {
+ foreign->referenced_col_names[i]
+ = mem_heap_strdup(foreign->heap, column_names[i]);
+ }
+
+ /* We found an ok constraint definition: add to the lists */
+
+ UT_LIST_ADD_LAST(foreign_list, table->foreign_list, foreign);
+
+ if (referenced_table) {
+ UT_LIST_ADD_LAST(referenced_list,
+ referenced_table->referenced_list,
+ foreign);
+ }
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary the foreign
+key constraints declared in the string. This function should be called after
+the indexes for a table have been created. Each foreign key constraint must
+be accompanied with indexes in both participating tables. The indexes are
+allowed to contain more fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_foreign_constraints(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES
+ table2(c, d), table2 can be written
+ also with the database
+ name before it: test.table2; the
+ default database is the database of
+ parameter name */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+{
+ char* str;
+ ulint err;
+ mem_heap_t* heap;
+
+ ut_a(trx);
+ ut_a(trx->mysql_thd);
+
+ str = dict_strip_comments(sql_string, sql_length);
+ heap = mem_heap_create(10000);
+
+ err = dict_create_foreign_constraints_low(
+ trx, heap, innobase_get_charset(trx->mysql_thd), str, name,
+ reject_fks);
+
+ mem_heap_free(heap);
+ mem_free(str);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+ulint
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop) /*!< out: id's of the
+ constraints to drop */
+{
+ dict_foreign_t* foreign;
+ ibool success;
+ char* str;
+ size_t len;
+ const char* ptr;
+ const char* id;
+ FILE* ef = dict_foreign_err_file;
+ struct charset_info_st* cs;
+
+ ut_a(trx);
+ ut_a(trx->mysql_thd);
+
+ cs = innobase_get_charset(trx->mysql_thd);
+
+ *n = 0;
+
+ *constraints_to_drop = mem_heap_alloc(heap, 1000 * sizeof(char*));
+
+ ptr = innobase_get_stmt(trx->mysql_thd, &len);
+
+ str = dict_strip_comments(ptr, len);
+
+ ptr = str;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
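+
+ /* Scan the ALTER TABLE statement for DROP FOREIGN KEY <id>
+ clauses and collect each constraint id into the output array. */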
+loop:
+ ptr = dict_scan_to(ptr, "DROP");
+
+ if (*ptr == '\0') {
+ mem_free(str);
+
+ return(DB_SUCCESS);
+ }
+
+ ptr = dict_accept(cs, ptr, "DROP", &success);
+
+ if (!my_isspace(cs, *ptr)) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+ if (!success || !my_isspace(cs, *ptr)) {
+
+ goto loop;
+ }
+
+ ptr = dict_accept(cs, ptr, "KEY", &success);
+
+ if (!success) {
+
+ goto syntax_error;
+ }
+
+ ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE);
+
+ if (id == NULL) {
+
+ goto syntax_error;
+ }
+
+ ut_a(*n < 1000);
+ (*constraints_to_drop)[*n] = id;
+ (*n)++;
+
+ /* Look for the given constraint id */
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign != NULL) {
+ if (0 == strcmp(foreign->id, id)
+ || (strchr(foreign->id, '/')
+ && 0 == strcmp(id,
+ dict_remove_db_name(foreign->id)))) {
+ /* Found */
+ break;
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ if (foreign == NULL) {
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Error in dropping of a foreign key constraint"
+ " of table ", ef);
+ ut_print_name(ef, NULL, TRUE, table->name);
+ fputs(",\n"
+ "in SQL command\n", ef);
+ fputs(str, ef);
+ fputs("\nCannot find a constraint with the given id ", ef);
+ ut_print_name(ef, NULL, FALSE, id);
+ fputs(".\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ mem_free(str);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+ }
+
+ goto loop;
+
+syntax_error:
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Syntax error in dropping of a"
+ " foreign key constraint of table ", ef);
+ ut_print_name(ef, NULL, TRUE, table->name);
+ fprintf(ef, ",\n"
+ "close to:\n%s\n in SQL command\n%s\n", ptr, str);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ mem_free(str);
+
+ return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*==================== END OF FOREIGN KEY PROCESSING ====================*/
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys->mutex is already being held.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ dulint index_id) /*!< in: index id */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ return(dict_index_find_on_id_low(index_id));
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ dulint index_id) /*!< in: index id */
+{
+ dict_index_t* index;
+
+ if (dict_sys == NULL) {
+ return(NULL);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+
+ index = dict_index_get_if_in_cache_low(index_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(index);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple) /*!< in: tuple used in a search */
+{
+ ut_a(index);
+ ut_a(dtuple_get_n_fields_cmp(tuple)
+ <= dict_index_get_n_unique_in_tree(index));
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level) /*!< in: level of rec in tree:
+ 0 means leaf level */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+ ulint n_unique;
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ /* In a universal index tree, we take the whole record as
+ the node pointer if the record is on the leaf level,
+ on non-leaf levels we remove the last field, which
+ contains the page number of the child page */
+
+ ut_a(!dict_table_is_comp(index->table));
+ n_unique = rec_get_n_fields_old(rec);
+
+ if (level > 0) {
+ ut_a(n_unique > 1);
+ n_unique--;
+ }
+ } else {
+ n_unique = dict_index_get_n_unique_in_tree(index);
+ }
+
+ tuple = dtuple_create(heap, n_unique + 1);
+
+ /* When searching in the tree for the node pointer, we must not do
+ comparison on the last field, the page number field, as on upper
+ levels in the tree there may be identical node pointers with a
+ different page number; therefore, we set the n_fields_cmp to one
+ less: */
+
+ dtuple_set_n_fields_cmp(tuple, n_unique);
+
+ dict_index_copy_types(tuple, index, n_unique);
+
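+ /* Store the child page number in the extra last field of the
+ node pointer tuple. */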
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, page_no);
+
+ field = dtuple_get_nth_field(tuple, n_unique);
+ dfield_set_data(field, buf, 4);
+
+ dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4);
+
+ rec_copy_prefix_to_dtuple(tuple, rec, index, n_unique, heap);
+ dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple)
+ | REC_STATUS_NODE_PTR);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
+
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely.
+@return pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to
+ copy prefix */
+ ulint* n_fields,/*!< out: number of fields copied */
+ byte** buf, /*!< in/out: memory buffer for the
+ copied prefix, or NULL */
+ ulint* buf_size)/*!< in/out: buffer size */
+{
+ ulint n;
+
+ UNIV_PREFETCH_R(rec);
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ ut_a(!dict_table_is_comp(index->table));
+ n = rec_get_n_fields_old(rec);
+ } else {
+ n = dict_index_get_n_unique_in_tree(index);
+ }
+
+ *n_fields = n;
+ return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size));
+}
+
+/**********************************************************************//**
+Builds a typed data tuple out of a physical record.
+@return own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+ dict_index_t* index, /*!< in: index tree */
+ rec_t* rec, /*!< in: record for which to build data tuple */
+ ulint n_fields,/*!< in: number of data fields */
+ mem_heap_t* heap) /*!< in: memory heap where tuple created */
+{
+ dtuple_t* tuple;
+
+ ut_ad(dict_table_is_comp(index->table)
+ || n_fields <= rec_get_n_fields_old(rec));
+
+ tuple = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(tuple, index, n_fields);
+
+ rec_copy_prefix_to_dtuple(tuple, rec, index, n_fields, heap);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint sum = 0;
+ ulint i;
+ ulint comp = dict_table_is_comp(index->table);
+
+ if (comp) {
+ ulint nullable = 0;
+ sum = REC_N_NEW_EXTRA_BYTES;
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, i);
+ ulint size = dict_col_get_fixed_size(col, comp);
+ sum += size;
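+ /* A variable-length column contributes only its length
+ bytes to the minimum record length: 1 byte if the maximum
+ length is below 128, otherwise 2 bytes. */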
+ if (!size) {
+ size = col->len;
+ sum += size < 128 ? 1 : 2;
+ }
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ nullable++;
+ }
+ }
+
+ /* round the NULL flags up to full bytes */
+ sum += UT_BITS_IN_BYTES(nullable);
+
+ return(sum);
+ }
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ sum += dict_col_get_fixed_size(
+ dict_index_get_nth_col(index, i), comp);
+ }
+
+ if (sum > 127) {
+ sum += 2 * dict_index_get_n_fields(index);
+ } else {
+ sum += dict_index_get_n_fields(index);
+ }
+
+ sum += REC_N_OLD_EXTRA_BYTES;
+
+ return(sum);
+}
+
+/*********************************************************************//**
+Reloads index statistics for a table from the SYS_STATS system table.
+@return TRUE on success, FALSE if the statistics could not be reloaded */
+static
+ibool
+dict_reload_statistics(
+/*===================*/
+ dict_table_t* table,
+ ulint* sum_of_index_sizes)
+{
+ dict_index_t* index;
+ ulint size;
+ mem_heap_t* heap;
+
+ index = dict_table_get_first_index(table);
+
+ if (index == NULL) {
+ /* Table definition is corrupt */
+
+ return(FALSE);
+ }
+
+ heap = mem_heap_create(1000);
+
+ while (index) {
+ if (table->is_corrupt) {
+ ut_a(srv_pass_corrupt_table);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ size = btr_get_size(index, BTR_TOTAL_SIZE);
+
+ index->stat_index_size = size;
+
+ *sum_of_index_sizes += size;
+
+ size = btr_get_size(index, BTR_N_LEAF_PAGES);
+
+ if (size == 0) {
+ /* The root node of the tree is a leaf */
+ size = 1;
+ }
+
+ index->stat_n_leaf_pages = size;
+
+/*===========================================*/
+{
+ dict_table_t* sys_stats;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ ulint key_cols;
+ ulint n_cols;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ib_int64_t* stat_n_diff_key_vals_tmp;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+
+ n_cols = dict_index_get_n_unique(index);
+ stat_n_diff_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+
+ sys_stats = dict_sys->sys_stats;
+ sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
+ ut_a(!dict_table_is_comp(sys_stats));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, index->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
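+
+ /* Each SYS_STATS row of this index holds the number of distinct
+ values for one key prefix length; read the rows for prefix lengths
+ 0 .. n_cols in order of KEY_COLS. */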
+ for (i = 0; i <= n_cols; i++) {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || ut_dulint_cmp(mach_read_from_8(rec_get_nth_field_old(rec, 0, &len)),
+ index->id)) {
+ /* not found: even a single missing entry is not allowed */
+ fprintf(stderr, "InnoDB: Warning: stats for %s/%s (%lu/%lu)"
+ " not found in SYS_STATS\n",
+ index->table_name, index->name, i, n_cols);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ goto next_rec;
+ }
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+ ut_a(len == 4);
+
+ key_cols = mach_read_from_4(field);
+
+ ut_a(i == key_cols);
+
+ field = rec_get_nth_field_old(rec, DICT_SYS_STATS_DIFF_VALS_FIELD, &len);
+ ut_a(len == 8);
+
+ stat_n_diff_key_vals_tmp[i] = ut_conv_dulint_to_longlong(mach_read_from_8(field));
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ dict_index_stat_mutex_enter(index);
+ for (i = 0; i <= n_cols; i++) {
+ index->stat_n_diff_key_vals[i] = stat_n_diff_key_vals_tmp[i];
+ }
+ dict_index_stat_mutex_exit(index);
+}
+/*===========================================*/
+
+ index = dict_table_get_next_index(index);
+ }
+
+ mem_heap_free(heap);
+ return(TRUE);
+}
+
+static
+void
+dict_store_statistics(
+/*==================*/
+ dict_table_t* table)
+{
+ dict_index_t* index;
+ mem_heap_t* heap;
+
+ index = dict_table_get_first_index(table);
+
+ ut_a(index);
+
+ heap = mem_heap_create(1000);
+
+ while (index) {
+ if (table->is_corrupt) {
+ ut_a(srv_pass_corrupt_table);
+ mem_heap_free(heap);
+ return;
+ }
+
+/*===========================================*/
+{
+ dict_table_t* sys_stats;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ ulint key_cols;
+ ulint n_cols;
+ ulint rests;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ib_int64_t* stat_n_diff_key_vals_tmp;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+
+ n_cols = dict_index_get_n_unique(index);
+ stat_n_diff_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+
+ dict_index_stat_mutex_enter(index);
+ for (i = 0; i <= n_cols; i++) {
+ stat_n_diff_key_vals_tmp[i] = index->stat_n_diff_key_vals[i];
+ }
+ dict_index_stat_mutex_exit(index);
+
+ sys_stats = dict_sys->sys_stats;
+ sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
+ ut_a(!dict_table_is_comp(sys_stats));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, index->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF, &pcur, &mtr);
+ rests = n_cols + 1;
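+
+ /* Overwrite each existing SYS_STATS row of this index in place;
+ rests counts the rows that have not yet been written. */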
+ for (i = 0; i <= n_cols; i++) {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || ut_dulint_cmp(mach_read_from_8(rec_get_nth_field_old(rec, 0, &len)),
+ index->id)) {
+ /* not found */
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ break;
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+ goto next_rec;
+ }
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+ ut_a(len == 4);
+
+ key_cols = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(rec, DICT_SYS_STATS_DIFF_VALS_FIELD, &len);
+ ut_a(len == 8);
+
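+ /* Write the cached 64-bit value back as a dulint, split into
+ its high and low 32-bit halves, under mini-transaction logging. */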
+ mlog_write_dulint((byte*)field,
+ ut_dulint_create((ulint) (stat_n_diff_key_vals_tmp[key_cols] >> 32),
+ (ulint) stat_n_diff_key_vals_tmp[key_cols] & 0xFFFFFFFF),
+ &mtr);
+
+ rests--;
+
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (rests) {
+ fprintf(stderr, "InnoDB: Warning: failed to store %lu stats entries"
+ " of %s/%s to SYS_STATS system table.\n",
+ rests, index->table_name, index->name);
+ }
+}
+/*===========================================*/
+
+ index = dict_table_get_next_index(index);
+ }
+
+ mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics_low(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool has_dict_mutex __attribute__((unused)),
+ /*!< in: TRUE if the caller has the
+ dictionary mutex */
+ ibool sync) /*!< in: TRUE if must update SYS_STATS */
+{
+ dict_index_t* index;
+ ulint sum_of_index_sizes = 0;
+
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: cannot calculate statistics for table %s\n"
+ "InnoDB: because the .ibd file is missing. For help,"
+ " please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ table->name);
+
+ return;
+ }
+
+ if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) && !sync) {
+ /* reload statistics from SYS_STATS table */
+ if (dict_reload_statistics(table, &sum_of_index_sizes)) {
+ /* success */
+#ifdef UNIV_DEBUG
+ fprintf(stderr, "InnoDB: DEBUG: reload_statistics is scceeded for %s.\n",
+ table->name);
+#endif
+ goto end;
+ }
+ }
+#ifdef UNIV_DEBUG
+ fprintf(stderr, "InnoDB: DEBUG: update_statistics for %s.\n",
+ table->name);
+#endif
+ sum_of_index_sizes = 0;
+
+ /* Find out the sizes of the indexes and how many different values
+ for the key they approximately have */
+
+ index = dict_table_get_first_index(table);
+
+ if (index == NULL) {
+ /* Table definition is corrupt */
+
+ return;
+ }
+
+ do {
+ if (table->is_corrupt) {
+ ut_a(srv_pass_corrupt_table);
+ return;
+ }
+
+ if (UNIV_LIKELY
+ (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
+ || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO
+ && dict_index_is_clust(index)))) {
+ ulint size;
+ size = btr_get_size(index, BTR_TOTAL_SIZE);
+
+ index->stat_index_size = size;
+
+ sum_of_index_sizes += size;
+
+ size = btr_get_size(index, BTR_N_LEAF_PAGES);
+
+ if (size == 0) {
+ /* The root node of the tree is a leaf */
+ size = 1;
+ }
+
+ index->stat_n_leaf_pages = size;
+
+ btr_estimate_number_of_different_key_vals(index);
+ } else {
+ /* If we have set a high innodb_force_recovery
+ level, do not calculate statistics, as a badly
+ corrupted index can cause a crash in it.
+ Initialize some bogus index cardinality
+ statistics, so that the data can be queried by
+ various means, including via secondary indexes. */
+ ulint i;
+
+ sum_of_index_sizes++;
+ index->stat_index_size = index->stat_n_leaf_pages = 1;
+
+ for (i = dict_index_get_n_unique(index); i; ) {
+ index->stat_n_diff_key_vals[i--] = 1;
+ }
+ }
+
+ index = dict_table_get_next_index(index);
+ } while (index);
+
+ if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) {
+ /* store statistics to SYS_STATS table */
+ dict_store_statistics(table);
+ }
+end:
+ index = dict_table_get_first_index(table);
+
+ dict_index_stat_mutex_enter(index);
+
+ table->stat_n_rows = index->stat_n_diff_key_vals[
+ dict_index_get_n_unique(index)];
+
+ dict_index_stat_mutex_exit(index);
+
+ table->stat_clustered_index_size = index->stat_index_size;
+
+ table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+ - index->stat_index_size;
+
+ table->stat_initialized = TRUE;
+
+ table->stat_modified_counter = 0;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics(
+/*===================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool sync)
+{
+ dict_update_statistics_low(table, FALSE, sync);
+}
+
+/**********************************************************************//**
+Prints info of a foreign key constraint. */
+static
+void
+dict_foreign_print_low(
+/*===================*/
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ ulint i;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ fprintf(stderr, " FOREIGN KEY CONSTRAINT %s: %s (",
+ foreign->id, foreign->foreign_table_name);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ fprintf(stderr, " %s", foreign->foreign_col_names[i]);
+ }
+
+ fprintf(stderr, " )\n"
+ " REFERENCES %s (",
+ foreign->referenced_table_name);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ fprintf(stderr, " %s", foreign->referenced_col_names[i]);
+ }
+
+ fputs(" )\n", stderr);
+}
+
+/**********************************************************************//**
+Prints the data of a table. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+ dict_table_t* table) /*!< in: table */
+{
+ mutex_enter(&(dict_sys->mutex));
+ dict_table_print_low(table);
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/**********************************************************************//**
+Prints the data of a table when we know the table name. */
+UNIV_INTERN
+void
+dict_table_print_by_name(
+/*=====================*/
+ const char* name) /*!< in: table name */
+{
+ dict_table_t* table;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = dict_table_get_low(name);
+
+ ut_a(table);
+
+ dict_table_print_low(table);
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/**********************************************************************//**
+Prints the data of a table. */
+UNIV_INTERN
+void
+dict_table_print_low(
+/*=================*/
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+ dict_foreign_t* foreign;
+ ulint i;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ if (srv_stats_auto_update)
+ dict_update_statistics_low(table, TRUE, FALSE);
+
+ fprintf(stderr,
+ "--------------------------------------\n"
+ "TABLE: name %s, id %lu %lu, flags %lx, columns %lu,"
+ " indexes %lu, appr.rows %lu\n"
+ " COLUMNS: ",
+ table->name,
+ (ulong) ut_dulint_get_high(table->id),
+ (ulong) ut_dulint_get_low(table->id),
+ (ulong) table->flags,
+ (ulong) table->n_cols,
+ (ulong) UT_LIST_GET_LEN(table->indexes),
+ (ulong) table->stat_n_rows);
+
+ for (i = 0; i < (ulint) table->n_cols; i++) {
+ dict_col_print_low(table, dict_table_get_nth_col(table, i));
+ fputs("; ", stderr);
+ }
+
+ putc('\n', stderr);
+
+ index = UT_LIST_GET_FIRST(table->indexes);
+
+ while (index != NULL) {
+ dict_index_print_low(index);
+ index = UT_LIST_GET_NEXT(indexes, index);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign != NULL) {
+ dict_foreign_print_low(foreign);
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign != NULL) {
+ dict_foreign_print_low(foreign);
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+}
+
+/**********************************************************************//**
+Prints a column data. */
+static
+void
+dict_col_print_low(
+/*===============*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_col_t* col) /*!< in: column */
+{
+ dtype_t type;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ dict_col_copy_type(col, &type);
+ fprintf(stderr, "%s: ", dict_table_get_col_name(table,
+ dict_col_get_no(col)));
+
+ dtype_print(&type);
+}
+
+/**********************************************************************//**
+Prints an index data. */
+static
+void
+dict_index_print_low(
+/*=================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ib_int64_t n_vals;
+ ulint i;
+ const char* type_string;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ dict_index_stat_mutex_enter(index);
+
+ if (index->n_user_defined_cols > 0) {
+ n_vals = index->stat_n_diff_key_vals[
+ index->n_user_defined_cols];
+ } else {
+ n_vals = index->stat_n_diff_key_vals[1];
+ }
+
+ dict_index_stat_mutex_exit(index);
+
+ if (dict_index_is_clust(index)) {
+ type_string = "clustered index";
+ } else if (dict_index_is_unique(index)) {
+ type_string = "unique index";
+ } else {
+ type_string = "secondary index";
+ }
+
+ fprintf(stderr,
+ " INDEX: name %s, id %lu %lu, fields %lu/%lu,"
+ " uniq %lu, type %lu\n"
+ " root page %lu, appr.key vals %lu,"
+ " leaf pages %lu, size pages %lu\n"
+ " FIELDS: ",
+ index->name,
+ (ulong) ut_dulint_get_high(index->id),
+ (ulong) ut_dulint_get_low(index->id),
+ (ulong) index->n_user_defined_cols,
+ (ulong) index->n_fields,
+ (ulong) index->n_uniq,
+ (ulong) index->type,
+ (ulong) index->page,
+ (ulong) n_vals,
+ (ulong) index->stat_n_leaf_pages,
+ (ulong) index->stat_index_size);
+
+ for (i = 0; i < index->n_fields; i++) {
+ dict_field_print_low(dict_index_get_nth_field(index, i));
+ }
+
+ putc('\n', stderr);
+
+#ifdef UNIV_BTR_PRINT
+ btr_print_size(index);
+
+ btr_print_index(index, 7);
+#endif /* UNIV_BTR_PRINT */
+}
+
+/**********************************************************************//**
+Prints a field data. */
+static
+void
+dict_field_print_low(
+/*=================*/
+ const dict_field_t* field) /*!< in: field */
+{
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ fprintf(stderr, " %s", field->name);
+
+ if (field->prefix_len != 0) {
+ fprintf(stderr, "(%lu)", (ulong) field->prefix_len);
+ }
+}
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline) /*!< in: whether to add a newline */
+{
+ const char* stripped_id;
+ ulint i;
+
+ if (strchr(foreign->id, '/')) {
+ /* Strip the preceding database name from the constraint id */
+ stripped_id = foreign->id + 1
+ + dict_get_db_name_len(foreign->id);
+ } else {
+ stripped_id = foreign->id;
+ }
+
+ putc(',', file);
+
+ if (add_newline) {
+ /* SHOW CREATE TABLE wants constraints each printed nicely
+ on its own line, while error messages want no newlines
+ inserted. */
+ fputs("\n ", file);
+ }
+
+ fputs(" CONSTRAINT ", file);
+ ut_print_name(file, trx, FALSE, stripped_id);
+ fputs(" FOREIGN KEY (", file);
+
+ for (i = 0;;) {
+ ut_print_name(file, trx, FALSE, foreign->foreign_col_names[i]);
+ if (++i < foreign->n_fields) {
+ fputs(", ", file);
+ } else {
+ break;
+ }
+ }
+
+ fputs(") REFERENCES ", file);
+
+ if (dict_tables_have_same_db(foreign->foreign_table_name,
+ foreign->referenced_table_name)) {
+ /* Do not print the database name of the referenced table */
+ ut_print_name(file, trx, TRUE,
+ dict_remove_db_name(
+ foreign->referenced_table_name));
+ } else {
+ ut_print_name(file, trx, TRUE,
+ foreign->referenced_table_name);
+ }
+
+ putc(' ', file);
+ putc('(', file);
+
+ for (i = 0;;) {
+ ut_print_name(file, trx, FALSE,
+ foreign->referenced_col_names[i]);
+ if (++i < foreign->n_fields) {
+ fputs(", ", file);
+ } else {
+ break;
+ }
+ }
+
+ putc(')', file);
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+ fputs(" ON DELETE CASCADE", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ fputs(" ON DELETE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ fputs(" ON DELETE NO ACTION", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ fputs(" ON UPDATE CASCADE", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ fputs(" ON UPDATE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ fputs(" ON UPDATE NO ACTION", file);
+ }
+}
+
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table */
+{
+ dict_foreign_t* foreign;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ if (foreign == NULL) {
+ mutex_exit(&(dict_sys->mutex));
+
+ return;
+ }
+
+ while (foreign != NULL) {
+ if (create_table_format) {
+ dict_print_info_on_foreign_key_in_create_format(
+ file, trx, foreign, TRUE);
+ } else {
+ ulint i;
+ fputs("; (", file);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (i) {
+ putc(' ', file);
+ }
+
+ ut_print_name(file, trx, FALSE,
+ foreign->foreign_col_names[i]);
+ }
+
+ fputs(") REFER ", file);
+ ut_print_name(file, trx, TRUE,
+ foreign->referenced_table_name);
+ putc('(', file);
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (i) {
+ putc(' ', file);
+ }
+ ut_print_name(
+ file, trx, FALSE,
+ foreign->referenced_col_names[i]);
+ }
+
+ putc(')', file);
+
+ if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) {
+ fputs(" ON DELETE CASCADE", file);
+ }
+
+ if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) {
+ fputs(" ON DELETE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+ fputs(" ON DELETE NO ACTION", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+ fputs(" ON UPDATE CASCADE", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+ fputs(" ON UPDATE SET NULL", file);
+ }
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+ fputs(" ON UPDATE NO ACTION", file);
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+}
+
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+ FILE* file, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to print */
+{
+ fputs("index ", file);
+ ut_print_name(file, trx, FALSE, index->name);
+ fputs(" of table ", file);
+ ut_print_name(file, trx, TRUE, index->table_name);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void)
+/*===============*/
+{
+ dict_table_t* table;
+
+ /* create dummy table and index for REDUNDANT infimum and supremum */
+ table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0);
+ dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
+ DATA_ENGLISH | DATA_NOT_NULL, 8);
+
+ dict_ind_redundant = dict_mem_index_create("SYS_DUMMY1", "SYS_DUMMY1",
+ DICT_HDR_SPACE, 0, 1);
+ dict_index_add_col(dict_ind_redundant, table,
+ dict_table_get_nth_col(table, 0), 0);
+ dict_ind_redundant->table = table;
+ /* create dummy table and index for COMPACT infimum and supremum */
+ table = dict_mem_table_create("SYS_DUMMY2",
+ DICT_HDR_SPACE, 1, DICT_TF_COMPACT);
+ dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
+ DATA_ENGLISH | DATA_NOT_NULL, 8);
+ dict_ind_compact = dict_mem_index_create("SYS_DUMMY2", "SYS_DUMMY2",
+ DICT_HDR_SPACE, 0, 1);
+ dict_index_add_col(dict_ind_compact, table,
+ dict_table_get_nth_col(table, 0), 0);
+ dict_ind_compact->table = table;
+
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ dict_ind_redundant->cached = dict_ind_compact->cached = TRUE;
+}
+
+/**********************************************************************//**
+Frees dict_ind_redundant and dict_ind_compact. */
+static
+void
+dict_ind_free(void)
+/*===============*/
+{
+ dict_table_t* table;
+
+ table = dict_ind_compact->table;
+ dict_mem_index_free(dict_ind_compact);
+ dict_ind_compact = NULL;
+ dict_mem_table_free(table);
+
+ table = dict_ind_redundant->table;
+ dict_mem_index_free(dict_ind_redundant);
+ dict_ind_redundant = NULL;
+ dict_mem_table_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Get index by name
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name) /*!< in: name of the index to find */
+{
+ dict_index_t* index;
+
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (ut_strcmp(index->name, name) == 0) {
+
+ return(index);
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(NULL);
+
+}
+
+/**********************************************************************//**
+Replace the index passed in with another equivalent index in the table's
+foreign key list. */
+UNIV_INTERN
+void
+dict_table_replace_index_in_foreign_list(
+/*=====================================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index) /*!< in: index to be replaced */
+{
+ dict_foreign_t* foreign;
+
+ for (foreign = UT_LIST_GET_FIRST(table->foreign_list);
+ foreign;
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+
+ if (foreign->foreign_index == index) {
+ dict_index_t* new_index
+ = dict_foreign_find_equiv_index(foreign);
+ ut_a(new_index);
+
+ foreign->foreign_index = new_index;
+ }
+ }
+}
+
+/**********************************************************************//**
+In case there is more than one index with the same name return the index
+with the min(id).
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*=====================================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name) /*!< in: name of the index to find */
+{
+ dict_index_t* index;
+ dict_index_t* min_index; /* Index with matching name and min(id) */
+
+ min_index = NULL;
+ index = dict_table_get_first_index(table);
+
+ while (index != NULL) {
+ if (ut_strcmp(index->name, name) == 0) {
+ if (!min_index
+ || ut_dulint_cmp(index->id, min_index->id) < 0) {
+
+ min_index = index;
+ }
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ return(min_index);
+
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ ibool tmp_ok) /*!< in: TRUE=allow temporary
+ index names */
+{
+ /* Check for duplicates, ignoring indexes that are marked
+ as to be dropped */
+
+ const dict_index_t* index1;
+ const dict_index_t* index2;
+
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ /* The primary index _must_ exist */
+ ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
+
+ index1 = UT_LIST_GET_FIRST(table->indexes);
+
+ do {
+ ut_ad(tmp_ok || *index1->name != TEMP_INDEX_PREFIX);
+
+ index2 = UT_LIST_GET_NEXT(indexes, index1);
+
+ while (index2) {
+
+ if (!index2->to_be_dropped) {
+ ut_ad(ut_strcmp(index1->name, index2->name));
+ }
+
+ index2 = UT_LIST_GET_NEXT(indexes, index2);
+ }
+
+ index1 = UT_LIST_GET_NEXT(indexes, index1);
+ } while (index1);
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************************
+Closes the data dictionary module. */
+UNIV_INTERN
+void
+dict_close(void)
+/*============*/
+{
+ ulint i;
+
+ /* Free the hash elements. We don't remove them from the table
+ because we are going to destroy the table anyway. */
+ for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) {
+ dict_table_t* table;
+
+ table = HASH_GET_FIRST(dict_sys->table_hash, i);
+
+ while (table) {
+ dict_table_t* prev_table = table;
+
+ table = HASH_GET_NEXT(name_hash, prev_table);
+#ifdef UNIV_DEBUG
+ ut_a(prev_table->magic_n == DICT_TABLE_MAGIC_N);
+#endif
+ /* Acquire only because it's a pre-condition. */
+ mutex_enter(&dict_sys->mutex);
+
+ dict_table_remove_from_cache(prev_table);
+
+ mutex_exit(&dict_sys->mutex);
+ }
+ }
+
+ hash_table_free(dict_sys->table_hash);
+
+ /* The elements are the same instance as in dict_sys->table_hash,
+ therefore we don't delete the individual elements. */
+ hash_table_free(dict_sys->table_id_hash);
+
+ dict_ind_free();
+
+ mutex_free(&dict_sys->mutex);
+
+ rw_lock_free(&dict_operation_lock);
+ memset(&dict_operation_lock, 0x0, sizeof(dict_operation_lock));
+
+ mutex_free(&dict_foreign_err_mutex);
+
+ mem_free(dict_sys);
+ dict_sys = NULL;
+
+ for (i = 0; i < DICT_INDEX_STAT_MUTEX_SIZE; i++) {
+ mutex_free(&dict_index_stat_mutex[i]);
+ }
+}
+
+/*************************************************************************
+Sets the is_corrupt flag for all tables that belong to the given tablespace. */
+
+void
+dict_table_set_corrupt_by_space(
+/*============================*/
+ ulint space_id,
+ ibool need_mutex)
+{
+ dict_table_t* table;
+ ibool found = FALSE;
+
+ ut_a(!trx_sys_sys_space(space_id) && space_id < SRV_LOG_SPACE_FIRST_ID);
+
+ if (need_mutex)
+ mutex_enter(&(dict_sys->mutex));
+
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+
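+ /* Walk the LRU list of tables and mark every table that
+ resides in the given tablespace as corrupt. */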
+ while (table) {
+ if (table->space == space_id) {
+ table->is_corrupt = TRUE;
+ found = TRUE;
+ }
+
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ }
+
+ if (need_mutex)
+ mutex_exit(&(dict_sys->mutex));
+
+ if (!found) {
+ fprintf(stderr, "InnoDB: space to be marked as "
+ "crashed was not found for id %lu.\n",
+ (ulong) space_id);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.c
new file mode 100644
index 00000000000..43c0810fe67
--- /dev/null
+++ b/storage/xtradb/dict/dict0load.c
@@ -0,0 +1,1572 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0load.c
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0load.h"
+#include "mysql_version.h"
+
+#ifdef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "rem0cmp.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "trx0sys.h"
+
+/****************************************************************//**
+Compare the name of an index column.
+@return TRUE if the i'th column of index is 'name'. */
+static
+ibool
+name_of_col_is(
+/*===========*/
+ const dict_table_t* table, /*!< in: table */
+ const dict_index_t* index, /*!< in: index */
+ ulint i, /*!< in: index field offset */
+ const char* name) /*!< in: name to compare to */
+{
+ ulint tmp = dict_col_get_no(dict_field_get_col(
+ dict_index_get_nth_field(
+ index, i)));
+
+ return(strcmp(name, dict_table_get_col_name(table, tmp)) == 0);
+}
+
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if does not exist; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+ const char* name) /*!< in: database name which ends in '/' */
+{
+ dict_table_t* sys_tables;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap = mem_heap_create(1000);
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_a(!dict_table_is_comp(sys_tables));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, name, ut_strlen(name));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+
+ if (len < strlen(name)
+ || ut_memcmp(name, field, strlen(name)) != 0) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ if (!rec_get_deleted_flag(rec, 0)) {
+
+ /* We found one */
+
+ char* table_name = mem_strdupl((char*) field, len);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(table_name);
+ }
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ goto loop;
+}
+
+/********************************************************************//**
+Prints to the standard output information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void)
+/*============*/
+{
+ dict_table_t* sys_tables;
+ dict_index_t* sys_index;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ mtr_t mtr;
+
+ /* Enlarge the fatal semaphore wait timeout during the InnoDB table
+ monitor printout */
+
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+
+ btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur,
+ TRUE, &mtr);
+loop:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* end of index */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ /* Restore the fatal semaphore wait timeout */
+
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+
+ if (!rec_get_deleted_flag(rec, 0)) {
+
+ /* We found one */
+
+ char* table_name = mem_strdupl((char*) field, len);
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ table = dict_table_get_low(table_name);
+ mem_free(table_name);
+
+ if (table == NULL) {
+ fputs("InnoDB: Failed to load table ", stderr);
+ ut_print_namel(stderr, NULL, TRUE, (char*) field, len);
+ putc('\n', stderr);
+ } else {
+ /* The table definition was corrupt if there
+ is no index */
+
+ if (srv_stats_auto_update && dict_table_get_first_index(table)) {
+ dict_update_statistics_low(table, TRUE, FALSE);
+ }
+
+ dict_table_print_low(table);
+ }
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+ }
+
+ goto loop;
+}
+
+/********************************************************************//**
+Determine the flags of a table described in SYS_TABLES.
+@return table flags; 0 for ROW_FORMAT=REDUNDANT or COMPACT,
+ULINT_UNDEFINED if the record describes an unknown or unsupported format */
+static
+ulint
+dict_sys_tables_get_flags(
+/*======================*/
+ const rec_t* rec) /*!< in: a record of SYS_TABLES */
+{
+ const byte* field;
+ ulint len;
+ ulint n_cols;
+ ulint flags;
+
+ field = rec_get_nth_field_old(rec, 5, &len);
+ ut_a(len == 4);
+
+ flags = mach_read_from_4(field);
+
+ if (UNIV_LIKELY(flags == DICT_TABLE_ORDINARY)) {
+ return(0);
+ }
+
+ field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+ n_cols = mach_read_from_4(field);
+
+ if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) {
+ /* New file formats require ROW_FORMAT=COMPACT. */
+ return(ULINT_UNDEFINED);
+ }
+
+ switch (flags & (DICT_TF_FORMAT_MASK | DICT_TF_COMPACT)) {
+ default:
+ case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
+ case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
+ /* flags should be DICT_TABLE_ORDINARY,
+ or DICT_TF_FORMAT_MASK should be nonzero. */
+ return(ULINT_UNDEFINED);
+
+ case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
+#if DICT_TF_FORMAT_MAX > DICT_TF_FORMAT_ZIP
+# error "missing case labels for DICT_TF_FORMAT_ZIP .. DICT_TF_FORMAT_MAX"
+#endif
+ /* We support this format. */
+ break;
+ }
+
+ if (UNIV_UNLIKELY((flags & DICT_TF_ZSSIZE_MASK)
+ > (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT))) {
+ /* Unsupported compressed page size. */
+ return(ULINT_UNDEFINED);
+ }
+
+ if (UNIV_UNLIKELY(flags & (~0 << DICT_TF_BITS))) {
+ /* Some unused bits are set. */
+ return(ULINT_UNDEFINED);
+ }
+
+ return(flags);
+}
+
+/********************************************************************//**
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan the biggest space id, and store it to fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+ ibool in_crash_recovery) /*!< in: are we doing a crash recovery */
+{
+ dict_table_t* sys_tables;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ ulint max_space_id;
+ mtr_t mtr;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_a(!dict_table_is_comp(sys_tables));
+
+ max_space_id = mtr_read_ulint(dict_hdr_get(&mtr)
+ + DICT_HDR_MAX_SPACE_ID,
+ MLOG_4BYTES, &mtr);
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur,
+ TRUE, &mtr);
+loop:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* end of index */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ /* We must make the tablespace cache aware of the biggest
+ known space id */
+
+ /* printf("Biggest space id in data dictionary %lu\n",
+ max_space_id); */
+ fil_set_max_space_id_if_bigger(max_space_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return;
+ }
+
+ if (!rec_get_deleted_flag(rec, 0)) {
+
+ /* We found one */
+ const byte* field;
+ ulint len;
+ ulint space_id;
+ ulint flags;
+ char* name;
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ name = mem_strdupl((char*) field, len);
+
+ flags = dict_sys_tables_get_flags(rec);
+ if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+
+ field = rec_get_nth_field_old(rec, 5, &len);
+ flags = mach_read_from_4(field);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown type %lx.\n",
+ (ulong) flags);
+
+ goto loop;
+ }
+
+ field = rec_get_nth_field_old(rec, 9, &len);
+ ut_a(len == 4);
+
+ space_id = mach_read_from_4(field);
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ if (trx_sys_sys_space(space_id)) {
+ /* The system tablespace always exists. */
+ } else if (in_crash_recovery) {
+ /* Check that the tablespace (the .ibd file) really
+ exists; print a warning to the .err log if not.
+ Do not print warnings for temporary tables. */
+ ibool is_temp;
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ if (0x80000000UL & mach_read_from_4(field)) {
+ /* ROW_FORMAT=COMPACT: read the is_temp
+ flag from SYS_TABLES.MIX_LEN. */
+ field = rec_get_nth_field_old(rec, 7, &len);
+ is_temp = mach_read_from_4(field)
+ & DICT_TF2_TEMPORARY;
+ } else {
+ /* For tables created with old versions
+ of InnoDB, SYS_TABLES.MIX_LEN may contain
+ garbage. Such tables would always be
+ in ROW_FORMAT=REDUNDANT. Pretend that
+ all such tables are non-temporary. That is,
+ do not suppress error printouts about
+ temporary tables not being found. */
+ is_temp = FALSE;
+ }
+
+ fil_space_for_table_exists_in_mem(
+ space_id, name, is_temp, TRUE, !is_temp);
+ } else {
+ /* It is a normal database startup: create the space
+ object and check that the .ibd file exists. */
+
+ fil_open_single_table_tablespace(FALSE, space_id,
+ flags, name);
+ }
+
+ mem_free(name);
+
+ if (space_id > max_space_id) {
+ max_space_id = space_id;
+ }
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+ }
+
+ goto loop;
+}
+
+/********************************************************************//**
+Loads definitions for table columns. */
+static
+void
+dict_load_columns(
+/*==============*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+{
+ dict_table_t* sys_columns;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ byte* buf;
+ char* name;
+ ulint mtype;
+ ulint prtype;
+ ulint col_len;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ mtr_start(&mtr);
+
+ sys_columns = dict_table_get_low("SYS_COLUMNS");
+ sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
+ ut_a(!dict_table_is_comp(sys_columns));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
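+
+ /* table->n_cols includes the DATA_N_SYS_COLS system columns;
+ only the user columns are read from SYS_COLUMNS here. */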
+ for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ ut_a(!rec_get_deleted_flag(rec, 0));
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+ ut_a(ut_dulint_cmp(table->id, mach_read_from_8(field)) == 0);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+ ut_ad(len == 4);
+ ut_a(i == mach_read_from_4(field));
+
+ ut_a(name_of_col_is(sys_columns, sys_index, 4, "NAME"));
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ name = mem_heap_strdupl(heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(rec, 5, &len);
+ mtype = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(rec, 6, &len);
+ prtype = mach_read_from_4(field);
+
+ if (dtype_get_charset_coll(prtype) == 0
+ && dtype_is_string_type(mtype)) {
+ /* The table was created with < 4.1.2. */
+
+ if (dtype_is_binary_string_type(mtype, prtype)) {
+ /* Use the binary collation for
+ string columns of binary type. */
+
+ prtype = dtype_form_prtype(
+ prtype,
+ DATA_MYSQL_BINARY_CHARSET_COLL);
+ } else {
+ /* Use the default charset for
+ other than binary columns. */
+
+ prtype = dtype_form_prtype(
+ prtype,
+ data_mysql_default_charset_coll);
+ }
+ }
+
+ field = rec_get_nth_field_old(rec, 7, &len);
+ col_len = mach_read_from_4(field);
+
+ ut_a(name_of_col_is(sys_columns, sys_index, 8, "PREC"));
+
+ dict_mem_table_add_col(table, heap, name,
+ mtype, prtype, col_len);
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Loads definitions for index fields. */
+static
+void
+dict_load_fields(
+/*=============*/
+ dict_index_t* index, /*!< in: index whose fields to load */
+ mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+{
+ dict_table_t* sys_fields;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ ulint pos_and_prefix_len;
+ ulint prefix_len;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ byte* buf;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ mtr_start(&mtr);
+
+ sys_fields = dict_table_get_low("SYS_FIELDS");
+ sys_index = UT_LIST_GET_FIRST(sys_fields->indexes);
+ ut_a(!dict_table_is_comp(sys_fields));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, index->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i < index->n_fields; i++) {
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+ /* There could be delete marked records in SYS_FIELDS
+ because SYS_FIELDS.INDEX_ID can be updated
+ by ALTER TABLE ADD INDEX. */
+
+ if (rec_get_deleted_flag(rec, 0)) {
+
+ goto next_rec;
+ }
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+ ut_a(len == 4);
+
+ /* The next field stores the field position in the index
+ and a possible column prefix length if the index field
+ does not contain the whole column. The storage format is
+ like this: if there is at least one prefix field in the index,
+ then the HIGH 2 bytes contain the field number (== i) and the
+ low 2 bytes the prefix length for the field. Otherwise the
+ field number (== i) is contained in the 2 LOW bytes. */
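+
+	/* For example (hypothetical values): a prefix field on the
+	column at position 2 with a 10-byte prefix would be stored as
+	0x0002000A, while a full-column field at position 2 in an
+	index with no prefix fields would be stored as 0x00000002. */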
+
+ pos_and_prefix_len = mach_read_from_4(field);
+
+ ut_a((pos_and_prefix_len & 0xFFFFUL) == i
+ || (pos_and_prefix_len & 0xFFFF0000UL) == (i << 16));
+
+ if ((i == 0 && pos_and_prefix_len > 0)
+ || (pos_and_prefix_len & 0xFFFF0000UL) > 0) {
+
+ prefix_len = pos_and_prefix_len & 0xFFFFUL;
+ } else {
+ prefix_len = 0;
+ }
+
+ ut_a(name_of_col_is(sys_fields, sys_index, 4, "COL_NAME"));
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+
+ dict_mem_index_add_field(index,
+ mem_heap_strdupl(heap,
+ (char*) field, len),
+ prefix_len);
+
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Loads definitions for table indexes. Adds them to the data dictionary
+cache.
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary
+table or DB_UNSUPPORTED if table has unknown index type */
+static
+ulint
+dict_load_indexes(
+/*==============*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap) /*!< in: memory heap for temporary storage */
+{
+ dict_table_t* sys_indexes;
+ dict_index_t* sys_index;
+ dict_index_t* index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint name_len;
+ char* name_buf;
+ ulint type;
+ ulint space;
+ ulint page_no;
+ ulint n_fields;
+ byte* buf;
+ ibool is_sys_table;
+ dulint id;
+ mtr_t mtr;
+ ulint error = DB_SUCCESS;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ if ((ut_dulint_get_high(table->id) == 0)
+ && (ut_dulint_get_low(table->id) < DICT_HDR_FIRST_ID)) {
+ is_sys_table = TRUE;
+ } else {
+ is_sys_table = FALSE;
+ }
+
+ mtr_start(&mtr);
+
+ sys_indexes = dict_table_get_low("SYS_INDEXES");
+ sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
+ ut_a(!dict_table_is_comp(sys_indexes));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (;;) {
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+
+ if (ut_memcmp(buf, field, len) != 0) {
+ break;
+ } else if (rec_get_deleted_flag(rec, 0)) {
+ /* Skip delete marked records */
+ goto next_rec;
+ }
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+ ut_ad(len == 8);
+ id = mach_read_from_8(field);
+
+ ut_a(name_of_col_is(sys_indexes, sys_index, 4, "NAME"));
+
+ field = rec_get_nth_field_old(rec, 4, &name_len);
+ name_buf = mem_heap_strdupl(heap, (char*) field, name_len);
+
+ field = rec_get_nth_field_old(rec, 5, &len);
+ n_fields = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(rec, 6, &len);
+ type = mach_read_from_4(field);
+
+ field = rec_get_nth_field_old(rec, 7, &len);
+ space = mach_read_from_4(field);
+
+ ut_a(name_of_col_is(sys_indexes, sys_index, 8, "PAGE_NO"));
+
+ field = rec_get_nth_field_old(rec, 8, &len);
+ page_no = mach_read_from_4(field);
+
+ /* We check for unsupported types first, so that the
+ subsequent checks are relevant for the supported types. */
+ if (type & ~(DICT_CLUSTERED | DICT_UNIQUE)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: unknown type %lu"
+ " of index %s of table %s\n",
+ (ulong) type, name_buf, table->name);
+
+ error = DB_UNSUPPORTED;
+ goto func_exit;
+ } else if (page_no == FIL_NULL) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to load index %s"
+ " for table %s\n"
+ "InnoDB: but the index tree has been freed!\n",
+ name_buf, table->name);
+
+ error = DB_CORRUPTION;
+ goto func_exit;
+ } else if ((type & DICT_CLUSTERED) == 0
+ && NULL == dict_table_get_first_index(table)) {
+
+ fputs("InnoDB: Error: trying to load index ",
+ stderr);
+ ut_print_name(stderr, NULL, FALSE, name_buf);
+ fputs(" for table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, table->name);
+ fputs("\nInnoDB: but the first index"
+ " is not clustered!\n", stderr);
+
+ error = DB_CORRUPTION;
+ goto func_exit;
+ } else if (is_sys_table
+ && ((type & DICT_CLUSTERED)
+ || ((table == dict_sys->sys_tables)
+ && (name_len == (sizeof "ID_IND") - 1)
+ && (0 == ut_memcmp(name_buf,
+ "ID_IND", name_len))))) {
+
+ /* The index was created in memory already at booting
+ of the database server */
+ } else {
+ index = dict_mem_index_create(table->name, name_buf,
+ space, type, n_fields);
+ index->id = id;
+
+ dict_load_fields(index, heap);
+ error = dict_index_add_to_cache(table, index, page_no,
+ FALSE);
+ /* The data dictionary tables should never contain
+ invalid index definitions. If we ignored this error
+ and simply did not load this index definition, the
+ .frm file would disagree with the index definitions
+ inside InnoDB. */
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+ goto func_exit;
+ }
+ }
+
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(error);
+}
+
+/********************************************************************//**
+Loads a table definition together with all of its index definitions, and
+the cluster definition if the table is a member of a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table. Adds all of these to the
+data dictionary cache.
+@return table, or NULL if it does not exist; if the table is stored in an
+.ibd file but the file does not exist, then we set the
+ibd_file_missing flag TRUE in the table object we return */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+ const char* name) /*!< in: table name in the
+ databasename/tablename format */
+{
+ ibool ibd_file_missing = FALSE;
+ dict_table_t* table;
+ dict_table_t* sys_tables;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint space;
+ ulint n_cols;
+ ulint flags;
+ ulint err;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap = mem_heap_create(32000);
+
+ mtr_start(&mtr);
+
+ sys_tables = dict_table_get_low("SYS_TABLES");
+ sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+ ut_a(!dict_table_is_comp(sys_tables));
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, name, ut_strlen(name));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || rec_get_deleted_flag(rec, 0)) {
+ /* Not found */
+err_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(NULL);
+ }
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+
+ /* Check if the table name in record is the searched one */
+ if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) {
+
+ goto err_exit;
+ }
+
+ ut_a(name_of_col_is(sys_tables, sys_index, 9, "SPACE"));
+
+ field = rec_get_nth_field_old(rec, 9, &len);
+ space = mach_read_from_4(field);
+
+ /* Check if the tablespace exists and has the right name */
+ if (!trx_sys_sys_space(space)) {
+ flags = dict_sys_tables_get_flags(rec);
+
+ if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+ field = rec_get_nth_field_old(rec, 5, &len);
+ flags = mach_read_from_4(field);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown type %lx.\n",
+ (ulong) flags);
+ goto err_exit;
+ }
+ } else {
+ flags = 0;
+ }
+
+ ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ n_cols = mach_read_from_4(field);
+
+ /* The high-order bit of N_COLS is the "compact format" flag.
+ For tables in that format, MIX_LEN may hold additional flags. */
+ if (n_cols & 0x80000000UL) {
+ ulint flags2;
+
+ flags |= DICT_TF_COMPACT;
+
+ ut_a(name_of_col_is(sys_tables, sys_index, 7, "MIX_LEN"));
+ field = rec_get_nth_field_old(rec, 7, &len);
+
+ flags2 = mach_read_from_4(field);
+
+ if (flags2 & (~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT))) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has unknown flags %lx.\n",
+ (ulong) flags2);
+
+ flags2 &= ~(~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT));
+ }
+
+ flags |= flags2 << DICT_TF2_SHIFT;
+ }
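+
+	/* For example (hypothetical value): a COMPACT table with three
+	user columns stores N_COLS as 0x80000003; the low bits give the
+	user column count and the high bit marks the compact format. */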
+
+ /* See if the tablespace is available. */
+ if (trx_sys_sys_space(space)) {
+ /* The system tablespace is always available. */
+ } else if (!fil_space_for_table_exists_in_mem(
+ space, name,
+ (flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY,
+ FALSE, FALSE)) {
+
+ if ((flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) {
+ /* Do not bother to retry opening temporary tables. */
+ ibd_file_missing = TRUE;
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: error: space object of table");
+ ut_print_filename(stderr, name);
+ fprintf(stderr, ",\n"
+ "InnoDB: space id %lu did not exist in memory."
+ " Retrying an open.\n",
+ (ulong) space);
+ /* Try to open the tablespace */
+ if (!fil_open_single_table_tablespace(
+ TRUE, space,
+ flags == DICT_TF_COMPACT ? 0 :
+ flags & ~(~0 << DICT_TF_BITS), name)) {
+ /* We failed to find a sensible
+ tablespace file */
+
+ ibd_file_missing = TRUE;
+ }
+ }
+ }
+
+ table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL,
+ flags);
+
+ table->ibd_file_missing = (unsigned int) ibd_file_missing;
+
+ ut_a(name_of_col_is(sys_tables, sys_index, 3, "ID"));
+
+ field = rec_get_nth_field_old(rec, 3, &len);
+ table->id = mach_read_from_8(field);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ dict_load_columns(table, heap);
+
+ dict_table_add_to_cache(table, heap);
+
+ mem_heap_empty(heap);
+
+ err = dict_load_indexes(table, heap);
+
+	/* Initialize the table's fk_max_recusive_level. Its value could be
+	changed when dict_load_foreigns() is called below. */
+ table->fk_max_recusive_level = 0;
+
+ /* If the force recovery flag is set, we open the table irrespective
+ of the error condition, since the user may want to dump data from the
+ clustered index. However we load the foreign key information only if
+ all indexes were loaded. */
+ if (err == DB_SUCCESS) {
+ err = dict_load_foreigns(table->name, TRUE, TRUE);
+
+ if (err != DB_SUCCESS) {
+ dict_table_remove_from_cache(table);
+ table = NULL;
+ }
+ } else if (!srv_force_recovery) {
+ dict_table_remove_from_cache(table);
+ table = NULL;
+ }
+
+	if (table) {
+		table->fk_max_recusive_level = 0;
+	}
+#if 0
+ if (err != DB_SUCCESS && table != NULL) {
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: could not make a foreign key"
+ " definition to match\n"
+ "InnoDB: the foreign key table"
+ " or the referenced table!\n"
+ "InnoDB: The data dictionary of InnoDB is corrupt."
+ " You may need to drop\n"
+ "InnoDB: and recreate the foreign key table"
+ " or the referenced table.\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n"
+ "InnoDB: Latest foreign key error printout:\n%s\n",
+ dict_foreign_err_buf);
+
+ mutex_exit(&dict_foreign_err_mutex);
+ }
+#endif /* 0 */
+ mem_heap_free(heap);
+
+ return(table);
+}
+
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ dulint table_id) /*!< in: table id */
+{
+ byte id_buf[8];
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_table_ids;
+ dict_table_t* sys_tables;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dict_table_t* table;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = NULL;
+
+ /* NOTE that the operation of this function is protected by
+ the dictionary mutex, and therefore no deadlocks can occur
+ with other dictionary operations. */
+
+ mtr_start(&mtr);
+ /*---------------------------------------------------*/
+ /* Get the secondary index based on ID for table SYS_TABLES */
+ sys_tables = dict_sys->sys_tables;
+ sys_table_ids = dict_table_get_next_index(
+ dict_table_get_first_index(sys_tables));
+ ut_a(!dict_table_is_comp(sys_tables));
+ heap = mem_heap_create(256);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ /* Write the table id in byte format to id_buf */
+ mach_write_to_8(id_buf, table_id);
+
+ dfield_set_data(dfield, id_buf, 8);
+ dict_index_copy_types(tuple, sys_table_ids, 1);
+
+ btr_pcur_open_on_user_rec(sys_table_ids, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* Not found */
+ goto func_exit;
+ }
+
+ /* Find the first record that is not delete marked */
+ while (rec_get_deleted_flag(rec, 0)) {
+ if (!btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+ goto func_exit;
+ }
+ rec = btr_pcur_get_rec(&pcur);
+ }
+
+ /*---------------------------------------------------*/
+ /* Now we have the record in the secondary index containing the
+ table ID and NAME */
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+
+ /* Check if the table id in record is the one searched for */
+ if (ut_dulint_cmp(table_id, mach_read_from_8(field)) != 0) {
+ goto func_exit;
+ }
+
+ /* Now we get the table name from the record */
+ field = rec_get_nth_field_old(rec, 1, &len);
+ /* Load the table definition to memory */
+ table = dict_load_table(mem_heap_strdupl(heap, (char*) field, len));
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(table);
+}
+
+/********************************************************************//**
+This function is called when the database is booted. Loads system table
+index definitions except for the clustered index which is added to the
+dictionary cache at booting before calling this function. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table) /*!< in: system table */
+{
+ mem_heap_t* heap;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap = mem_heap_create(1000);
+
+ dict_load_indexes(table, heap);
+
+ mem_heap_free(heap);
+}
+
+/********************************************************************//**
+Loads foreign key constraint col names (also for the referenced table). */
+static
+void
+dict_load_foreign_cols(
+/*===================*/
+ const char* id, /*!< in: foreign constraint id as a
+ null-terminated string */
+ dict_foreign_t* foreign)/*!< in: foreign constraint object */
+{
+ dict_table_t* sys_foreign_cols;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ foreign->foreign_col_names = mem_heap_alloc(
+ foreign->heap, foreign->n_fields * sizeof(void*));
+
+ foreign->referenced_col_names = mem_heap_alloc(
+ foreign->heap, foreign->n_fields * sizeof(void*));
+ mtr_start(&mtr);
+
+ sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+ sys_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
+ ut_a(!dict_table_is_comp(sys_foreign_cols));
+
+ tuple = dtuple_create(foreign->heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, id, ut_strlen(id));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ ut_a(btr_pcur_is_on_user_rec(&pcur));
+ ut_a(!rec_get_deleted_flag(rec, 0));
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_a(len == ut_strlen(id));
+ ut_a(ut_memcmp(id, field, len) == 0);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+ ut_a(len == 4);
+ ut_a(i == mach_read_from_4(field));
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ foreign->foreign_col_names[i] = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(rec, 5, &len);
+ foreign->referenced_col_names[i] = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Loads a foreign key constraint to the dictionary cache.
+@return DB_SUCCESS or error code */
+static
+ulint
+dict_load_foreign(
+/*==============*/
+ const char* id, /*!< in: foreign constraint id as a
+ null-terminated string */
+ ibool check_charsets,
+ /*!< in: TRUE=check charset compatibility */
+ ibool check_recursive)
+ /*!< in: Whether to record the foreign table
+ parent count to avoid unlimited recursive
+ load of chained foreign tables */
+{
+ dict_foreign_t* foreign;
+ dict_table_t* sys_foreign;
+ btr_pcur_t pcur;
+ dict_index_t* sys_index;
+ dtuple_t* tuple;
+ mem_heap_t* heap2;
+ dfield_t* dfield;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint n_fields_and_type;
+ mtr_t mtr;
+ dict_table_t* for_table;
+ dict_table_t* ref_table;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ heap2 = mem_heap_create(1000);
+
+ mtr_start(&mtr);
+
+ sys_foreign = dict_table_get_low("SYS_FOREIGN");
+ sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
+ ut_a(!dict_table_is_comp(sys_foreign));
+
+ tuple = dtuple_create(heap2, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, id, ut_strlen(id));
+ dict_index_copy_types(tuple, sys_index, 1);
+
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)
+ || rec_get_deleted_flag(rec, 0)) {
+ /* Not found */
+
+ fprintf(stderr,
+ "InnoDB: Error A: cannot load foreign constraint %s\n",
+ id);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap2);
+
+ return(DB_ERROR);
+ }
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+
+ /* Check if the id in record is the searched one */
+ if (len != ut_strlen(id) || ut_memcmp(id, field, len) != 0) {
+
+ fprintf(stderr,
+ "InnoDB: Error B: cannot load foreign constraint %s\n",
+ id);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap2);
+
+ return(DB_ERROR);
+ }
+
+ /* Read the table names and the number of columns associated
+ with the constraint */
+
+ mem_heap_free(heap2);
+
+ foreign = dict_mem_foreign_create();
+
+ n_fields_and_type = mach_read_from_4(
+ rec_get_nth_field_old(rec, 5, &len));
+
+ ut_a(len == 4);
+
+ /* We store the type in the bits 24..29 of n_fields_and_type. */
+
+ foreign->type = (unsigned int) (n_fields_and_type >> 24);
+ foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL);
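+
+	/* For example (hypothetical value): n_fields_and_type ==
+	0x01000002 decodes to type == 0x01 and n_fields == 2. */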
+
+ foreign->id = mem_heap_strdup(foreign->heap, id);
+
+ field = rec_get_nth_field_old(rec, 3, &len);
+ foreign->foreign_table_name = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ field = rec_get_nth_field_old(rec, 4, &len);
+ foreign->referenced_table_name = mem_heap_strdupl(
+ foreign->heap, (char*) field, len);
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ dict_load_foreign_cols(id, foreign);
+
+ ref_table = dict_table_check_if_in_cache_low(
+ foreign->referenced_table_name);
+
+	/* We could possibly wind up in deep recursive calls if
+	we called dict_table_get_low() again here when there
+	is a chain of tables concatenated together with
+	foreign constraints. In such a case, each table is
+	both a parent and a child of the other tables, and
+	acts as a "link" in such a table chain.
+	To avoid such a scenario, we would need to check the
+	number of ancestors the current table has. If that
+	exceeds DICT_FK_MAX_RECURSIVE_LOAD, we will stop loading
+	the child table.
+	Foreign constraints are loaded in a breadth-first fashion,
+	that is, the index on FOR_NAME is scanned first, and then
+	the index on REF_NAME. So foreign constraints in which the
+	current table is a child (foreign table) are loaded first,
+	and then those constraints where the current table is a
+	parent (referenced) table.
+	Thus we can check the parent table's (ref_table)
+	reference count (fk_max_recusive_level) to know how deep the
+	recursive call is. If the parent table (ref_table) is already
+	loaded, and its fk_max_recusive_level is larger than
+	DICT_FK_MAX_RECURSIVE_LOAD, we will stop the recursive loading
+	by skipping loading the child table. It will not affect foreign
+	constraint checks for DMLs since the child table will be loaded
+	at that time for the constraint check. */
+ if (!ref_table
+ || ref_table->fk_max_recusive_level < DICT_FK_MAX_RECURSIVE_LOAD) {
+
+ /* If the foreign table is not yet in the dictionary cache, we
+ have to load it so that we are able to make type comparisons
+ in the next function call. */
+
+ for_table = dict_table_get_low(foreign->foreign_table_name);
+
+ if (for_table && ref_table && check_recursive) {
+			/* This records the longest chain of ancestors
+			this table has. If the parent has more ancestors
+			than this table, record the parent's count plus 1
+			(to account for this parent). */
+ if (ref_table->fk_max_recusive_level
+ >= for_table->fk_max_recusive_level) {
+ for_table->fk_max_recusive_level =
+ ref_table->fk_max_recusive_level + 1;
+ }
+ }
+ }
+
+ /* Note that there may already be a foreign constraint object in
+ the dictionary cache for this constraint: then the following
+ call only sets the pointers in it to point to the appropriate table
+ and index objects and frees the newly created object foreign.
+ Adding to the cache should always succeed since we are not creating
+ a new foreign key constraint but loading one from the data
+ dictionary. */
+
+ return(dict_foreign_add_to_cache(foreign, check_charsets));
+}
+
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ ibool check_recursive,/*!< in: Whether to check recursive
+ load of tables chained by FK */
+ ibool check_charsets) /*!< in: TRUE=check charset
+ compatibility */
+{
+ btr_pcur_t pcur;
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sec_index;
+ dict_table_t* sys_foreign;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ char* id ;
+ ulint err;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
+ if (sys_foreign == NULL) {
+ /* No foreign keys defined yet in this database */
+
+ fprintf(stderr,
+ "InnoDB: Error: no foreign key system tables"
+ " in the database\n");
+
+ return(DB_ERROR);
+ }
+
+ ut_a(!dict_table_is_comp(sys_foreign));
+ mtr_start(&mtr);
+
+ /* Get the secondary index based on FOR_NAME from table
+ SYS_FOREIGN */
+
+ sec_index = dict_table_get_next_index(
+ dict_table_get_first_index(sys_foreign));
+start_load:
+ heap = mem_heap_create(256);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(dfield, table_name, ut_strlen(table_name));
+ dict_index_copy_types(tuple, sec_index, 1);
+
+ btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* End of index */
+
+ goto load_next_index;
+ }
+
+ /* Now we have the record in the secondary index containing a table
+ name and a foreign constraint ID */
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, 0, &len);
+
+ /* Check if the table name in the record is the one searched for; the
+ following call does the comparison in the latin1_swedish_ci
+ charset-collation, in a case-insensitive way. */
+
+ if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
+ dfield_get_type(dfield)->prtype,
+ dfield_get_data(dfield), dfield_get_len(dfield),
+ field, len)) {
+
+ goto load_next_index;
+ }
+
+ /* Since table names in SYS_FOREIGN are stored in a case-insensitive
+ order, we have to check that the table name matches also in a binary
+ string comparison. On Unix, MySQL allows table names that only differ
+ in character case. */
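+
+	/* For example (hypothetical names): "test/t1" and "test/T1"
+	compare equal in the check above, but are told apart by this
+	binary comparison. */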
+
+ if (0 != ut_memcmp(field, table_name, len)) {
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(rec, 0)) {
+
+ goto next_rec;
+ }
+
+ /* Now we get a foreign key constraint id */
+ field = rec_get_nth_field_old(rec, 1, &len);
+ id = mem_heap_strdupl(heap, (char*) field, len);
+
+ btr_pcur_store_position(&pcur, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Load the foreign constraint definition to the dictionary cache */
+
+ err = dict_load_foreign(id, check_charsets, check_recursive);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_close(&pcur);
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ mtr_start(&mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ goto loop;
+
+load_next_index:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ sec_index = dict_table_get_next_index(sec_index);
+
+ if (sec_index != NULL) {
+
+ mtr_start(&mtr);
+
+		/* Switch to scanning the index on REF_NAME;
+		fk_max_recusive_level has already been updated while
+		scanning the FOR_NAME index, so there is no need to
+		update it again. */
+ check_recursive = FALSE;
+
+ goto start_load;
+ }
+
+ return(DB_SUCCESS);
+}
diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.c
new file mode 100644
index 00000000000..f2d219bfd4f
--- /dev/null
+++ b/storage/xtradb/dict/dict0mem.c
@@ -0,0 +1,323 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0mem.c
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0mem.h"
+
+#ifdef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#include "rem0rec.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#ifndef UNIV_HOTBACKUP
+# include "lock0lock.h"
+#endif /* !UNIV_HOTBACKUP */
+
+#define DICT_HEAP_SIZE 100 /*!< initial memory heap size when
+ creating a table or index object */
+
+/**********************************************************************//**
+Creates a table memory object.
+@return own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ const char* name, /*!< in: table name */
+ ulint space, /*!< in: space where the clustered index of
+ the table is placed; this parameter is
+ ignored if the table is made a member of
+ a cluster */
+ ulint n_cols, /*!< in: number of columns */
+ ulint flags) /*!< in: table flags */
+{
+ dict_table_t* table;
+ mem_heap_t* heap;
+
+ ut_ad(name);
+ ut_a(!(flags & (~0 << DICT_TF2_BITS)));
+
+ heap = mem_heap_create(DICT_HEAP_SIZE);
+
+ table = mem_heap_zalloc(heap, sizeof(dict_table_t));
+
+ table->heap = heap;
+
+ table->flags = (unsigned int) flags;
+ table->name = ut_malloc(strlen(name) + 1);
+ memcpy(table->name, name, strlen(name) + 1);
+ table->space = (unsigned int) space;
+ table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS);
+
+ table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS)
+ * sizeof(dict_col_t));
+
+#ifndef UNIV_HOTBACKUP
+ table->autoinc_lock = mem_heap_alloc(heap, lock_get_size());
+
+ mutex_create(&table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
+
+ table->autoinc = 0;
+
+ /* The number of transactions that are either waiting on the
+ AUTOINC lock or have been granted the lock. */
+ table->n_waiting_or_granted_auto_inc_locks = 0;
+
+ table->is_corrupt = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
+ return(table);
+}
+
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_d(table->cached = FALSE);
+
+#ifndef UNIV_HOTBACKUP
+ mutex_free(&(table->autoinc_mutex));
+#endif /* !UNIV_HOTBACKUP */
+ ut_free(table->name);
+ mem_heap_free(table->heap);
+}
+
+/****************************************************************//**
+Append 'name' to 'col_names'. @see dict_table_t::col_names
+@return new column names array */
+static
+const char*
+dict_add_col_name(
+/*==============*/
+ const char* col_names, /*!< in: existing column names, or
+ NULL */
+ ulint cols, /*!< in: number of existing columns */
+ const char* name, /*!< in: new column name */
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint old_len;
+ ulint new_len;
+ ulint total_len;
+ char* res;
+
+ ut_ad(!cols == !col_names);
+
+ /* Find out length of existing array. */
+ if (col_names) {
+ const char* s = col_names;
+ ulint i;
+
+ for (i = 0; i < cols; i++) {
+ s += strlen(s) + 1;
+ }
+
+ old_len = s - col_names;
+ } else {
+ old_len = 0;
+ }
+
+ new_len = strlen(name) + 1;
+ total_len = old_len + new_len;
+
+ res = mem_heap_alloc(heap, total_len);
+
+ if (old_len > 0) {
+ memcpy(res, col_names, old_len);
+ }
+
+ memcpy(res + old_len, name, new_len);
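+
+	/* For illustration (hypothetical names): appending "c3" to an
+	existing array "c1\0c2\0" yields "c1\0c2\0c3\0"; the array is
+	simply a concatenation of NUL-terminated names. */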
+
+ return(res);
+}
+
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision */
+{
+ dict_col_t* col;
+#ifndef UNIV_HOTBACKUP
+ ulint mbminlen;
+ ulint mbmaxlen;
+#endif /* !UNIV_HOTBACKUP */
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(!heap == !name);
+
+ i = table->n_def++;
+
+ if (name) {
+ if (UNIV_UNLIKELY(table->n_def == table->n_cols)) {
+ heap = table->heap;
+ }
+ if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
+ /* All preceding column names are empty. */
+ char* s = mem_heap_zalloc(heap, table->n_def);
+ table->col_names = s;
+ }
+
+ table->col_names = dict_add_col_name(table->col_names,
+ i, name, heap);
+ }
+
+ col = dict_table_get_nth_col(table, i);
+
+ col->ind = (unsigned int) i;
+ col->ord_part = 0;
+
+ col->mtype = (unsigned int) mtype;
+ col->prtype = (unsigned int) prtype;
+ col->len = (unsigned int) len;
+
+#ifndef UNIV_HOTBACKUP
+ dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);
+
+ col->mbminlen = (unsigned int) mbminlen;
+ col->mbmaxlen = (unsigned int) mbmaxlen;
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields) /*!< in: number of fields */
+{
+ dict_index_t* index;
+ mem_heap_t* heap;
+
+ ut_ad(table_name && index_name);
+
+ heap = mem_heap_create(DICT_HEAP_SIZE);
+ index = mem_heap_zalloc(heap, sizeof(dict_index_t));
+
+ index->heap = heap;
+
+ index->type = type;
+#ifndef UNIV_HOTBACKUP
+ index->space = (unsigned int) space;
+#endif /* !UNIV_HOTBACKUP */
+ index->name = mem_heap_strdup(heap, index_name);
+ index->table_name = table_name;
+ index->n_fields = (unsigned int) n_fields;
+ index->fields = mem_heap_alloc(heap, 1 + n_fields
+ * sizeof(dict_field_t));
+ /* The '1 +' above prevents allocation
+ of an empty mem block */
+#ifdef UNIV_DEBUG
+ index->magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+ return(index);
+}
+
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void)
+/*=========================*/
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(100);
+
+ foreign = mem_heap_zalloc(heap, sizeof(dict_foreign_t));
+
+ foreign->heap = heap;
+
+ return(foreign);
+}
+
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len) /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+{
+ dict_field_t* field;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->n_def++;
+
+ field = dict_index_get_nth_field(index, index->n_def - 1);
+
+ field->name = name;
+ field->prefix_len = (unsigned int) prefix_len;
+}
+
+/**********************************************************************//**
+Frees an index memory object. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ mem_heap_free(index->heap);
+}
diff --git a/storage/xtradb/dyn/dyn0dyn.c b/storage/xtradb/dyn/dyn0dyn.c
new file mode 100644
index 00000000000..e1275f040f3
--- /dev/null
+++ b/storage/xtradb/dyn/dyn0dyn.c
@@ -0,0 +1,65 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dyn/dyn0dyn.c
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dyn0dyn.h"
+#ifdef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+/************************************************************//**
+Adds a new block to a dyn array.
+@return created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ mem_heap_t* heap;
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ if (arr->heap == NULL) {
+ UT_LIST_INIT(arr->base);
+ UT_LIST_ADD_FIRST(list, arr->base, arr);
+
+ arr->heap = mem_heap_create(sizeof(dyn_block_t));
+ }
+
+ block = dyn_array_get_last_block(arr);
+ block->used = block->used | DYN_BLOCK_FULL_FLAG;
+
+ heap = arr->heap;
+
+ block = mem_heap_alloc(heap, sizeof(dyn_block_t));
+
+ block->used = 0;
+
+ UT_LIST_ADD_LAST(list, arr->base, block);
+
+ return(block);
+}
diff --git a/storage/xtradb/eval/eval0eval.c b/storage/xtradb/eval/eval0eval.c
new file mode 100644
index 00000000000..589b0fa1576
--- /dev/null
+++ b/storage/xtradb/eval/eval0eval.c
@@ -0,0 +1,852 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.c
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+
+#ifdef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#include "data0data.h"
+#include "row0sel.h"
+
+/** The RND function seed */
+static ulint eval_rnd = 128367121;
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf */
+
+static byte eval_dummy;
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+ || que_node_get_type(node) == QUE_NODE_FUNC);
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ if (data && data != &eval_dummy) {
+ mem_free(data);
+ }
+
+ if (size == 0) {
+ data = &eval_dummy;
+ } else {
+ data = mem_alloc(size);
+ }
+
+ que_node_set_val_buf_size(node, size);
+
+ dfield_set_data(dfield, data, size);
+
+ return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+ || que_node_get_type(node) == QUE_NODE_FUNC);
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ if (que_node_get_val_buf_size(node) > 0) {
+ ut_a(data);
+
+ mem_free(data);
+ }
+}
+
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+UNIV_INTERN
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node) /*!< in: comparison node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ int res;
+ ibool val;
+ int func;
+
+ ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
+
+ arg1 = cmp_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ res = cmp_dfield_dfield(que_node_get_val(arg1),
+ que_node_get_val(arg2));
+ val = TRUE;
+
+ func = cmp_node->func;
+
+ if (func == '=') {
+ if (res != 0) {
+ val = FALSE;
+ }
+ } else if (func == '<') {
+ if (res != -1) {
+ val = FALSE;
+ }
+ } else if (func == PARS_LE_TOKEN) {
+ if (res == 1) {
+ val = FALSE;
+ }
+ } else if (func == PARS_NE_TOKEN) {
+ if (res == 0) {
+ val = FALSE;
+ }
+ } else if (func == PARS_GE_TOKEN) {
+ if (res == -1) {
+ val = FALSE;
+ }
+ } else {
+ ut_ad(func == '>');
+
+ if (res != 1) {
+ val = FALSE;
+ }
+ }
+
+ eval_node_set_ibool_val(cmp_node, val);
+
+ return(val);
+}
+
+/*****************************************************************//**
+Evaluates a logical operation node. */
+UNIV_INLINE
+void
+eval_logical(
+/*=========*/
+ func_node_t* logical_node) /*!< in: logical operation node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ ibool val1;
+ ibool val2 = 0; /* remove warning */
+ ibool val = 0; /* remove warning */
+ int func;
+
+ ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC);
+
+ arg1 = logical_node->args;
+ arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */
+
+ val1 = eval_node_get_ibool_val(arg1);
+
+ if (arg2) {
+ val2 = eval_node_get_ibool_val(arg2);
+ }
+
+ func = logical_node->func;
+
+ if (func == PARS_AND_TOKEN) {
+ val = val1 & val2;
+ } else if (func == PARS_OR_TOKEN) {
+ val = val1 | val2;
+ } else if (func == PARS_NOT_TOKEN) {
+ val = TRUE - val1;
+ } else {
+ ut_error;
+ }
+
+ eval_node_set_ibool_val(logical_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an arithmetic operation node. */
+UNIV_INLINE
+void
+eval_arith(
+/*=======*/
+ func_node_t* arith_node) /*!< in: arithmetic operation node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ lint val1;
+ lint val2 = 0; /* remove warning */
+ lint val;
+ int func;
+
+ ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC);
+
+ arg1 = arith_node->args;
+ arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */
+
+ val1 = eval_node_get_int_val(arg1);
+
+ if (arg2) {
+ val2 = eval_node_get_int_val(arg2);
+ }
+
+ func = arith_node->func;
+
+ if (func == '+') {
+ val = val1 + val2;
+ } else if ((func == '-') && arg2) {
+ val = val1 - val2;
+ } else if (func == '-') {
+ val = -val1;
+ } else if (func == '*') {
+ val = val1 * val2;
+ } else {
+ ut_ad(func == '/');
+ val = val1 / val2;
+ }
+
+ eval_node_set_int_val(arith_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an aggregate operation node. */
+UNIV_INLINE
+void
+eval_aggregate(
+/*===========*/
+ func_node_t* node) /*!< in: aggregate operation node */
+{
+ que_node_t* arg;
+ lint val;
+ lint arg_val;
+ int func;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ val = eval_node_get_int_val(node);
+
+ func = node->func;
+
+ if (func == PARS_COUNT_TOKEN) {
+
+ val = val + 1;
+ } else {
+ ut_ad(func == PARS_SUM_TOKEN);
+
+ arg = node->args;
+ arg_val = eval_node_get_int_val(arg);
+
+ val = val + arg_val;
+ }
+
+ eval_node_set_int_val(node, val);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node where the function is not relevant
+in benchmarks. */
+static
+void
+eval_predefined_2(
+/*==============*/
+ func_node_t* func_node) /*!< in: predefined function node */
+{
+ que_node_t* arg;
+ que_node_t* arg1;
+ que_node_t* arg2 = 0; /* remove warning (??? bug ???) */
+ lint int_val;
+ byte* data;
+ ulint len1;
+ ulint len2;
+ int func;
+ ulint i;
+
+ ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+ arg1 = func_node->args;
+
+ if (arg1) {
+ arg2 = que_node_get_next(arg1);
+ }
+
+ func = func_node->func;
+
+ if (func == PARS_PRINTF_TOKEN) {
+
+ arg = arg1;
+
+ while (arg) {
+ dfield_print(que_node_get_val(arg));
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ } else if (func == PARS_ASSERT_TOKEN) {
+
+ if (!eval_node_get_ibool_val(arg1)) {
+ fputs("SQL assertion fails in a stored procedure!\n",
+ stderr);
+ }
+
+ ut_a(eval_node_get_ibool_val(arg1));
+
+ /* This function, or more precisely, a debug procedure,
+ returns no value */
+
+ } else if (func == PARS_RND_TOKEN) {
+
+ len1 = (ulint)eval_node_get_int_val(arg1);
+ len2 = (ulint)eval_node_get_int_val(arg2);
+
+ ut_ad(len2 >= len1);
+
+ if (len2 > len1) {
+ int_val = (lint) (len1
+ + (eval_rnd % (len2 - len1 + 1)));
+ } else {
+ int_val = (lint) len1;
+ }
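+
+		/* The result is thus a pseudo-random integer in the
+		inclusive range [arg1, arg2]. */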
+
+ eval_rnd = ut_rnd_gen_next_ulint(eval_rnd);
+
+ eval_node_set_int_val(func_node, int_val);
+
+ } else if (func == PARS_RND_STR_TOKEN) {
+
+ len1 = (ulint)eval_node_get_int_val(arg1);
+
+ data = eval_node_ensure_val_buf(func_node, len1);
+
+ for (i = 0; i < len1; i++) {
+ data[i] = (byte)(97 + (eval_rnd % 3));
+
+ eval_rnd = ut_rnd_gen_next_ulint(eval_rnd);
+ }
+ } else {
+ ut_error;
+ }
+}
+
+/*****************************************************************//**
+Evaluates a notfound-function node. */
+UNIV_INLINE
+void
+eval_notfound(
+/*==========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ sym_node_t* cursor;
+ sel_node_t* sel_node;
+ ibool ibool_val;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
+
+ cursor = arg1;
+
+ ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
+
+ if (cursor->token_type == SYM_LIT) {
+
+ ut_ad(ut_memcmp(dfield_get_data(que_node_get_val(cursor)),
+ "SQL", 3) == 0);
+
+ sel_node = cursor->sym_table->query_graph->last_sel_node;
+ } else {
+ sel_node = cursor->alias->cursor_def;
+ }
+
+ if (sel_node->state == SEL_NODE_NO_MORE_ROWS) {
+ ibool_val = TRUE;
+ } else {
+ ibool_val = FALSE;
+ }
+
+ eval_node_set_ibool_val(func_node, ibool_val);
+}
+
+/*****************************************************************//**
+Evaluates a substr-function node. */
+UNIV_INLINE
+void
+eval_substr(
+/*========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ que_node_t* arg3;
+ dfield_t* dfield;
+ byte* str1;
+ ulint len1;
+ ulint len2;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ ut_ad(func_node->func == PARS_SUBSTR_TOKEN);
+
+ arg3 = que_node_get_next(arg2);
+
+ str1 = dfield_get_data(que_node_get_val(arg1));
+
+ len1 = (ulint)eval_node_get_int_val(arg2);
+ len2 = (ulint)eval_node_get_int_val(arg3);
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1 + len1, len2);
+}
+
+/*****************************************************************//**
+Evaluates a replstr-procedure node. */
+static
+void
+eval_replstr(
+/*=========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ que_node_t* arg3;
+ que_node_t* arg4;
+ byte* str1;
+ byte* str2;
+ ulint len1;
+ ulint len2;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ ut_ad(que_node_get_type(arg1) == QUE_NODE_SYMBOL);
+
+ arg3 = que_node_get_next(arg2);
+ arg4 = que_node_get_next(arg3);
+
+ str1 = dfield_get_data(que_node_get_val(arg1));
+ str2 = dfield_get_data(que_node_get_val(arg2));
+
+ len1 = (ulint)eval_node_get_int_val(arg3);
+ len2 = (ulint)eval_node_get_int_val(arg4);
+
+ if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2)
+ || (dfield_get_len(que_node_get_val(arg2)) < len2)) {
+
+ ut_error;
+ }
+
+ ut_memcpy(str1 + len1, str2, len2);
+}
+
+/*****************************************************************//**
+Evaluates an instr-function node. */
+static
+void
+eval_instr(
+/*=======*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ dfield_t* dfield1;
+ dfield_t* dfield2;
+ lint int_val;
+ byte* str1;
+ byte* str2;
+ byte match_char;
+ ulint len1;
+ ulint len2;
+ ulint i;
+ ulint j;
+
+ arg1 = func_node->args;
+ arg2 = que_node_get_next(arg1);
+
+ dfield1 = que_node_get_val(arg1);
+ dfield2 = que_node_get_val(arg2);
+
+ str1 = dfield_get_data(dfield1);
+ str2 = dfield_get_data(dfield2);
+
+ len1 = dfield_get_len(dfield1);
+ len2 = dfield_get_len(dfield2);
+
+ if (len2 == 0) {
+ ut_error;
+ }
+
+ match_char = str2[0];
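+
+	/* For example (hypothetical input): searching for "ca" in
+	"abcabc" matches at byte offset 2 and sets int_val to the
+	1-based position 3; int_val is 0 when there is no match. */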
+
+ for (i = 0; i < len1; i++) {
+ /* In this outer loop, the number of matched characters is 0 */
+
+ if (str1[i] == match_char) {
+
+ if (i + len2 > len1) {
+
+ break;
+ }
+
+ for (j = 1;; j++) {
+ /* We have already matched j characters */
+
+ if (j == len2) {
+ int_val = i + 1;
+
+ goto match_found;
+ }
+
+ if (str1[i + j] != str2[j]) {
+
+ break;
+ }
+ }
+ }
+ }
+
+ int_val = 0;
+
+match_found:
+ eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+UNIV_INLINE
+void
+eval_binary_to_number(
+/*==================*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ dfield_t* dfield;
+ byte* str1;
+ byte* str2;
+ ulint len1;
+ ulint int_val;
+
+ arg1 = func_node->args;
+
+ dfield = que_node_get_val(arg1);
+
+ str1 = dfield_get_data(dfield);
+ len1 = dfield_get_len(dfield);
+
+ if (len1 > 4) {
+ ut_error;
+ }
+
+ if (len1 == 4) {
+ str2 = str1;
+ } else {
+ int_val = 0;
+ str2 = (byte*)&int_val;
+
+ ut_memcpy(str2 + (4 - len1), str1, len1);
+ }
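+
+	/* For example (hypothetical input): a 2-byte argument is copied
+	to offset 2 of the 4 output bytes, so the value is padded on
+	the left with zero bytes. */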
+
+ eval_node_copy_and_alloc_val(func_node, str2, 4);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+static
+void
+eval_concat(
+/*========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+ dfield_t* dfield;
+ byte* data;
+ ulint len;
+ ulint len1;
+
+ arg = func_node->args;
+ len = 0;
+
+ while (arg) {
+ len1 = dfield_get_len(que_node_get_val(arg));
+
+ len += len1;
+
+ arg = que_node_get_next(arg);
+ }
+
+ data = eval_node_ensure_val_buf(func_node, len);
+
+ arg = func_node->args;
+ len = 0;
+
+ while (arg) {
+ dfield = que_node_get_val(arg);
+ len1 = dfield_get_len(dfield);
+
+ ut_memcpy(data + len, dfield_get_data(dfield), len1);
+
+ len += len1;
+
+ arg = que_node_get_next(arg);
+ }
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. If the first argument is an integer,
+this function looks at the second argument which is the integer length in
+bytes, and converts the integer to a VARCHAR.
+If the first argument is of some other type, this function converts it to
+BINARY. */
+UNIV_INLINE
+void
+eval_to_binary(
+/*===========*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ dfield_t* dfield;
+ byte* str1;
+ ulint len;
+ ulint len1;
+
+ arg1 = func_node->args;
+
+ str1 = dfield_get_data(que_node_get_val(arg1));
+
+ if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
+
+ len = dfield_get_len(que_node_get_val(arg1));
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1, len);
+
+ return;
+ }
+
+ arg2 = que_node_get_next(arg1);
+
+ len1 = (ulint)eval_node_get_int_val(arg2);
+
+ if (len1 > 4) {
+
+ ut_error;
+ }
+
+ dfield = que_node_get_val(func_node);
+
+ dfield_set_data(dfield, str1 + (4 - len1), len1);
+}
+
+/*****************************************************************//**
+Evaluates a predefined function node. */
+UNIV_INLINE
+void
+eval_predefined(
+/*============*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg1;
+ lint int_val;
+ byte* data;
+ int func;
+
+ func = func_node->func;
+
+ arg1 = func_node->args;
+
+ if (func == PARS_LENGTH_TOKEN) {
+
+ int_val = (lint)dfield_get_len(que_node_get_val(arg1));
+
+ } else if (func == PARS_TO_CHAR_TOKEN) {
+
+ /* Convert number to character string as a
+ signed decimal integer. */
+
+ ulint uint_val;
+ int int_len;
+
+ int_val = eval_node_get_int_val(arg1);
+
+ /* Determine the length of the string. */
+
+ if (int_val == 0) {
+ int_len = 1; /* the number 0 occupies 1 byte */
+ } else {
+ int_len = 0;
+ if (int_val < 0) {
+ uint_val = ((ulint) -int_val - 1) + 1;
+ int_len++; /* reserve space for minus sign */
+ } else {
+ uint_val = (ulint) int_val;
+ }
+ for (; uint_val > 0; int_len++) {
+ uint_val /= 10;
+ }
+ }
+
+ /* allocate the string */
+ data = eval_node_ensure_val_buf(func_node, int_len + 1);
+
+ /* add terminating NUL character */
+ data[int_len] = 0;
+
+ /* convert the number */
+
+ if (int_val == 0) {
+ data[0] = '0';
+ } else {
+ int tmp;
+ if (int_val < 0) {
+ data[0] = '-'; /* preceding minus sign */
+ uint_val = ((ulint) -int_val - 1) + 1;
+ } else {
+ uint_val = (ulint) int_val;
+ }
+ for (tmp = int_len; uint_val > 0; uint_val /= 10) {
+ data[--tmp] = (byte)
+ ('0' + (byte)(uint_val % 10));
+ }
+ }
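+
+		/* For example (hypothetical value): int_val == -105
+		yields int_len == 4 and the buffer "-105" followed by
+		a terminating NUL. */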
+
+ dfield_set_len(que_node_get_val(func_node), int_len);
+
+ return;
+
+ } else if (func == PARS_TO_NUMBER_TOKEN) {
+
+ int_val = atoi((char*)
+ dfield_get_data(que_node_get_val(arg1)));
+
+ } else if (func == PARS_SYSDATE_TOKEN) {
+ int_val = (lint)ut_time();
+ } else {
+ eval_predefined_2(func_node);
+
+ return;
+ }
+
+ eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a function node. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+ ulint class;
+ ulint func;
+
+ ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+ class = func_node->class;
+ func = func_node->func;
+
+ arg = func_node->args;
+
+ /* Evaluate first the argument list */
+ while (arg) {
+ eval_exp(arg);
+
+ /* The functions are not defined for SQL null argument
+ values, except for eval_cmp and notfound */
+
+ if (dfield_is_null(que_node_get_val(arg))
+ && (class != PARS_FUNC_CMP)
+ && (func != PARS_NOTFOUND_TOKEN)
+ && (func != PARS_PRINTF_TOKEN)) {
+ ut_error;
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ if (class == PARS_FUNC_CMP) {
+ eval_cmp(func_node);
+ } else if (class == PARS_FUNC_ARITH) {
+ eval_arith(func_node);
+ } else if (class == PARS_FUNC_AGGREGATE) {
+ eval_aggregate(func_node);
+ } else if (class == PARS_FUNC_PREDEFINED) {
+
+				 " InnoDB: Fatal error: cannot open %s.\n"
+ eval_notfound(func_node);
+ } else if (func == PARS_SUBSTR_TOKEN) {
+ eval_substr(func_node);
+ } else if (func == PARS_REPLSTR_TOKEN) {
+ eval_replstr(func_node);
+ } else if (func == PARS_INSTR_TOKEN) {
+ eval_instr(func_node);
+ } else if (func == PARS_BINARY_TO_NUMBER_TOKEN) {
+ eval_binary_to_number(func_node);
+ } else if (func == PARS_CONCAT_TOKEN) {
+ eval_concat(func_node);
+ } else if (func == PARS_TO_BINARY_TOKEN) {
+ eval_to_binary(func_node);
+ } else {
+ eval_predefined(func_node);
+ }
+ } else {
+ ut_ad(class == PARS_FUNC_LOGICAL);
+
+ eval_logical(func_node);
+ }
+}
diff --git a/storage/xtradb/eval/eval0proc.c b/storage/xtradb/eval/eval0proc.c
new file mode 100644
index 00000000000..3a4218d92bf
--- /dev/null
+++ b/storage/xtradb/eval/eval0proc.c
@@ -0,0 +1,295 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0proc.c
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "eval0proc.h"
+
+#ifdef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ if_node_t* node;
+ elsif_node_t* elsif_node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_IF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Evaluate the condition */
+
+ eval_exp(node->cond);
+
+ if (eval_node_get_ibool_val(node->cond)) {
+
+ /* The condition evaluated to TRUE: start execution
+ from the first statement in the statement list */
+
+ thr->run_node = node->stat_list;
+
+ } else if (node->else_part) {
+ thr->run_node = node->else_part;
+
+ } else if (node->elsif_list) {
+ elsif_node = node->elsif_list;
+
+ for (;;) {
+ eval_exp(elsif_node->cond);
+
+ if (eval_node_get_ibool_val(
+ elsif_node->cond)) {
+
+ /* The condition evaluated to TRUE:
+ start execution from the first
+ statement in the statement list */
+
+ thr->run_node = elsif_node->stat_list;
+
+ break;
+ }
+
+ elsif_node = que_node_get_next(elsif_node);
+
+ if (elsif_node == NULL) {
+ thr->run_node = NULL;
+
+ break;
+ }
+ }
+ } else {
+ thr->run_node = NULL;
+ }
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ while_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
+
+ ut_ad((thr->prev_node == que_node_get_parent(node))
+ || (que_node_get_next(thr->prev_node) == NULL));
+
+ /* Evaluate the condition */
+
+ eval_exp(node->cond);
+
+ if (eval_node_get_ibool_val(node->cond)) {
+
+ /* The condition evaluated to TRUE: start execution
+ from the first statement in the statement list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ assign_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
+
+ /* Evaluate the value to assign */
+
+ eval_exp(node->val);
+
+ eval_node_copy_val(node->var->alias, node->val);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ for_node_t* node;
+ que_node_t* parent;
+ lint loop_var_value;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
+
+ parent = que_node_get_parent(node);
+
+ if (thr->prev_node != parent) {
+
+ /* Move to the next statement */
+ thr->run_node = que_node_get_next(thr->prev_node);
+
+ if (thr->run_node != NULL) {
+
+ return(thr);
+ }
+
+ /* Increment the value of loop_var */
+
+ loop_var_value = 1 + eval_node_get_int_val(node->loop_var);
+ } else {
+ /* Initialize the loop */
+
+ eval_exp(node->loop_start_limit);
+ eval_exp(node->loop_end_limit);
+
+ loop_var_value = eval_node_get_int_val(node->loop_start_limit);
+
+ node->loop_end_value
+ = (int) eval_node_get_int_val(node->loop_end_limit);
+ }
+
+ /* Check if we should do another loop */
+
+ if (loop_var_value > node->loop_end_value) {
+
+ /* Enough loops done */
+
+ thr->run_node = parent;
+ } else {
+ eval_node_set_int_val(node->loop_var, loop_var_value);
+
+ thr->run_node = node->stat_list;
+ }
+
+ return(thr);
+}
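
/* The FOR node above behaves like the plain C loop sketched below (an
illustration, not InnoDB code): both limits are evaluated once, when the
loop is entered, and the end limit is inclusive. */

#include <stdio.h>

int
main(void)
{
	long	loop_start_limit = 1;	/* evaluated once, on entry */
	long	loop_end_value = 3;	/* evaluated once, on entry;
					the bound is inclusive */
	long	loop_var;

	for (loop_var = loop_start_limit;
	     loop_var <= loop_end_value;
	     loop_var++) {

		/* this corresponds to running node->stat_list */
		printf("stat_list runs with loop_var = %ld\n", loop_var);
	}

	return(0);
}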
+
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ exit_node_t* node;
+ que_node_t* loop_node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
+
+ /* Loops exit by setting thr->run_node as the loop node's parent, so
+ find our containing loop node and get its parent. */
+
+ loop_node = que_node_get_containing_loop_node(node);
+
+ /* If someone uses an EXIT statement outside of a loop, this will
+ trigger. */
+ ut_a(loop_node);
+
+ thr->run_node = que_node_get_parent(loop_node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ return_node_t* node;
+ que_node_t* parent;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
+
+ parent = node;
+
+ while (que_node_get_type(parent) != QUE_NODE_PROC) {
+
+ parent = que_node_get_parent(parent);
+ }
+
+ ut_a(parent);
+
+ thr->run_node = que_node_get_parent(parent);
+
+ return(thr);
+}
diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c
new file mode 100644
index 00000000000..a8520187013
--- /dev/null
+++ b/storage/xtradb/fil/fil0fil.c
@@ -0,0 +1,5438 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.c
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "row0row.h"
+#include "que0que.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "ibuf0ibuf.h"
+# include "sync0sync.h"
+# include "os0sync.h"
+#else /* !UNIV_HOTBACKUP */
+static ulint srv_data_read, srv_data_written;
+#endif /* !UNIV_HOTBACKUP */
+
+/*
+ IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
+ =============================================
+
+The tablespace cache is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to the one of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. On a Pentium 100 MHz with NT, an NTFS file system,
+and an EIDE Conner disk, we measured only a negligible difference in speed
+between reading from a file and reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+to a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
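
/* A standalone sketch (not part of this file) of the catenated-file
addressing described above: a 32-bit block number is resolved to a file
in the chain and a block offset within that file by subtracting the size
of each file in turn.  The file sizes in main() are made-up example
values. */

#include <stdio.h>
#include <stdint.h>

static int
resolve_block(const uint32_t* file_sizes, int n_files, uint32_t block_no,
	      int* file_index, uint32_t* block_in_file)
{
	int	i;

	for (i = 0; i < n_files; i++) {
		if (block_no < file_sizes[i]) {
			*file_index = i;
			*block_in_file = block_no;

			return(0);
		}
		block_no -= file_sizes[i];
	}

	return(-1);	/* the address is past the end of the space */
}

int
main(void)
{
	/* e.g. a space made of three files of 64, 128 and 64 blocks */
	uint32_t	sizes[3] = {64, 128, 64};
	int		file;
	uint32_t	offset;

	if (resolve_block(sizes, 3, 100, &file, &offset) == 0) {
		/* prints: block 100 -> file 1, block 36 */
		printf("block 100 -> file %d, block %lu\n",
		       file, (unsigned long) offset);
	}

	return(0);
}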
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+UNIV_INTERN const char* fil_path_to_mysql_datadir = ".";
+
+/** The number of fsyncs done to the log */
+UNIV_INTERN ulint fil_n_log_flushes = 0;
+
+/** Number of pending redo log flushes */
+UNIV_INTERN ulint fil_n_pending_log_flushes = 0;
+/** Number of pending tablespace flushes */
+UNIV_INTERN ulint fil_n_pending_tablespace_flushes = 0;
+
+/** The null file address */
+UNIV_INTERN fil_addr_t fil_addr_null = {FIL_NULL, 0};
+
+/** File node of a tablespace or the log data space */
+struct fil_node_struct {
+ fil_space_t* space; /*!< backpointer to the space where this node
+ belongs */
+ char* name; /*!< path to the file */
+ ibool open; /*!< TRUE if file open */
+ os_file_t handle; /*!< OS handle to the file, if file open */
+ ibool is_raw_disk;/*!< TRUE if the 'file' is actually a raw
+ device or a raw disk partition */
+ ulint size; /*!< size of the file in database pages, 0 if
+ not known yet; the possible last incomplete
+ megabyte may be ignored if space == 0 */
+ ulint n_pending;
+ /*!< count of pending i/o's on this file;
+ closing of the file is not allowed if
+ this is > 0 */
+ ulint n_pending_flushes;
+ /*!< count of pending flushes on this file;
+ closing of the file is not allowed if
+ this is > 0 */
+ ib_int64_t modification_counter;/*!< when we write to the file we
+ increment this by one */
+ ib_int64_t flush_counter;/*!< up to what
+ modification_counter value we have
+ flushed the modifications to disk */
+ UT_LIST_NODE_T(fil_node_t) chain;
+ /*!< link field for the file chain */
+ UT_LIST_NODE_T(fil_node_t) LRU;
+ /*!< link field for the LRU list */
+ ulint magic_n;/*!< FIL_NODE_MAGIC_N */
+};
+
+/** Value of fil_node_struct::magic_n */
+#define FIL_NODE_MAGIC_N 89389
+
+/** Tablespace or log data space: let us call them by a common name space */
+struct fil_space_struct {
+ char* name; /*!< space name = the path to the first file in
+ it */
+ ulint id; /*!< space id */
+ ib_int64_t tablespace_version;
+ /*!< in DISCARD/IMPORT this timestamp
+ is used to check if we should ignore
+ an insert buffer merge request for a
+ page because it actually was for the
+ previous incarnation of the space */
+ ibool mark; /*!< this is set to TRUE at database startup if
+ the space corresponds to a table in the InnoDB
+ data dictionary; so we can print a warning of
+ orphaned tablespaces */
+ ibool stop_ios;/*!< TRUE if we want to rename the
+ .ibd file of tablespace and want to
+ stop temporarily posting of new i/o
+ requests on the file */
+ ibool stop_ibuf_merges;
+ /*!< we set this TRUE when we start
+ deleting a single-table tablespace */
+ ibool is_being_deleted;
+ /*!< this is set to TRUE when we start
+ deleting a single-table tablespace and its
+ file; when this flag is set no further i/o
+ or flush requests can be placed on this space,
+ though there may be such requests still being
+ processed on this space */
+ ulint purpose;/*!< FIL_TABLESPACE, FIL_LOG, or
+ FIL_ARCH_LOG */
+ UT_LIST_BASE_NODE_T(fil_node_t) chain;
+ /*!< base node for the file chain */
+ ulint size; /*!< space size in pages; 0 if a single-table
+ tablespace whose size we do not know yet;
+ last incomplete megabytes in data files may be
+ ignored if space == 0 */
+ ulint flags; /*!< compressed page size and file format, or 0 */
+ ulint n_reserved_extents;
+ /*!< number of reserved free extents for
+ ongoing operations like B-tree page split */
+ ulint n_pending_flushes; /*!< this is positive when flushing
+ the tablespace to disk; dropping of the
+ tablespace is forbidden if this is positive */
+ ulint n_pending_ibuf_merges;/*!< this is positive
+ when merging insert buffer entries to
+ a page so that we may need to access
+ the ibuf bitmap page in the
+				tablespace: dropping of the tablespace
+ is forbidden if this is positive */
+ hash_node_t hash; /*!< hash chain node */
+	hash_node_t	name_hash;/*!< hash chain node in the name_hash table */
+#ifndef UNIV_HOTBACKUP
+ rw_lock_t latch; /*!< latch protecting the file space storage
+ allocation */
+#endif /* !UNIV_HOTBACKUP */
+ UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
+ /*!< list of spaces with at least one unflushed
+ file we have written to */
+ ibool is_in_unflushed_spaces; /*!< TRUE if this space is
+ currently in unflushed_spaces */
+ ibool is_corrupt;
+ UT_LIST_NODE_T(fil_space_t) space_list;
+ /*!< list of all spaces */
+ ulint magic_n;/*!< FIL_SPACE_MAGIC_N */
+};
+
+/** Value of fil_space_struct::magic_n */
+#define FIL_SPACE_MAGIC_N 89472
+
+/** The tablespace memory cache */
+typedef struct fil_system_struct fil_system_t;
+
+/** The tablespace memory cache; also the totality of logs (the log
+data space) is stored here; below we talk about tablespaces, but also
+the ib_logfiles form a 'space' and it is handled here */
+
+struct fil_system_struct {
+#ifndef UNIV_HOTBACKUP
+ mutex_t mutex; /*!< The mutex protecting the cache */
+#endif /* !UNIV_HOTBACKUP */
+ hash_table_t* spaces; /*!< The hash table of spaces in the
+ system; they are hashed on the space
+ id */
+ hash_table_t* name_hash; /*!< hash table based on the space
+ name */
+ UT_LIST_BASE_NODE_T(fil_node_t) LRU;
+ /*!< base node for the LRU list of the
+ most recently used open files with no
+ pending i/o's; if we start an i/o on
+ the file, we first remove it from this
+ list, and return it to the start of
+ the list when the i/o ends;
+ log files and the system tablespace are
+ not put to this list: they are opened
+ after the startup, and kept open until
+ shutdown */
+ UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
+ /*!< base node for the list of those
+ tablespaces whose files contain
+ unflushed writes; those spaces have
+ at least one file node where
+ modification_counter > flush_counter */
+ ulint n_open; /*!< number of files currently open */
+ ulint max_n_open; /*!< n_open is not allowed to exceed
+ this */
+ ib_int64_t modification_counter;/*!< when we write to a file we
+ increment this by one */
+ ulint max_assigned_id;/*!< maximum space id in the existing
+ tables, or assigned during the time
+ mysqld has been up; at an InnoDB
+ startup we scan the data dictionary
+ and set here the maximum of the
+ space id's of the tables there */
+ ib_int64_t tablespace_version;
+ /*!< a counter which is incremented for
+ every space object memory creation;
+ every space mem object gets a
+ 'timestamp' from this; in DISCARD/
+ IMPORT this is used to check if we
+ should ignore an insert buffer merge
+ request */
+ UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ /*!< list of all file spaces */
+ ibool space_id_reuse_warned;
+					/*!< TRUE if fil_space_create()
+ has issued a warning about
+ potential space_id reuse */
+};
+
+/** The tablespace memory cache. This variable is NULL before the module is
+initialized. */
+static fil_system_t* fil_system = NULL;
+
+
+/********************************************************************//**
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex. */
+static
+void
+fil_node_prepare_for_io(
+/*====================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space); /*!< in: space */
+/********************************************************************//**
+Updates the data structures when an i/o operation finishes. Updates the
+pending i/o's field in the node appropriately. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ ulint type); /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
+ the node as modified if
+ type == OS_FILE_WRITE */
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return space id, ULINT_UNDEFINED if not found */
+static
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+ const char* name); /*!< in: table name in the standard
+ 'databasename/tablename' format */
+/*******************************************************************//**
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. There must not be any pending i/o's or
+flushes on the files. */
+static
+ibool
+fil_space_free(
+/*===========*/
+ /* out: TRUE if success */
+ ulint id, /* in: space id */
+ ibool own_mutex);/* in: TRUE if own system->mutex */
+/********************************************************************//**
+Reads data from a space to a buffer. Remember that the possible incomplete
+blocks at the end of file are ignored: they are not taken into account when
+calculating the byte offset within a space.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INLINE
+ulint
+fil_read(
+/*=====*/
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in aio
+ this must be divisible by the OS block size */
+ ulint len, /*!< in: how many bytes to read; this must not
+ cross a file boundary; in aio this must be a
+ block size multiple */
+ void* buf, /*!< in/out: buffer where to store data read;
+ in aio this must be appropriately aligned */
+ void* message) /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+{
+ return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset,
+ byte_offset, len, buf, message));
+}
+
+/********************************************************************//**
+Writes data to a space from a buffer. Remember that the possible incomplete
+blocks at the end of file are ignored: they are not taken into account when
+calculating the byte offset within a space.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INLINE
+ulint
+fil_write(
+/*======*/
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in aio
+ this must be divisible by the OS block size */
+ ulint len, /*!< in: how many bytes to write; this must
+ not cross a file boundary; in aio this must
+ be a block size multiple */
+ void* buf, /*!< in: buffer from which to write; in aio
+ this must be appropriately aligned */
+ void* message) /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+{
+ return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
+ byte_offset, len, buf, message));
+}
+
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ HASH_SEARCH(hash, fil_system->spaces, id,
+ fil_space_t*, space,
+ ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+ space->id == id);
+
+ return(space);
+}
+
+/*******************************************************************//**
+Returns the table space by a given name, NULL if not found. */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_by_name(
+/*==================*/
+ const char* name) /*!< in: space name */
+{
+ fil_space_t* space;
+ ulint fold;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ fold = ut_fold_string(name);
+
+ HASH_SEARCH(name_hash, fil_system->name_hash, fold,
+ fil_space_t*, space,
+ ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+ !strcmp(name, space->name));
+
+ return(space);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ib_int64_t version = -1;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space) {
+ version = space->tablespace_version;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(version);
+}
+
+/*******************************************************************//**
+Returns the latch of a file space.
+@return latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ ulint id, /*!< in: space id */
+ ulint* flags) /*!< out: tablespace flags */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ if (flags) {
+ *flags = space->flags;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(&(space->latch));
+}
+
+/*******************************************************************//**
+Returns the type of a file space.
+@return FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(space->purpose);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Checks if all the file nodes in a space are flushed. The caller must hold
+the fil_system mutex.
+@return TRUE if all are flushed */
+static
+ibool
+fil_space_is_flushed(
+/*=================*/
+ fil_space_t* space) /*!< in: space */
+{
+ fil_node_t* node;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ while (node) {
+ if (node->modification_counter > node->flush_counter) {
+
+ return(FALSE);
+ }
+
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+
+ return(TRUE);
+}
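
/* A minimal sketch (not part of this file) of the bookkeeping that
fil_space_is_flushed() checks: every file node carries a modification
counter and a flush counter, and the space counts as flushed only when no
node has writes newer than its last flush.  Plain C types stand in for
the fil_node_t fields. */

#include <stdio.h>
#include <stdint.h>

struct node_sketch {
	int64_t	modification_counter;	/* incremented on every write */
	int64_t	flush_counter;		/* caught up on every flush */
};

static int
space_is_flushed(const struct node_sketch* nodes, int n)
{
	int	i;

	for (i = 0; i < n; i++) {
		if (nodes[i].modification_counter
		    > nodes[i].flush_counter) {

			return(0);	/* an unflushed write exists */
		}
	}

	return(1);
}

int
main(void)
{
	struct node_sketch	chain[2] = {{5, 5}, {7, 6}};

	printf("%d\n", space_is_flushed(chain, 2));	/* prints: 0 */

	chain[1].flush_counter = 7;			/* "flush" node 1 */
	printf("%d\n", space_is_flushed(chain, 2));	/* prints: 1 */

	return(0);
}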
+
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+ const char* name, /*!< in: file name (file must be closed) */
+ ulint size, /*!< in: file size in database blocks, rounded
+ downwards to an integer */
+ ulint id, /*!< in: space id where to append */
+ ibool is_raw) /*!< in: TRUE if a raw device or
+ a raw disk partition */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+
+ ut_a(fil_system);
+ ut_a(name);
+
+ mutex_enter(&fil_system->mutex);
+
+ node = mem_alloc(sizeof(fil_node_t));
+
+ node->name = mem_strdup(name);
+ node->open = FALSE;
+
+ ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+ node->is_raw_disk = is_raw;
+ node->size = size;
+ node->magic_n = FIL_NODE_MAGIC_N;
+ node->n_pending = 0;
+ node->n_pending_flushes = 0;
+
+ node->modification_counter = 0;
+ node->flush_counter = 0;
+
+ space = fil_space_get_by_id(id);
+
+ if (!space) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: Could not find tablespace %lu for\n"
+ "InnoDB: file ", (ulong) id);
+ ut_print_filename(stderr, name);
+ fputs(" in the tablespace memory cache.\n", stderr);
+ mem_free(node->name);
+
+ mem_free(node);
+
+ mutex_exit(&fil_system->mutex);
+
+ return;
+ }
+
+ space->size += size;
+
+ node->space = space;
+
+ UT_LIST_ADD_LAST(chain, space->chain, node);
+
+ if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
+
+ fil_system->max_assigned_id = id;
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/********************************************************************//**
+Opens the file of a node of a tablespace. The caller must own the fil_system
+mutex. */
+static
+void
+fil_node_open_file(
+/*===============*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space) /*!< in: space */
+{
+ ib_uint64_t size_bytes;
+ ulint size_low;
+ ulint size_high;
+ ibool ret;
+ ibool success;
+ byte* buf2;
+ byte* page;
+ ulint space_id;
+ ulint flags;
+
+ ut_ad(mutex_own(&(system->mutex)));
+ ut_a(node->n_pending == 0);
+ ut_a(node->open == FALSE);
+
+ if (node->size == 0) {
+ /* It must be a single-table tablespace and we do not know the
+ size of the file yet. First we open the file in the normal
+ mode, no async I/O here, for simplicity. Then do some checks,
+ and close the file again.
+ NOTE that we could not use the simple file read function
+ os_file_read() in Windows to read from a file opened for
+ async I/O! */
+
+ node->handle = os_file_create_simple_no_error_handling(
+ node->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Fatal error: cannot open %s\n."
+ "InnoDB: Have you deleted .ibd files"
+ " under a running mysqld server?\n",
+ node->name);
+ ut_a(0);
+ }
+
+ os_file_get_size(node->handle, &size_low, &size_high);
+
+ size_bytes = (((ib_uint64_t)size_high) << 32)
+ + (ib_uint64_t)size_low;
+#ifdef UNIV_HOTBACKUP
+ if (trx_sys_sys_space(space->id)) {
+ node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ os_file_close(node->handle);
+ goto add_size;
+ }
+#endif /* UNIV_HOTBACKUP */
+ ut_a(space->purpose != FIL_LOG);
+ ut_a(!trx_sys_sys_space(space->id));
+
+ if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: the size of single-table"
+ " tablespace file %s\n"
+ "InnoDB: is only %lu %lu,"
+ " should be at least %lu!\n",
+ node->name,
+ (ulong) size_high,
+ (ulong) size_low,
+ (ulong) (FIL_IBD_FILE_INITIAL_SIZE
+ * UNIV_PAGE_SIZE));
+
+ ut_a(0);
+ }
+
+ /* Read the first page of the tablespace */
+
+ buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+ /* Align the memory for file i/o if we might have O_DIRECT
+ set */
+ page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ success = os_file_read(node->handle, page, 0, 0,
+ UNIV_PAGE_SIZE);
+ space_id = fsp_header_get_space_id(page);
+ flags = fsp_header_get_flags(page);
+
+ ut_free(buf2);
+
+ /* Close the file now that we have read the space id from it */
+
+ os_file_close(node->handle);
+
+ if (UNIV_UNLIKELY(space_id != space->id)) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace id is %lu"
+ " in the data dictionary\n"
+ "InnoDB: but in file %s it is %lu!\n",
+ space->id, node->name, space_id);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
+ || trx_sys_sys_space(space_id))) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace id %lu"
+ " in file %s is not sensible\n",
+ (ulong) space_id, node->name);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(space->flags != flags)) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags are %lx"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file %s are %lx!\n",
+ space->flags, node->name, flags);
+
+ ut_error;
+ }
+
+ if (size_bytes >= 1024 * 1024) {
+ /* Truncate the size to whole megabytes. */
+ size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+ }
+
+ if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+ node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ } else {
+ node->size = (ulint)
+ (size_bytes
+ / dict_table_flags_to_zip_size(flags));
+ }
+
+#ifdef UNIV_HOTBACKUP
+add_size:
+#endif /* UNIV_HOTBACKUP */
+ space->size += node->size;
+ }
+
+ /* printf("Opening file %s\n", node->name); */
+
+ /* Open the file for reading and writing, in Windows normally in the
+ unbuffered async I/O mode, though global variables may make
+ os_file_create() to fall back to the normal file I/O mode. */
+
+ if (space->purpose == FIL_LOG) {
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_LOG_FILE, &ret);
+ } else if (node->is_raw_disk) {
+ node->handle = os_file_create(node->name,
+ OS_FILE_OPEN_RAW,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
+ } else {
+ node->handle = os_file_create(node->name, OS_FILE_OPEN,
+ OS_FILE_AIO, OS_DATA_FILE, &ret);
+ }
+
+ ut_a(ret);
+
+ node->open = TRUE;
+
+ system->n_open++;
+
+	if (space->purpose == FIL_TABLESPACE
+	    && !trx_sys_sys_space(space->id)) {
+ /* Put the node to the LRU list */
+ UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+ }
+}
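
/* A standalone sketch (not part of this file) of the size computation in
fil_node_open_file() above: a file of at least 1 MB is rounded down to a
whole megabyte, and the page count is the rounded byte size divided by
the page size, or by the compressed page size for a compressed
tablespace.  The 16 KB page size and the sizes in main() are assumptions
for illustration. */

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_SKETCH	16384ULL

static uint64_t
node_size_in_pages(uint64_t size_bytes, uint64_t zip_size)
{
	if (size_bytes >= 1024 * 1024) {
		/* truncate to whole megabytes; 1 MB is a power of two,
		so masking behaves like ut_2pow_round() */
		size_bytes &= ~((uint64_t) (1024 * 1024 - 1));
	}

	return(size_bytes / (zip_size ? zip_size : PAGE_SIZE_SKETCH));
}

int
main(void)
{
	/* a 10.5 MB uncompressed file: 10 MB / 16 KB = 640 pages */
	printf("%llu\n", (unsigned long long)
	       node_size_in_pages(10 * 1024 * 1024 + 512 * 1024, 0));

	/* the same bytes with 8 KB compressed pages: 1280 pages */
	printf("%llu\n", (unsigned long long)
	       node_size_in_pages(10 * 1024 * 1024 + 512 * 1024, 8192));

	return(0);
}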
+
+/**********************************************************************//**
+Closes a file. */
+static
+void
+fil_node_close_file(
+/*================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system) /*!< in: tablespace memory cache */
+{
+ ibool ret;
+
+ ut_ad(node && system);
+ ut_ad(mutex_own(&(system->mutex)));
+ ut_a(node->open);
+ ut_a(node->n_pending == 0);
+ ut_a(node->n_pending_flushes == 0);
+ ut_a(node->modification_counter == node->flush_counter);
+
+ ret = os_file_close(node->handle);
+ ut_a(ret);
+
+ /* printf("Closing file %s\n", node->name); */
+
+ node->open = FALSE;
+ ut_a(system->n_open > 0);
+ system->n_open--;
+
+	if (node->space->purpose == FIL_TABLESPACE
+	    && !trx_sys_sys_space(node->space->id)) {
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+ /* The node is in the LRU list, remove it */
+ UT_LIST_REMOVE(LRU, system->LRU, node);
+ }
+}
+
+/********************************************************************//**
+Tries to close a file in the LRU list. The caller must hold the fil_sys
+mutex.
+@return TRUE if success, FALSE if should retry later; since i/o's
+generally complete in < 100 ms, and as InnoDB writes at most 128 pages
+from the buffer pool in a batch, and then immediately flushes the
+files, there is a good chance that the next time we will find a suitable
+node in the LRU list */
+static
+ibool
+fil_try_to_close_file_in_LRU(
+/*=========================*/
+ ibool print_info) /*!< in: if TRUE, prints information why it
+ cannot close a file */
+{
+ fil_node_t* node;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ node = UT_LIST_GET_LAST(fil_system->LRU);
+
+ if (print_info) {
+ fprintf(stderr,
+ "InnoDB: fil_sys open file LRU len %lu\n",
+ (ulong) UT_LIST_GET_LEN(fil_system->LRU));
+ }
+
+ while (node != NULL) {
+ if (node->modification_counter == node->flush_counter
+ && node->n_pending_flushes == 0) {
+
+ fil_node_close_file(node, fil_system);
+
+ return(TRUE);
+ }
+
+ if (print_info && node->n_pending_flushes > 0) {
+ fputs("InnoDB: cannot close file ", stderr);
+ ut_print_filename(stderr, node->name);
+ fprintf(stderr, ", because n_pending_flushes %lu\n",
+ (ulong) node->n_pending_flushes);
+ }
+
+ if (print_info
+ && node->modification_counter != node->flush_counter) {
+ fputs("InnoDB: cannot close file ", stderr);
+ ut_print_filename(stderr, node->name);
+ fprintf(stderr,
+ ", because mod_count %ld != fl_count %ld\n",
+ (long) node->modification_counter,
+ (long) node->flush_counter);
+ }
+
+ node = UT_LIST_GET_PREV(LRU, node);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Reserves the fil_system mutex and tries to make sure we can open at least one
+file while holding it. This should be called before calling
+fil_node_prepare_for_io(), because that function may need to open a file. */
+static
+void
+fil_mutex_enter_and_prepare_for_io(
+/*===============================*/
+ ulint space_id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ibool success;
+ ibool print_info = FALSE;
+ ulint count = 0;
+ ulint count2 = 0;
+
+retry:
+ mutex_enter(&fil_system->mutex);
+
+ if (trx_sys_sys_space(space_id) || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+ /* We keep log files and system tablespace files always open;
+ this is important in preventing deadlocks in this module, as
+ a page read completion often performs another read from the
+ insert buffer. The insert buffer is in tablespace 0, and we
+ cannot end up waiting in this function. */
+
+ return;
+ }
+
+ if (fil_system->n_open < fil_system->max_n_open) {
+
+ return;
+ }
+
+ space = fil_space_get_by_id(space_id);
+
+ if (space != NULL && space->stop_ios) {
+ /* We are going to do a rename file and want to stop new i/o's
+ for a while */
+
+ if (count2 > 20000) {
+ fputs("InnoDB: Warning: tablespace ", stderr);
+ ut_print_filename(stderr, space->name);
+ fprintf(stderr,
+ " has i/o ops stopped for a long time %lu\n",
+ (ulong) count2);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+
+ count2++;
+
+ goto retry;
+ }
+
+ /* If the file is already open, no need to do anything; if the space
+ does not exist, we handle the situation in the function which called
+ this function */
+
+ if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
+
+ return;
+ }
+
+ if (count > 1) {
+ print_info = TRUE;
+ }
+
+ /* Too many files are open, try to close some */
+close_more:
+ success = fil_try_to_close_file_in_LRU(print_info);
+
+ if (success && fil_system->n_open >= fil_system->max_n_open) {
+
+ goto close_more;
+ }
+
+ if (fil_system->n_open < fil_system->max_n_open) {
+ /* Ok */
+
+ return;
+ }
+
+ if (count >= 2) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: too many (%lu) files stay open"
+ " while the maximum\n"
+ "InnoDB: allowed value would be %lu.\n"
+ "InnoDB: You may need to raise the value of"
+ " innodb_open_files in\n"
+ "InnoDB: my.cnf.\n",
+ (ulong) fil_system->n_open,
+ (ulong) fil_system->max_n_open);
+
+ return;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Wake the i/o-handler threads to make sure pending i/o's are
+ performed */
+ os_aio_simulated_wake_handler_threads();
+
+ os_thread_sleep(20000);
+#endif
+ /* Flush tablespaces so that we can close modified files in the LRU
+ list */
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ count++;
+
+ goto retry;
+}
+
+/*******************************************************************//**
+Frees a file node object from a tablespace memory cache. */
+static
+void
+fil_node_free(
+/*==========*/
+ fil_node_t* node, /*!< in, own: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space) /*!< in: space where the file node is chained */
+{
+ ut_ad(node && system && space);
+ ut_ad(mutex_own(&(system->mutex)));
+ ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+ ut_a(node->n_pending == 0);
+
+ if (node->open) {
+ /* We fool the assertion in fil_node_close_file() to think
+ there are no unflushed modifications in the file */
+
+ node->modification_counter = node->flush_counter;
+
+ if (space->is_in_unflushed_spaces
+ && fil_space_is_flushed(space)) {
+
+ space->is_in_unflushed_spaces = FALSE;
+
+ UT_LIST_REMOVE(unflushed_spaces,
+ system->unflushed_spaces,
+ space);
+ }
+
+ fil_node_close_file(node, system);
+ }
+
+ space->size -= node->size;
+
+ UT_LIST_REMOVE(chain, space->chain, node);
+
+ mem_free(node->name);
+ mem_free(node);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint trunc_len) /*!< in: truncate by this much; it is an error
+ if this does not equal to the combined size of
+ some initial files in the space */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ while (trunc_len > 0) {
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
+
+ trunc_len -= node->size * UNIV_PAGE_SIZE;
+
+ fil_node_free(node, fil_system, space);
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/*******************************************************************//**
+Creates a space memory object and puts it to the tablespace memory cache. If
+there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+ const char* name, /*!< in: space name */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: compressed page size
+ and file format, or 0 */
+ ulint purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+ fil_space_t* space;
+
+ /* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
+ ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
+ ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+
+try_again:
+ /*printf(
+ "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name,
+ purpose);*/
+
+ ut_a(fil_system);
+ ut_a(name);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_name(name);
+
+ if (UNIV_LIKELY_NULL(space)) {
+ ulint namesake_id;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: trying to init to the"
+ " tablespace memory cache\n"
+ "InnoDB: a tablespace %lu of name ", (ulong) id);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, ",\n"
+ "InnoDB: but a tablespace %lu of the same name\n"
+ "InnoDB: already exists in the"
+ " tablespace memory cache!\n",
+ (ulong) space->id);
+
+ if (trx_sys_sys_space(id) || purpose != FIL_TABLESPACE) {
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ fprintf(stderr,
+ "InnoDB: We assume that InnoDB did a crash recovery,"
+ " and you had\n"
+ "InnoDB: an .ibd file for which the table"
+ " did not exist in the\n"
+ "InnoDB: InnoDB internal data dictionary in the"
+ " ibdata files.\n"
+ "InnoDB: We assume that you later removed the"
+ " .ibd and .frm files,\n"
+ "InnoDB: and are now trying to recreate the table."
+ " We now remove the\n"
+ "InnoDB: conflicting tablespace object"
+ " from the memory cache and try\n"
+ "InnoDB: the init again.\n");
+
+ namesake_id = space->id;
+
+ mutex_exit(&fil_system->mutex);
+
+ fil_space_free(namesake_id, FALSE);
+
+ goto try_again;
+ }
+
+ space = fil_space_get_by_id(id);
+
+ if (UNIV_LIKELY_NULL(space)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to add tablespace %lu"
+ " of name ", (ulong) id);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: to the tablespace memory cache,"
+ " but tablespace\n"
+ "InnoDB: %lu of name ", (ulong) space->id);
+ ut_print_filename(stderr, space->name);
+ fputs(" already exists in the tablespace\n"
+ "InnoDB: memory cache!\n", stderr);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ space = mem_alloc(sizeof(fil_space_t));
+
+ space->name = mem_strdup(name);
+ space->id = id;
+
+ fil_system->tablespace_version++;
+ space->tablespace_version = fil_system->tablespace_version;
+ space->mark = FALSE;
+
+ if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on)
+ && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) {
+ if (!fil_system->space_id_reuse_warned) {
+ fil_system->space_id_reuse_warned = TRUE;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: allocated tablespace %lu,"
+ " old maximum was %lu\n",
+ (ulong) id,
+ (ulong) fil_system->max_assigned_id);
+ }
+
+ fil_system->max_assigned_id = id;
+ }
+
+ space->stop_ios = FALSE;
+ space->stop_ibuf_merges = FALSE;
+ space->is_being_deleted = FALSE;
+ space->purpose = purpose;
+ space->size = 0;
+ space->flags = flags;
+
+ space->n_reserved_extents = 0;
+
+ space->n_pending_flushes = 0;
+ space->n_pending_ibuf_merges = 0;
+
+ UT_LIST_INIT(space->chain);
+ space->magic_n = FIL_SPACE_MAGIC_N;
+
+ rw_lock_create(&space->latch, SYNC_FSP);
+
+ HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space);
+
+ HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(name), space);
+ space->is_in_unflushed_spaces = FALSE;
+
+ space->is_corrupt = FALSE;
+
+ UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle ids.
+@return TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id) /*!< in/out: space id */
+{
+ ulint id;
+ ibool success;
+
+ mutex_enter(&fil_system->mutex);
+
+ id = *space_id;
+
+ if (id < fil_system->max_assigned_id) {
+ id = fil_system->max_assigned_id;
+ }
+
+ id++;
+
+ if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: Warning: you are running out of new"
+ " single-table tablespace id's.\n"
+ "InnoDB: Current counter is %lu and it"
+ " must not exceed %lu!\n"
+ "InnoDB: To reset the counter to zero"
+ " you have to dump all your tables and\n"
+ "InnoDB: recreate the whole InnoDB installation.\n",
+ (ulong) id,
+ (ulong) SRV_LOG_SPACE_FIRST_ID);
+ }
+
+ success = (id < SRV_LOG_SPACE_FIRST_ID);
+
+ if (success) {
+ *space_id = fil_system->max_assigned_id = id;
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: You have run out of single-table"
+ " tablespace id's!\n"
+ "InnoDB: Current counter is %lu.\n"
+ "InnoDB: To reset the counter to zero you"
+ " have to dump all your tables and\n"
+ "InnoDB: recreate the whole InnoDB installation.\n",
+ (ulong) id);
+ *space_id = ULINT_UNDEFINED;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(success);
+}
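
/* A standalone sketch (not part of this file) of the assignment policy in
fil_assign_new_space_id() above: the new id is one past the larger of the
requested id and the current maximum, and the assignment fails once the
counter would collide with the log space ids.  Plain C types are used,
and the limit in main() is an arbitrary example value. */

#include <stdio.h>

static int
assign_new_space_id_sketch(unsigned long* space_id,
			   unsigned long* max_assigned_id,
			   unsigned long log_space_first_id)
{
	unsigned long	id = *space_id;

	if (id < *max_assigned_id) {
		id = *max_assigned_id;
	}

	id++;

	if (id < log_space_first_id) {
		*space_id = *max_assigned_id = id;

		return(1);	/* assigned */
	}

	*space_id = (unsigned long) -1;	/* like ULINT_UNDEFINED */

	return(0);		/* the id space is exhausted */
}

int
main(void)
{
	unsigned long	space_id = 0;
	unsigned long	max_assigned_id = 41;

	if (assign_new_space_id_sketch(&space_id, &max_assigned_id, 1000)) {
		printf("assigned id %lu\n", space_id);	/* prints: 42 */
	}

	return(0);
}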
+
+/*******************************************************************//**
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. There must not be any pending i/o's or
+flushes on the files.
+@return TRUE if success */
+static
+ibool
+fil_space_free(
+/*===========*/
+ /* out: TRUE if success */
+ ulint id, /* in: space id */
+ ibool own_mutex) /* in: TRUE if own system->mutex */
+{
+ fil_space_t* space;
+ fil_space_t* namespace;
+ fil_node_t* fil_node;
+
+ if (!own_mutex) {
+ mutex_enter(&fil_system->mutex);
+ }
+
+ space = fil_space_get_by_id(id);
+
+ if (!space) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: trying to remove tablespace %lu"
+ " from the cache but\n"
+ "InnoDB: it is not there.\n", (ulong) id);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space);
+
+ namespace = fil_space_get_by_name(space->name);
+ ut_a(namespace);
+ ut_a(space == namespace);
+
+ HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(space->name), space);
+
+ if (space->is_in_unflushed_spaces) {
+ space->is_in_unflushed_spaces = FALSE;
+
+ UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces,
+ space);
+ }
+
+ UT_LIST_REMOVE(space_list, fil_system->space_list, space);
+
+ ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+ ut_a(0 == space->n_pending_flushes);
+
+ fil_node = UT_LIST_GET_FIRST(space->chain);
+
+ while (fil_node != NULL) {
+ fil_node_free(fil_node, fil_system, space);
+
+ fil_node = UT_LIST_GET_FIRST(space->chain);
+ }
+
+ ut_a(0 == UT_LIST_GET_LEN(space->chain));
+
+ if (!own_mutex) {
+ mutex_exit(&fil_system->mutex);
+ }
+
+ rw_lock_free(&(space->latch));
+
+ mem_free(space->name);
+ mem_free(space);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+ ulint id) /*!< in: space id */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+ ulint size;
+
+ ut_ad(fil_system);
+
+ fil_mutex_enter_and_prepare_for_io(id);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ mutex_exit(&fil_system->mutex);
+
+ return(0);
+ }
+
+ if (space->size == 0 && space->purpose == FIL_TABLESPACE) {
+ ut_a(id != 0);
+
+ ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ /* It must be a single-table tablespace and we have not opened
+ the file yet; the following calls will open it and update the
+ size fields */
+
+ fil_node_prepare_for_io(node, fil_system, space);
+ fil_node_complete_io(node, fil_system, OS_FILE_READ);
+ }
+
+ size = space->size;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(size);
+}
+
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
+@return flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+ ulint id) /*!< in: space id */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+ ulint flags;
+
+ ut_ad(fil_system);
+
+ if (UNIV_UNLIKELY(!id)) {
+ return(0);
+ }
+
+ fil_mutex_enter_and_prepare_for_io(id);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ mutex_exit(&fil_system->mutex);
+
+ return(ULINT_UNDEFINED);
+ }
+
+ if (space->size == 0 && space->purpose == FIL_TABLESPACE) {
+ ut_a(id != 0);
+
+ ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ /* It must be a single-table tablespace and we have not opened
+ the file yet; the following calls will open it and update the
+ size fields */
+
+ fil_node_prepare_for_io(node, fil_system, space);
+ fil_node_complete_io(node, fil_system, OS_FILE_READ);
+ }
+
+ flags = space->flags;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(flags);
+}
+
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return compressed page size, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return(dict_table_flags_to_zip_size(flags));
+ }
+
+ return(flags);
+}
+
+/*******************************************************************//**
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache.
+@return TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ if (fil_space_get_size(id) > page_no) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+ ulint hash_size, /*!< in: hash table size */
+ ulint max_n_open) /*!< in: max number of open files */
+{
+ ut_a(fil_system == NULL);
+
+ ut_a(hash_size > 0);
+ ut_a(max_n_open > 0);
+
+ fil_system = mem_zalloc(sizeof(fil_system_t));
+
+ mutex_create(&fil_system->mutex, SYNC_ANY_LATCH);
+
+ fil_system->spaces = hash_create(hash_size);
+ fil_system->name_hash = hash_create(hash_size);
+
+ UT_LIST_INIT(fil_system->LRU);
+
+ fil_system->max_n_open = max_n_open;
+
+ fil_system->max_assigned_id = TRX_SYS_SPACE_MAX;
+}
+
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void)
+/*==========================================*/
+{
+ fil_space_t* space;
+ fil_node_t* node;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space != NULL) {
+		if (space->purpose != FIL_TABLESPACE
+		    || trx_sys_sys_space(space->id)) {
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ while (node != NULL) {
+ if (!node->open) {
+ fil_node_open_file(node, fil_system,
+ space);
+ }
+ if (fil_system->max_n_open
+ < 10 + fil_system->n_open) {
+ fprintf(stderr,
+ "InnoDB: Warning: you must"
+ " raise the value of"
+ " innodb_open_files in\n"
+ "InnoDB: my.cnf! Remember that"
+ " InnoDB keeps all log files"
+ " and all system\n"
+ "InnoDB: tablespace files open"
+ " for the whole time mysqld is"
+ " running, and\n"
+ "InnoDB: needs to open also"
+ " some .ibd files if the"
+ " file-per-table storage\n"
+ "InnoDB: model is used."
+ " Current open files %lu,"
+ " max allowed"
+ " open files %lu.\n",
+ (ulong) fil_system->n_open,
+ (ulong) fil_system->max_n_open);
+ }
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+ space = UT_LIST_GET_NEXT(space_list, space);
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Closes all open files. There must not be any pending i/o's or unflushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void)
+/*=====================*/
+{
+ fil_space_t* space;
+ fil_node_t* node;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space != NULL) {
+ fil_space_t* prev_space = space;
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ while (node != NULL) {
+ if (node->open) {
+ fil_node_close_file(node, fil_system);
+ }
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ space = UT_LIST_GET_NEXT(space_list, space);
+ fil_space_free(prev_space->id, TRUE);
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id) /*!< in: maximum known id */
+{
+ if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: max tablespace id"
+ " is too high, %lu\n", (ulong) max_id);
+ ut_error;
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ if (fil_system->max_assigned_id < max_id) {
+
+ fil_system->max_assigned_id = max_id;
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page header
+of the first page of a data file of the system tablespace (space 0),
+which is uncompressed. */
+static
+ulint
+fil_write_lsn_and_arch_no_to_file(
+/*==============================*/
+ ulint sum_of_sizes, /*!< in: combined size of previous files
+ in space, in database pages */
+ ib_uint64_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no __attribute__((unused)))
+ /*!< in: archived log number to write */
+{
+ byte* buf1;
+ byte* buf;
+
+ buf1 = mem_alloc(2 * UNIV_PAGE_SIZE);
+ buf = ut_align(buf1, UNIV_PAGE_SIZE);
+
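+ /* The first page of this data file of space 0 lies at page
+ offset sum_of_sizes within the space; read it, stamp the lsn
+ into its FIL_PAGE_FILE_FLUSH_LSN field and write it back. */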
+ fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+
+ mach_write_ull(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
+
+ fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL);
+
+ mem_free(buf1);
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+ ib_uint64_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no) /*!< in: latest archived log
+ file number */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint sum_of_sizes;
+ ulint err;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space) {
+ /* We only write the lsn to all existing data files which have
+ been open during the lifetime of the mysqld process; they are
+ represented by the space objects in the tablespace memory
+ cache. Note that all data files in the system tablespace 0 are
+ always open. */
+
+ if (space->purpose == FIL_TABLESPACE
+ && space->id == 0) {
+ sum_of_sizes = 0;
+
+ node = UT_LIST_GET_FIRST(space->chain);
+ while (node) {
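+ /* Release the mutex for the duration of the
+ synchronous write to this file and re-acquire it
+ before moving on to the next node. */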
+ mutex_exit(&fil_system->mutex);
+
+ err = fil_write_lsn_and_arch_no_to_file(
+ sum_of_sizes, lsn, arch_log_no);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ sum_of_sizes += node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+ space = UT_LIST_GET_NEXT(space_list, space);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Reads the flushed lsn and arch no fields from a data file at database
+startup. */
+UNIV_INTERN
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+ os_file_t data_file, /*!< in: open data file */
+ ibool one_read_already, /*!< in: TRUE if min and max
+ parameters below already
+ contain sensible data */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no, /*!< in/out: */
+ ulint* max_arch_log_no, /*!< in/out: */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t* min_flushed_lsn, /*!< in/out: */
+ ib_uint64_t* max_flushed_lsn) /*!< in/out: */
+{
+ byte* buf;
+ byte* buf2;
+ ib_uint64_t flushed_lsn;
+
+ buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+ /* Align the memory for a possible read from a raw device */
+ buf = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE);
+
+ flushed_lsn = mach_read_ull(buf + FIL_PAGE_FILE_FLUSH_LSN);
+
+ ut_free(buf2);
+
+ if (!one_read_already) {
+ *min_flushed_lsn = flushed_lsn;
+ *max_flushed_lsn = flushed_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+ *min_arch_log_no = arch_log_no;
+ *max_arch_log_no = arch_log_no;
+#endif /* UNIV_LOG_ARCHIVE */
+ return;
+ }
+
+ if (*min_flushed_lsn > flushed_lsn) {
+ *min_flushed_lsn = flushed_lsn;
+ }
+ if (*max_flushed_lsn < flushed_lsn) {
+ *max_flushed_lsn = flushed_lsn;
+ }
+#ifdef UNIV_LOG_ARCHIVE
+ if (*min_arch_log_no > arch_log_no) {
+ *min_arch_log_no = arch_log_no;
+ }
+ if (*max_arch_log_no < arch_log_no) {
+ *max_arch_log_no = arch_log_no;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+}
+
+/*================ SINGLE-TABLE TABLESPACES ==========================*/
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Increments the count of pending insert buffer page merges, if space is not
+being deleted.
+@return TRUE if being deleted, and ibuf merges should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to do ibuf merge to a"
+ " dropped tablespace %lu\n",
+ (ulong) id);
+ }
+
+ if (space == NULL || space->stop_ibuf_merges) {
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ space->n_pending_ibuf_merges++;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Decrements the count of pending insert buffer page merges. */
+UNIV_INTERN
+void
+fil_decr_pending_ibuf_merges(
+/*=========================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ fprintf(stderr,
+ "InnoDB: Error: decrementing ibuf merge of a"
+ " dropped tablespace %lu\n",
+ (ulong) id);
+ }
+
+ if (space != NULL) {
+ space->n_pending_ibuf_merges--;
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Creates the database directory for a table if it does not exist yet. */
+static
+void
+fil_create_directory_for_tablename(
+/*===============================*/
+ const char* name) /*!< in: name in the standard
+ 'databasename/tablename' format */
+{
+ const char* namend;
+ char* path;
+ ulint len;
+
+ len = strlen(fil_path_to_mysql_datadir);
+ namend = strchr(name, '/');
+ ut_a(namend);
+ path = mem_alloc(len + (namend - name) + 2);
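+ /* + 2: one byte for the '/' separator and one for the
+ terminating NUL */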
+
+ memcpy(path, fil_path_to_mysql_datadir, len);
+ path[len] = '/';
+ memcpy(path + len + 1, name, namend - name);
+ path[len + (namend - name) + 1] = 0;
+
+ srv_normalize_path_for_win(path);
+
+ ut_a(os_file_create_directory(path, FALSE));
+ mem_free(path);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes a log record about an .ibd file create/rename/delete. */
+static
+void
+fil_op_write_log(
+/*=============*/
+ ulint type, /*!< in: MLOG_FILE_CREATE,
+ MLOG_FILE_CREATE2,
+ MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id, /*!< in: space id */
+ ulint log_flags, /*!< in: redo log flags (stored
+ in the page number field) */
+ ulint flags, /*!< in: compressed page size
+ and file format
+ if type==MLOG_FILE_CREATE2, or 0 */
+ const char* name, /*!< in: table name in the familiar
+ 'databasename/tablename' format, or
+ the file path in the case of
+ MLOG_FILE_DELETE */
+ const char* new_name, /*!< in: if type is MLOG_FILE_RENAME,
+ the new table name in the
+ 'databasename/tablename' format */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+ ulint len;
+
+ log_ptr = mlog_open(mtr, 11 + 2 + 1);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery:
+ in that case mlog_open returns NULL */
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_for_file_op(
+ type, space_id, log_flags, log_ptr, mtr);
+ if (type == MLOG_FILE_CREATE2) {
+ mach_write_to_4(log_ptr, flags);
+ log_ptr += 4;
+ }
+ /* Let us store the strings as null-terminated for easier readability
+ and handling */
+
+ len = strlen(name) + 1;
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, (byte*) name, len);
+
+ if (type == MLOG_FILE_RENAME) {
+ len = strlen(new_name) + 1;
+ log_ptr = mlog_open(mtr, 2 + len);
+ ut_a(log_ptr);
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, (byte*) new_name, len);
+ }
+}
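+/* The record body written above is, in order: a 4-byte flags field (only
+for MLOG_FILE_CREATE2), a 2-byte name length, the NUL-terminated name,
+and, only for MLOG_FILE_RENAME, a 2-byte new-name length followed by the
+NUL-terminated new name. This is the layout that
+fil_op_log_parse_or_replay() below reads back. */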
+#endif
+
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ byte* ptr, /*!< in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+ not fit completely between ptr and end_ptr */
+ byte* end_ptr, /*!< in: buffer end */
+ ulint type, /*!< in: the type of this log record */
+ ulint space_id, /*!< in: the space id of the tablespace in
+ question, or 0 if the log record should
+ only be parsed but not replayed */
+ ulint log_flags) /*!< in: redo log flags
+ (stored in the page number parameter) */
+{
+ ulint name_len;
+ ulint new_name_len;
+ const char* name;
+ const char* new_name = NULL;
+ ulint flags = 0;
+
+ if (type == MLOG_FILE_CREATE2) {
+ if (end_ptr < ptr + 4) {
+
+ return(NULL);
+ }
+
+ flags = mach_read_from_4(ptr);
+ ptr += 4;
+ }
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ name_len = mach_read_from_2(ptr);
+
+ ptr += 2;
+
+ if (end_ptr < ptr + name_len) {
+
+ return(NULL);
+ }
+
+ name = (const char*) ptr;
+
+ ptr += name_len;
+
+ if (type == MLOG_FILE_RENAME) {
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ new_name_len = mach_read_from_2(ptr);
+
+ ptr += 2;
+
+ if (end_ptr < ptr + new_name_len) {
+
+ return(NULL);
+ }
+
+ new_name = (const char*) ptr;
+
+ ptr += new_name_len;
+ }
+
+ /* We managed to parse a full log record body */
+ /*
+ printf("Parsed log rec of type %lu space %lu\n"
+ "name %s\n", type, space_id, name);
+
+ if (type == MLOG_FILE_RENAME) {
+ printf("new name %s\n", new_name);
+ }
+ */
+ if (!space_id) {
+
+ return(ptr);
+ }
+
+ /* Let us try to perform the file operation, if sensible. Note that
+ ibbackup has at this stage already read in all space id info to the
+ fil0fil.c data structures.
+
+ NOTE that our algorithm is not guaranteed to work correctly if there
+ were renames of tables during the backup. See ibbackup code for more
+ on the problem. */
+
+ switch (type) {
+ case MLOG_FILE_DELETE:
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ ut_a(fil_delete_tablespace(space_id));
+ }
+
+ break;
+
+ case MLOG_FILE_RENAME:
+ /* We do the rename based on space id, not old file name;
+ this should guarantee that after the log replay each .ibd file
+ has the correct name for the latest log sequence number; the
+ proof is left as an exercise :) */
+
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ /* Create the database directory for the new name, if
+ it does not exist yet */
+ fil_create_directory_for_tablename(new_name);
+
+ /* Rename the table if there is not yet a tablespace
+ with the same name */
+
+ if (fil_get_space_id_for_table(new_name)
+ == ULINT_UNDEFINED) {
+ /* We do not care of the old name, that is
+ why we pass NULL as the first argument */
+ if (!fil_rename_tablespace(NULL, space_id,
+ new_name)) {
+ ut_error;
+ }
+ }
+ }
+
+ break;
+
+ case MLOG_FILE_CREATE:
+ case MLOG_FILE_CREATE2:
+ if (fil_tablespace_exists_in_mem(space_id)) {
+ /* Do nothing */
+ } else if (fil_get_space_id_for_table(name)
+ != ULINT_UNDEFINED) {
+ /* Do nothing */
+ } else if (log_flags & MLOG_FILE_FLAG_TEMP) {
+ /* Temporary table, do nothing */
+ } else {
+ /* Create the database directory for name, if it does
+ not exist yet */
+ fil_create_directory_for_tablename(name);
+
+ if (fil_create_new_single_table_tablespace(
+ space_id, name, FALSE, flags,
+ FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+ ut_error;
+ }
+ }
+
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+ ulint id) /*!< in: space id */
+{
+ ibool success;
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint count = 0;
+ char* path;
+
+ ut_a(id != 0);
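+ /* Deletion proceeds in two phases: first stop new insert buffer
+ merges on the space and wait for the pending ones to drain (the
+ stop_ibuf_merges loop), then wait until there are no pending
+ flushes or i/o's on the single file node (the try_again loop). */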
+stop_ibuf_merges:
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space != NULL) {
+ space->stop_ibuf_merges = TRUE;
+
+ if (space->n_pending_ibuf_merges == 0) {
+ mutex_exit(&fil_system->mutex);
+
+ count = 0;
+
+ goto try_again;
+ } else {
+ if (count > 5000) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: trying to"
+ " delete tablespace ", stderr);
+ ut_print_filename(stderr, space->name);
+ fprintf(stderr, ",\n"
+ "InnoDB: but there are %lu pending"
+ " ibuf merges on it.\n"
+ "InnoDB: Loop %lu.\n",
+ (ulong) space->n_pending_ibuf_merges,
+ (ulong) count);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+ count++;
+
+ goto stop_ibuf_merges;
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+ count = 0;
+
+try_again:
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: cannot delete tablespace %lu\n"
+ "InnoDB: because it is not found in the"
+ " tablespace memory cache.\n",
+ (ulong) id);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ ut_a(space);
+ ut_a(space->n_pending_ibuf_merges == 0);
+
+ space->is_being_deleted = TRUE;
+
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ if (space->n_pending_flushes > 0 || node->n_pending > 0) {
+ if (count > 1000) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: trying to"
+ " delete tablespace ", stderr);
+ ut_print_filename(stderr, space->name);
+ fprintf(stderr, ",\n"
+ "InnoDB: but there are %lu flushes"
+ " and %lu pending i/o's on it\n"
+ "InnoDB: Loop %lu.\n",
+ (ulong) space->n_pending_flushes,
+ (ulong) node->n_pending,
+ (ulong) count);
+ }
+ mutex_exit(&fil_system->mutex);
+ os_thread_sleep(20000);
+
+ count++;
+
+ goto try_again;
+ }
+
+ path = mem_strdup(space->name);
+
+ mutex_exit(&fil_system->mutex);
+#ifndef UNIV_HOTBACKUP
+ /* Invalidate in the buffer pool all pages belonging to the
+ tablespace. Since we have set space->is_being_deleted = TRUE, readahead
+ or ibuf merge can no longer read more pages of this tablespace to the
+ buffer pool. Thus we can clean the tablespace out of the buffer pool
+ completely and permanently. The flag is_being_deleted also prevents
+ fil_flush() from being applied to this tablespace. */
+
+ buf_LRU_invalidate_tablespace(id);
+#endif
+ /* printf("Deleting tablespace %s id %lu\n", space->name, id); */
+
+ success = fil_space_free(id, FALSE);
+
+ if (success) {
+ success = os_file_delete(path);
+
+ if (!success) {
+ success = os_file_delete_if_exists(path);
+ }
+ }
+
+ if (success) {
+#ifndef UNIV_HOTBACKUP
+ /* Write a log record about the deletion of the .ibd
+ file, so that ibbackup can replay it in the
+ --apply-log phase. We use a dummy mtr and the familiar
+ log write mechanism. */
+ mtr_t mtr;
+
+ /* When replaying the operation in ibbackup, do not try
+ to write any log record */
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr);
+ mtr_commit(&mtr);
+#endif
+ mem_free(path);
+
+ return(TRUE);
+ }
+
+ mem_free(path);
+
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_discard_tablespace(
+/*===================*/
+ ulint id) /*!< in: space id */
+{
+ ibool success;
+
+ success = fil_delete_tablespace(id);
+
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Warning: cannot delete tablespace %lu"
+ " in DISCARD TABLESPACE.\n"
+ "InnoDB: But let us remove the"
+ " insert buffer entries for this tablespace.\n",
+ (ulong) id);
+ }
+
+ /* Remove all insert buffer entries for the tablespace */
+
+ ibuf_delete_for_discarded_space(id);
+
+ return(success);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Renames the memory cache structures of a single-table tablespace.
+@return TRUE if success */
+static
+ibool
+fil_rename_tablespace_in_mem(
+/*=========================*/
+ fil_space_t* space, /*!< in: tablespace memory object */
+ fil_node_t* node, /*!< in: file node of that tablespace */
+ const char* path) /*!< in: new name */
+{
+ fil_space_t* space2;
+ const char* old_name = space->name;
+
+ ut_ad(mutex_own(&fil_system->mutex));
+
+ space2 = fil_space_get_by_name(old_name);
+ if (space != space2) {
+ fputs("InnoDB: Error: cannot find ", stderr);
+ ut_print_filename(stderr, old_name);
+ fputs(" in tablespace memory cache\n", stderr);
+
+ return(FALSE);
+ }
+
+ space2 = fil_space_get_by_name(path);
+ if (space2 != NULL) {
+ fputs("InnoDB: Error: ", stderr);
+ ut_print_filename(stderr, path);
+ fputs(" is already in tablespace memory cache\n", stderr);
+
+ return(FALSE);
+ }
+
+ HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(space->name), space);
+ mem_free(space->name);
+ mem_free(node->name);
+
+ space->name = mem_strdup(path);
+ node->name = mem_strdup(path);
+
+ HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+ ut_fold_string(path), space);
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Allocates a file name for a single-table tablespace. The string must be freed
+by caller with mem_free().
+@return own: file name */
+static
+char*
+fil_make_ibd_name(
+/*==============*/
+ const char* name, /*!< in: table name or a dir path of a
+ TEMPORARY table */
+ ibool is_temp) /*!< in: TRUE if it is a dir path */
+{
+ ulint namelen = strlen(name);
+ ulint dirlen = strlen(fil_path_to_mysql_datadir);
+ char* filename = mem_alloc(namelen + dirlen + sizeof "/.ibd");
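+ /* sizeof "/.ibd" == 6: room for the '/' separator, the ".ibd"
+ suffix and the terminating NUL */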
+
+ if (is_temp) {
+ memcpy(filename, name, namelen);
+ memcpy(filename + namelen, ".ibd", sizeof ".ibd");
+ } else {
+ memcpy(filename, fil_path_to_mysql_datadir, dirlen);
+ filename[dirlen] = '/';
+
+ memcpy(filename + dirlen + 1, name, namelen);
+ memcpy(filename + dirlen + namelen + 1, ".ibd", sizeof ".ibd");
+ }
+
+ srv_normalize_path_for_win(filename);
+
+ return(filename);
+}
+
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+ const char* old_name, /*!< in: old table name in the standard
+ databasename/tablename format of
+ InnoDB, or NULL if we do the rename
+ based on the space id only */
+ ulint id, /*!< in: space id */
+ const char* new_name) /*!< in: new table name in the standard
+ databasename/tablename format
+ of InnoDB */
+{
+ ibool success;
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint count = 0;
+ char* path;
+ ibool old_name_was_specified = TRUE;
+ char* old_path;
+
+ ut_a(id != 0);
+
+ if (old_name == NULL) {
+ old_name = "(name not specified)";
+ old_name_was_specified = FALSE;
+ }
+retry:
+ count++;
+
+ if (count > 1000) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: problems renaming ", stderr);
+ ut_print_filename(stderr, old_name);
+ fputs(" to ", stderr);
+ ut_print_filename(stderr, new_name);
+ fprintf(stderr, ", %lu iterations\n", (ulong) count);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL) {
+ fprintf(stderr,
+ "InnoDB: Error: cannot find space id %lu"
+ " in the tablespace memory cache\n"
+ "InnoDB: though the table ", (ulong) id);
+ ut_print_filename(stderr, old_name);
+ fputs(" in a rename operation should have that id\n", stderr);
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
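+ /* Give up after 25000 retries; with the 20000-microsecond sleep
+ taken on each retry this is roughly 500 seconds of waiting. */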
+ if (count > 25000) {
+ space->stop_ios = FALSE;
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ /* We temporarily close the .ibd file because we do not trust that
+ operating systems can rename an open file. For the closing we have to
+ wait until there are no pending i/o's or flushes on the file. */
+
+ space->stop_ios = TRUE;
+
+ ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ if (node->n_pending > 0 || node->n_pending_flushes > 0) {
+ /* There are pending i/o's or flushes, sleep for a while and
+ retry */
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+
+ goto retry;
+
+ } else if (node->modification_counter > node->flush_counter) {
+ /* Flush the space */
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+
+ fil_flush(id);
+
+ goto retry;
+
+ } else if (node->open) {
+ /* Close the file */
+
+ fil_node_close_file(node, fil_system);
+ }
+
+ /* Check that the old name in the space is right */
+
+ if (old_name_was_specified) {
+ old_path = fil_make_ibd_name(old_name, FALSE);
+
+ ut_a(strcmp(space->name, old_path) == 0);
+ ut_a(strcmp(node->name, old_path) == 0);
+ } else {
+ old_path = mem_strdup(space->name);
+ }
+
+ /* Rename the tablespace and the node in the memory cache */
+ path = fil_make_ibd_name(new_name, FALSE);
+ success = fil_rename_tablespace_in_mem(space, node, path);
+
+ if (success) {
+ success = os_file_rename(old_path, path);
+
+ if (!success) {
+ /* We have to revert the changes we made
+ to the tablespace memory cache */
+
+ ut_a(fil_rename_tablespace_in_mem(space, node,
+ old_path));
+ }
+ }
+
+ mem_free(path);
+ mem_free(old_path);
+
+ space->stop_ios = FALSE;
+
+ mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+ if (success) {
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ fil_op_write_log(MLOG_FILE_RENAME, id, 0, 0, old_name, new_name,
+ &mtr);
+ mtr_commit(&mtr);
+ }
+#endif
+ return(success);
+}
+
+/*******************************************************************//**
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. Tables created with CREATE TEMPORARY TABLE are placed in the temp
+dir of the mysqld server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ ulint space_id, /*!< in: space id */
+ const char* tablename, /*!< in: the table name in the usual
+ databasename/tablename format
+ of InnoDB, or a dir path to a temp
+ table */
+ ibool is_temp, /*!< in: TRUE if a table created with
+ CREATE TEMPORARY TABLE */
+ ulint flags, /*!< in: tablespace flags */
+ ulint size) /*!< in: the initial size of the
+ tablespace file in pages,
+ must be >= FIL_IBD_FILE_INITIAL_SIZE */
+{
+ os_file_t file;
+ ibool ret;
+ ulint err;
+ byte* buf2;
+ byte* page;
+ ibool success;
+ char* path;
+
+ ut_a(space_id > 0);
+ ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
+ ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
+ /* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
+ ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
+ ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+
+ path = fil_make_ibd_name(tablename, is_temp);
+
+ file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ if (ret == FALSE) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error creating file ", stderr);
+ ut_print_filename(stderr, path);
+ fputs(".\n", stderr);
+
+ /* The following call will print an error message */
+
+ err = os_file_get_last_error(TRUE);
+
+ if (err == OS_FILE_ALREADY_EXISTS) {
+ fputs("InnoDB: The file already exists though"
+ " the corresponding table did not\n"
+ "InnoDB: exist in the InnoDB data dictionary."
+ " Have you moved InnoDB\n"
+ "InnoDB: .ibd files around without using the"
+ " SQL commands\n"
+ "InnoDB: DISCARD TABLESPACE and"
+ " IMPORT TABLESPACE, or did\n"
+ "InnoDB: mysqld crash in the middle of"
+ " CREATE TABLE? You can\n"
+ "InnoDB: resolve the problem by"
+ " removing the file ", stderr);
+ ut_print_filename(stderr, path);
+ fputs("\n"
+ "InnoDB: under the 'datadir' of MySQL.\n",
+ stderr);
+
+ mem_free(path);
+ return(DB_TABLESPACE_ALREADY_EXISTS);
+ }
+
+ if (err == OS_FILE_DISK_FULL) {
+
+ mem_free(path);
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ mem_free(path);
+ return(DB_ERROR);
+ }
+
+ ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0);
+
+ if (!ret) {
+ err = DB_OUT_OF_FILE_SPACE;
+error_exit:
+ os_file_close(file);
+error_exit2:
+ os_file_delete(path);
+
+ mem_free(path);
+ return(err);
+ }
+
+ /* printf("Creating tablespace %s id %lu\n", path, space_id); */
+
+ /* We have to write the space id to the file immediately and flush the
+ file to disk. This is because in crash recovery we must be aware what
+ tablespaces exist and what are their space id's, so that we can apply
+ the log records to the right file. It may take quite a while until
+ buffer pool flush algorithms write anything to the file and flush it to
+ disk. If we would not write here anything, the file would be filled
+ with zeros from the call of os_file_set_size(), until a buffer pool
+ flush would write to it. */
+
+ buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+ /* Align the memory for file i/o if we might have O_DIRECT set */
+ page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ memset(page, '\0', UNIV_PAGE_SIZE);
+
+ fsp_header_init_fields(page, space_id, flags);
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+
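+ /* A zero DICT_TF_ZSSIZE field in the flags means an uncompressed
+ tablespace; otherwise the compressed page size is derived from the
+ flags below and the first page is written in its compressed form. */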
+ if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+ buf_flush_init_for_writing(page, NULL, 0);
+ ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE);
+ } else {
+ page_zip_des_t page_zip;
+ ulint zip_size;
+
+ zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
+ << ((flags & DICT_TF_ZSSIZE_MASK)
+ >> DICT_TF_ZSSIZE_SHIFT));
+
+ page_zip_set_size(&page_zip, zip_size);
+ page_zip.data = page + UNIV_PAGE_SIZE;
+#ifdef UNIV_DEBUG
+ page_zip.m_start =
+#endif /* UNIV_DEBUG */
+ page_zip.m_end = page_zip.m_nonempty =
+ page_zip.n_blobs = 0;
+ buf_flush_init_for_writing(page, &page_zip, 0);
+ ret = os_file_write(path, file, page_zip.data, 0, 0, zip_size);
+ }
+
+ ut_free(buf2);
+
+ if (!ret) {
+ fputs("InnoDB: Error: could not write the first page"
+ " to tablespace ", stderr);
+ ut_print_filename(stderr, path);
+ putc('\n', stderr);
+ err = DB_ERROR;
+ goto error_exit;
+ }
+
+ ret = os_file_flush(file);
+
+ if (!ret) {
+ fputs("InnoDB: Error: file flush of tablespace ", stderr);
+ ut_print_filename(stderr, path);
+ fputs(" failed\n", stderr);
+ err = DB_ERROR;
+ goto error_exit;
+ }
+
+ os_file_close(file);
+
+ success = fil_space_create(path, space_id, flags, FIL_TABLESPACE);
+
+ if (!success) {
+ err = DB_ERROR;
+ goto error_exit2;
+ }
+
+ fil_node_create(path, size, space_id, FALSE);
+
+#ifndef UNIV_HOTBACKUP
+ {
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ fil_op_write_log(flags
+ ? MLOG_FILE_CREATE2
+ : MLOG_FILE_CREATE,
+ space_id,
+ is_temp ? MLOG_FILE_FLAG_TEMP : 0,
+ flags,
+ tablename, NULL, &mtr);
+
+ mtr_commit(&mtr);
+ }
+#endif
+ mem_free(path);
+ return(DB_SUCCESS);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that it at
+the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+ const char* name, /*!< in: table name in the
+ databasename/tablename format */
+ ib_uint64_t current_lsn) /*!< in: reset lsn's if the lsn stamped
+ to FIL_PAGE_FILE_FLUSH_LSN in the
+ first page is too high */
+{
+ os_file_t file;
+ char* filepath;
+ byte* page;
+ byte* buf2;
+ ib_uint64_t flush_lsn;
+ ulint space_id;
+ ib_int64_t file_size;
+ ib_int64_t offset;
+ ulint zip_size;
+ ibool success;
+ page_zip_des_t page_zip;
+
+ filepath = fil_make_ibd_name(name, FALSE);
+
+ file = os_file_create_simple_no_error_handling(
+ filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: trying to open a table,"
+ " but could not\n"
+ "InnoDB: open the tablespace file ", stderr);
+ ut_print_filename(stderr, filepath);
+ fputs("!\n", stderr);
+ mem_free(filepath);
+
+ return(FALSE);
+ }
+
+ /* Read the first page of the tablespace */
+
+ buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+ /* Align the memory for file i/o if we might have O_DIRECT set */
+ page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+ if (!success) {
+
+ goto func_exit;
+ }
+
+ /* We have to read the file flush lsn from the header of the file */
+
+ flush_lsn = mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN);
+
+ if (current_lsn >= flush_lsn) {
+ /* Ok */
+ success = TRUE;
+
+ goto func_exit;
+ }
+
+ space_id = fsp_header_get_space_id(page);
+ zip_size = fsp_header_get_zip_size(page);
+
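+ /* For a compressed tablespace zip_size is the on-disk page size;
+ the page loop further below therefore advances by zip_size bytes
+ per page, otherwise by UNIV_PAGE_SIZE. */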
+ page_zip_des_init(&page_zip);
+ page_zip_set_size(&page_zip, zip_size);
+ if (zip_size) {
+ page_zip.data = page + UNIV_PAGE_SIZE;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Flush lsn in the tablespace file %lu"
+ " to be imported\n"
+ "InnoDB: is %llu, which exceeds current"
+ " system lsn %llu.\n"
+ "InnoDB: We reset the lsn's in the file ",
+ (ulong) space_id,
+ flush_lsn, current_lsn);
+ ut_print_filename(stderr, filepath);
+ fputs(".\n", stderr);
+
+ ut_a(ut_is_2pow(zip_size));
+ ut_a(zip_size <= UNIV_PAGE_SIZE);
+
+ /* Loop through all the pages in the tablespace and reset the lsn and
+ the page checksum if necessary */
+
+ file_size = os_file_get_size_as_iblonglong(file);
+
+ for (offset = 0; offset < file_size;
+ offset += zip_size ? zip_size : UNIV_PAGE_SIZE) {
+ success = os_file_read(file, page,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ (ulint)(offset >> 32),
+ zip_size ? zip_size : UNIV_PAGE_SIZE);
+ if (!success) {
+
+ goto func_exit;
+ }
+ if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) {
+ /* We have to reset the lsn */
+
+ if (zip_size) {
+ memcpy(page_zip.data, page, zip_size);
+ buf_flush_init_for_writing(
+ page, &page_zip, current_lsn);
+ success = os_file_write(
+ filepath, file, page_zip.data,
+ (ulint) offset & 0xFFFFFFFFUL,
+ (ulint) (offset >> 32), zip_size);
+ } else {
+ buf_flush_init_for_writing(
+ page, NULL, current_lsn);
+ success = os_file_write(
+ filepath, file, page,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ (ulint)(offset >> 32),
+ UNIV_PAGE_SIZE);
+ }
+
+ if (!success) {
+
+ goto func_exit;
+ }
+ }
+ }
+
+ success = os_file_flush(file);
+ if (!success) {
+
+ goto func_exit;
+ }
+
+ /* We now update the flush_lsn stamp at the start of the file */
+ success = os_file_read(file, page, 0, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE);
+ if (!success) {
+
+ goto func_exit;
+ }
+
+ mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
+
+ success = os_file_write(filepath, file, page, 0, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE);
+ if (!success) {
+
+ goto func_exit;
+ }
+ success = os_file_flush(file);
+func_exit:
+ os_file_close(file);
+ ut_free(buf2);
+ mem_free(filepath);
+
+ return(success);
+}
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If it does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+ ibool check_space_id, /*!< in: should we check that the space
+ id in the file is right; we assume
+ that this function runs much faster
+ if no check is made, since accessing
+ the file inode probably is much
+ faster (the OS caches them) than
+ accessing the first page of the file */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ const char* name) /*!< in: table name in the
+ databasename/tablename format */
+{
+ os_file_t file;
+ char* filepath;
+ ibool success;
+ byte* buf2;
+ byte* page;
+ ulint space_id;
+ ulint space_flags;
+
+ filepath = fil_make_ibd_name(name, FALSE);
+
+ /* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+ ROW_FORMAT=COMPACT
+ ((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
+ ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+ format, the tablespace flags should equal
+ (table->flags & ~(~0 << DICT_TF_BITS)). */
+ ut_a(flags != DICT_TF_COMPACT);
+ ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+
+ file = os_file_create_simple_no_error_handling(
+ filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: trying to open a table,"
+ " but could not\n"
+ "InnoDB: open the tablespace file ", stderr);
+ ut_print_filename(stderr, filepath);
+ fputs("!\n"
+ "InnoDB: Have you moved InnoDB .ibd files around"
+ " without using the\n"
+ "InnoDB: commands DISCARD TABLESPACE and"
+ " IMPORT TABLESPACE?\n"
+ "InnoDB: It is also possible that this is"
+ " a temporary table #sql...,\n"
+ "InnoDB: and MySQL removed the .ibd file for this.\n"
+ "InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+ "InnoDB: for how to resolve the issue.\n", stderr);
+
+ mem_free(filepath);
+
+ return(FALSE);
+ }
+
+ if (!check_space_id) {
+ space_id = id;
+
+ goto skip_check;
+ }
+
+ /* Read the first page of the tablespace */
+
+ buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+ /* Align the memory for file i/o if we might have O_DIRECT set */
+ page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+ /* We have to read the tablespace id and flags from the file. */
+
+ space_id = fsp_header_get_space_id(page);
+ space_flags = fsp_header_get_flags(page);
+
+ if (srv_expand_import
+ && (space_id != id || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
+ ibool file_is_corrupt = FALSE;
+ byte* buf3;
+ byte* descr_page;
+ ibool descr_is_corrupt = FALSE;
+ dulint old_id[31];
+ dulint new_id[31];
+ ulint root_page[31];
+ ulint n_index;
+ os_file_t info_file = (os_file_t) -1;
+ char* info_file_path;
+ ulint i;
+ int len;
+ ib_uint64_t current_lsn;
+ ulint size_low, size_high, size, free_limit;
+ ib_int64_t size_bytes, free_limit_bytes;
+ dict_table_t* table;
+ dict_index_t* index;
+ fil_system_t* system;
+ fil_node_t* node = NULL;
+ fil_space_t* space;
+
+ buf3 = ut_malloc(2 * UNIV_PAGE_SIZE);
+ descr_page = ut_align(buf3, UNIV_PAGE_SIZE);
+
+ current_lsn = log_get_lsn();
+
+ /* check the header page's consistency */
+ if (buf_page_is_corrupted(page,
+ dict_table_flags_to_zip_size(space_flags))) {
+ fprintf(stderr, "InnoDB: page 0 of %s seems corrupt.\n", filepath);
+ file_is_corrupt = TRUE;
+ descr_is_corrupt = TRUE;
+ }
+
+ /* store as first descr page */
+ memcpy(descr_page, page, UNIV_PAGE_SIZE);
+
+ /* get the free limit (page number) of the tablespace */
+/* these should be the same as the definitions in fsp0fsp.c */
+#define FSP_HEADER_OFFSET FIL_PAGE_DATA
+#define FSP_FREE_LIMIT 12
+ free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page);
+ free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)UNIV_PAGE_SIZE;
+
+ /* overwrite fsp header */
+ fsp_header_init_fields(page, id, flags);
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
+ space_id = id;
+ space_flags = flags;
+ if (mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn)
+ mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
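+ /* After patching the header fields above, recompute both the
+ checksum at FIL_PAGE_SPACE_OR_CHKSUM and the old-style checksum
+ stored at the end of the page, so that the rewritten page passes
+ the corruption checks. */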
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ srv_use_checksums
+ ? (!srv_fast_checksum
+ ? buf_calc_page_new_checksum(page)
+ : buf_calc_page_new_checksum_32(page))
+ : BUF_NO_CHECKSUM_MAGIC);
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ srv_use_checksums
+ ? buf_calc_page_old_checksum(page)
+ : BUF_NO_CHECKSUM_MAGIC);
+ success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE);
+
+ /* get file size */
+ os_file_get_size(file, &size_low, &size_high);
+ size_bytes = (((ib_int64_t)size_high) << 32)
+ + (ib_int64_t)size_low;
+
+ if (size_bytes < free_limit_bytes) {
+ free_limit_bytes = size_bytes;
+ fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath);
+ file_is_corrupt = TRUE;
+ }
+
+ /* get clustered index information */
+ table = dict_table_get_low(name);
+ index = dict_table_get_first_index(table);
+ ut_a(index->page==3);
+
+ /* read metadata from .exp file */
+ n_index = 0;
+ memset(old_id, 0, sizeof(old_id));
+ memset(new_id, 0, sizeof(new_id));
+ memset(root_page, 0, sizeof(root_page));
+
+ info_file_path = fil_make_ibd_name(name, FALSE);
+ len = strlen(info_file_path);
+ info_file_path[len - 3] = 'e';
+ info_file_path[len - 2] = 'x';
+ info_file_path[len - 1] = 'p';
+
+ info_file = os_file_create_simple_no_error_handling(
+ info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ if (!success) {
+ fprintf(stderr, "InnoDB: cannot open %s\n", info_file_path);
+ file_is_corrupt = TRUE;
+ goto skip_info;
+ }
+ success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE);
+ if (!success) {
+ fprintf(stderr, "InnoDB: cannot read %s\n", info_file_path);
+ file_is_corrupt = TRUE;
+ goto skip_info;
+ }
+ if (mach_read_from_4(page) != 0x78706f72UL
+ || mach_read_from_4(page + 4) != 0x74696e66UL) {
+ fprintf(stderr, "InnoDB: %s seems not to be a correct .exp file\n", info_file_path);
+ file_is_corrupt = TRUE;
+ goto skip_info;
+ }
+
+ fprintf(stderr, "InnoDB: import: extended import of %s is started.\n", name);
+
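+ /* Layout of the .exp file as read here: bytes 0..7 hold the
+ magic, bytes 8..11 the number of indexes; then one 512-byte
+ record per index with the old index id at offset 0, the root
+ page number at offset 8 and the NUL-terminated index name
+ starting at offset 12. */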
+ n_index = mach_read_from_4(page + 8);
+ fprintf(stderr, "InnoDB: import: %lu indexes are detected.\n", (ulong)n_index);
+ for (i = 0; i < n_index; i++) {
+ new_id[i] =
+ dict_table_get_index_on_name(table,
+ (char*)(page + (i + 1) * 512 + 12))->id;
+ old_id[i] = mach_read_from_8(page + (i + 1) * 512);
+ root_page[i] = mach_read_from_4(page + (i + 1) * 512 + 8);
+ }
+
+skip_info:
+ if (info_file != (os_file_t) -1)
+ os_file_close(info_file);
+
+ /*
+ if (size_bytes >= 1024 * 1024) {
+ size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+ }
+ */
+ if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ib_int64_t offset;
+
+ size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+ /* overwrite the space id of all pages */
+ rec_offs_init(offsets_);
+
+ fprintf(stderr, "InnoDB: Progress in %%:");
+
+ for (offset = 0; offset < free_limit_bytes; offset += UNIV_PAGE_SIZE) {
+ ulint checksum_field;
+ ulint old_checksum_field;
+ ibool page_is_corrupt;
+
+ success = os_file_read(file, page,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ (ulint)(offset >> 32), UNIV_PAGE_SIZE);
+
+ page_is_corrupt = FALSE;
+
+ /* check consistency */
+ if (memcmp(page + FIL_PAGE_LSN + 4,
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+
+ page_is_corrupt = TRUE;
+ }
+
+ if (mach_read_from_4(page + FIL_PAGE_OFFSET)
+ != offset / UNIV_PAGE_SIZE) {
+
+ page_is_corrupt = TRUE;
+ }
+
+ checksum_field = mach_read_from_4(page
+ + FIL_PAGE_SPACE_OR_CHKSUM);
+
+ old_checksum_field = mach_read_from_4(
+ page + UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+ if (old_checksum_field != mach_read_from_4(page
+ + FIL_PAGE_LSN)
+ && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && old_checksum_field
+ != buf_calc_page_old_checksum(page)) {
+
+ page_is_corrupt = TRUE;
+ }
+
+ if (!srv_fast_checksum
+ && checksum_field != 0
+ && checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != buf_calc_page_new_checksum(page)) {
+
+ page_is_corrupt = TRUE;
+ }
+
+ if (srv_fast_checksum
+ && checksum_field != 0
+ && checksum_field != BUF_NO_CHECKSUM_MAGIC
+ && checksum_field
+ != buf_calc_page_new_checksum_32(page)
+ && checksum_field
+ != buf_calc_page_new_checksum(page)) {
+
+ page_is_corrupt = TRUE;
+ }
+
+ /* if it is a free page, inconsistency is acceptable */
+ if (!offset) {
+ /* header page*/
+ /* it should be overwritten already */
+ ut_a(!page_is_corrupt);
+
+ } else if (!((offset / UNIV_PAGE_SIZE) % UNIV_PAGE_SIZE)) {
+ /* descr page (not header) */
+ if (page_is_corrupt) {
+ file_is_corrupt = TRUE;
+ descr_is_corrupt = TRUE;
+ } else {
+ ut_a(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES);
+ descr_is_corrupt = FALSE;
+ }
+
+ /* store as descr page */
+ memcpy(descr_page, page, UNIV_PAGE_SIZE);
+
+ } else if (descr_is_corrupt) {
+ /* unknown state of the page */
+ if (page_is_corrupt) {
+ file_is_corrupt = TRUE;
+ }
+
+ } else {
+ /* check free page or not */
+ /* These definitions should be the same as in fsp0fsp.c */
+#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define XDES_BITMAP (FLST_NODE_SIZE + 12)
+#define XDES_BITS_PER_PAGE 2
+#define XDES_FREE_BIT 0
+#define XDES_SIZE \
+ (XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+ /*descr = descr_page + XDES_ARR_OFFSET + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset)*/
+ /*xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)*/
+ byte* descr;
+ ulint index;
+ ulint byte_index;
+ ulint bit_index;
+
+ descr = descr_page + XDES_ARR_OFFSET
+ + XDES_SIZE * (ut_2pow_remainder((offset / UNIV_PAGE_SIZE), UNIV_PAGE_SIZE) / FSP_EXTENT_SIZE);
+
+ index = XDES_FREE_BIT + XDES_BITS_PER_PAGE * ((offset / UNIV_PAGE_SIZE) % FSP_EXTENT_SIZE);
+ byte_index = index / 8;
+ bit_index = index % 8;
+
+ if (ut_bit_get_nth(mach_read_from_1(descr + XDES_BITMAP + byte_index), bit_index)) {
+ /* free page */
+ if (page_is_corrupt) {
+ goto skip_write;
+ }
+ } else {
+ /* not free */
+ if (page_is_corrupt) {
+ file_is_corrupt = TRUE;
+ }
+ }
+ }
+
+ if (page_is_corrupt) {
+ fprintf(stderr, " [errp:%lld]", offset / UNIV_PAGE_SIZE);
+
+ /* a corrupt page cannot be converted; leave it untouched */
+ goto skip_write;
+ }
+
+ if (mach_read_from_4(page + FIL_PAGE_OFFSET) || !offset) {
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
+
+ for (i = 0; (ulint) i < n_index; i++) {
+ if ((ulint) (offset / UNIV_PAGE_SIZE) == root_page[i]) {
+ /* this is index root page */
+ mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+ + FSEG_HDR_SPACE, id);
+ mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+ + FSEG_HDR_SPACE, id);
+ break;
+ }
+ }
+
+ if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
+ dulint tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID));
+
+ if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
+ && ut_dulint_cmp(old_id[0], tmp) == 0) {
+ /* leaf page of the clustered index: reset the trx id of
+ each record and fix the space id in externally stored
+ column references */
+ rec_t* rec;
+ rec_t* supremum;
+ ulint n_recs;
+
+ supremum = page_get_supremum_rec(page);
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ n_recs = page_get_n_recs(page);
+
+ while (rec && rec != supremum && n_recs > 0) {
+ ulint n_fields;
+ ulint i;
+ ulint offset = index->trx_id_offset;
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ n_fields = rec_offs_n_fields(offsets);
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+ trx_write_trx_id(rec + offset, ut_dulint_create(0, 1));
+
+ for (i = 0; i < n_fields; i++) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint local_len;
+ byte* data;
+
+ data = rec_get_nth_field(rec, offsets, i, &local_len);
+
+ local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id);
+ }
+ }
+
+ rec = page_rec_get_next(rec);
+ n_recs--;
+ }
+ }
+
+ for (i = 0; i < n_index; i++) {
+ if (ut_dulint_cmp(old_id[i], tmp) == 0) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]);
+ break;
+ }
+ }
+ }
+
+ if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) {
+ mach_write_ull(page + FIL_PAGE_LSN, current_lsn);
+ mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ current_lsn);
+ }
+
+ mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+ srv_use_checksums
+ ? (!srv_fast_checksum
+ ? buf_calc_page_new_checksum(page)
+ : buf_calc_page_new_checksum_32(page))
+ : BUF_NO_CHECKSUM_MAGIC);
+ mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+ srv_use_checksums
+ ? buf_calc_page_old_checksum(page)
+ : BUF_NO_CHECKSUM_MAGIC);
+
+ success = os_file_write(filepath, file, page,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ (ulint)(offset >> 32), UNIV_PAGE_SIZE);
+ }
+
+skip_write:
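+ /* print a progress figure whenever the processed
+ fraction of free_limit_bytes crosses another full
+ percent */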
+ if (free_limit_bytes
+ && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes)
+ != ((offset * 100) / free_limit_bytes)) {
+ fprintf(stderr, " %lu",
+ (ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes));
+ }
+ }
+
+ fprintf(stderr, " done.\n");
+
+ /* update SYS_INDEXES set root page */
+ index = dict_table_get_first_index(table);
+ while (index) {
+ for (i = 0; i < n_index; i++) {
+ if (ut_dulint_cmp(new_id[i], index->id) == 0) {
+ break;
+ }
+ }
+
+ if (i != n_index
+ && root_page[i] != index->page) {
+ /* must update */
+ ulint error;
+ trx_t* trx;
+ pars_info_t* info = NULL;
+
+ trx = trx_allocate_for_mysql();
+ trx->op_info = "extended import";
+
+ info = pars_info_create();
+
+ pars_info_add_dulint_literal(info, "indexid", new_id[i]);
+ pars_info_add_int4_literal(info, "new_page", (lint) root_page[i]);
+
+ error = que_eval_sql(info,
+ "PROCEDURE UPDATE_INDEX_PAGE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES"
+ " SET PAGE_NO = :new_page"
+ " WHERE ID = :indexid;\n"
+ "COMMIT WORK;\n"
+ "END;\n",
+ FALSE, trx);
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr, "InnoDB: failed to update SYS_INDEXES\n");
+ }
+
+ trx_commit_for_mysql(trx);
+
+ trx_free_for_mysql(trx);
+
+ index->page = root_page[i];
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+ /* compressed tablespace: not handled by the extended import yet */
+ size = (ulint)
+ (size_bytes
+ / dict_table_flags_to_zip_size(flags));
+ fprintf(stderr, "InnoDB: import: table %s seems to be in newer format."
+ " It may not be able to treated for now.\n", name);
+ }
+ /* .exp file should be removed */
+ success = os_file_delete(info_file_path);
+ if (!success) {
+ success = os_file_delete_if_exists(info_file_path);
+ }
+ mem_free(info_file_path);
+
+ system = fil_system;
+ mutex_enter(&(system->mutex));
+ space = fil_space_get_by_id(id);
+ if (space)
+ node = UT_LIST_GET_FIRST(space->chain);
+ if (node && node->size < size) {
+ space->size += (size - node->size);
+ node->size = size;
+ }
+ mutex_exit(&(system->mutex));
+
+ ut_free(buf3);
+
+ if (file_is_corrupt) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: file ",
+ stderr);
+ ut_print_filename(stderr, filepath);
+ fprintf(stderr, " seems to be corrupt.\n"
+ "InnoDB: anyway, all not corrupt pages were tried to be converted to salvage.\n"
+ "InnoDB: ##### CAUTION #####\n"
+ "InnoDB: ## The .ibd must cause to crash InnoDB, though re-import would seem to be succeeded.\n"
+ "InnoDB: ## If you don't have knowledge about salvaging data from .ibd, you should not use the file.\n"
+ "InnoDB: ###################\n");
+ success = FALSE;
+
+ ut_free(buf2);
+
+ goto func_exit;
+ }
+ }
+
+ ut_free(buf2);
+
+ if (UNIV_UNLIKELY(space_id != id
+ || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: tablespace id and flags in file ",
+ stderr);
+ ut_print_filename(stderr, filepath);
+ fprintf(stderr, " are %lu and %lu, but in the InnoDB\n"
+ "InnoDB: data dictionary they are %lu and %lu.\n"
+ "InnoDB: Have you moved InnoDB .ibd files"
+ " around without using the\n"
+ "InnoDB: commands DISCARD TABLESPACE and"
+ " IMPORT TABLESPACE?\n"
+ "InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+ "InnoDB: for how to resolve the issue.\n",
+ (ulong) space_id, (ulong) space_flags,
+ (ulong) id, (ulong) flags);
+
+ success = FALSE;
+
+ goto func_exit;
+ }
+
+skip_check:
+ success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
+
+ if (!success) {
+ goto func_exit;
+ }
+
+ /* We do not measure the size of the file, that is why we pass the 0
+ below */
+
+ fil_node_create(filepath, 0, space_id, FALSE);
+func_exit:
+ os_file_close(file);
+ mem_free(filepath);
+
+ return(success);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Allocates a file name for an old version of a single-table tablespace.
+The string must be freed by caller with mem_free()!
+@return own: file name */
+static
+char*
+fil_make_ibbackup_old_name(
+/*=======================*/
+ const char* name) /*!< in: original file name */
+{
+ static const char suffix[] = "_ibbackup_old_vers_";
+ ulint len = strlen(name);
+ char* path = mem_alloc(len + (15 + sizeof suffix));
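+ /* the extra 15 bytes leave room for the timestamp that
+ ut_sprintf_timestamp_without_extra_chars() appends below */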
+
+ memcpy(path, name, len);
+ memcpy(path + len, suffix, (sizeof suffix) - 1);
+ ut_sprintf_timestamp_without_extra_chars(path + len + sizeof suffix);
+ return(path);
+}
+#endif /* UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Opens an .ibd file and adds the associated single-table tablespace to the
+InnoDB fil0fil.c data structures. */
+static
+void
+fil_load_single_table_tablespace(
+/*=============================*/
+ const char* dbname, /*!< in: database name */
+ const char* filename) /*!< in: file name (not a path),
+ including the .ibd extension */
+{
+ os_file_t file;
+ char* filepath;
+ ibool success;
+ byte* buf2;
+ byte* page;
+ ulint space_id;
+ ulint flags;
+ ulint size_low;
+ ulint size_high;
+ ib_uint64_t size;
+#ifdef UNIV_HOTBACKUP
+ fil_space_t* space;
+#endif
+ filepath = mem_alloc(strlen(dbname) + strlen(filename)
+ + strlen(fil_path_to_mysql_datadir) + 3);
+
+ sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname,
+ filename);
+ srv_normalize_path_for_win(filepath);
+#ifdef __WIN__
+# ifndef UNIV_HOTBACKUP
+ /* If lower_case_table_names is 0 or 2, then MySQL allows database
+ directory names with upper case letters. On Windows, all table and
+ database names in InnoDB are internally always in lower case. Put the
+ file path to lower case, so that we are consistent with InnoDB's
+ internal data dictionary. */
+
+ dict_casedn_str(filepath);
+# endif /* !UNIV_HOTBACKUP */
+#endif
+ file = os_file_create_simple_no_error_handling(
+ filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ fprintf(stderr,
+ "InnoDB: Error: could not open single-table tablespace"
+ " file\n"
+ "InnoDB: %s!\n"
+ "InnoDB: We do not continue the crash recovery,"
+ " because the table may become\n"
+ "InnoDB: corrupt if we cannot apply the log records"
+ " in the InnoDB log to it.\n"
+ "InnoDB: To fix the problem and start mysqld:\n"
+ "InnoDB: 1) If there is a permission problem"
+ " in the file and mysqld cannot\n"
+ "InnoDB: open the file, you should"
+ " modify the permissions.\n"
+ "InnoDB: 2) If the table is not needed, or you can"
+ " restore it from a backup,\n"
+ "InnoDB: then you can remove the .ibd file,"
+ " and InnoDB will do a normal\n"
+ "InnoDB: crash recovery and ignore that table.\n"
+ "InnoDB: 3) If the file system or the"
+ " disk is broken, and you cannot remove\n"
+ "InnoDB: the .ibd file, you can set"
+ " innodb_force_recovery > 0 in my.cnf\n"
+ "InnoDB: and force InnoDB to continue crash"
+ " recovery here.\n", filepath);
+
+ mem_free(filepath);
+
+ if (srv_force_recovery > 0) {
+ fprintf(stderr,
+ "InnoDB: innodb_force_recovery"
+ " was set to %lu. Continuing crash recovery\n"
+ "InnoDB: even though we cannot access"
+ " the .ibd file of this table.\n",
+ srv_force_recovery);
+ return;
+ }
+
+ exit(1);
+ }
+
+ success = os_file_get_size(file, &size_low, &size_high);
+
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ fprintf(stderr,
+ "InnoDB: Error: could not measure the size"
+ " of single-table tablespace file\n"
+ "InnoDB: %s!\n"
+ "InnoDB: We do not continue crash recovery,"
+ " because the table will become\n"
+ "InnoDB: corrupt if we cannot apply the log records"
+ " in the InnoDB log to it.\n"
+ "InnoDB: To fix the problem and start mysqld:\n"
+ "InnoDB: 1) If there is a permission problem"
+ " in the file and mysqld cannot\n"
+ "InnoDB: access the file, you should"
+ " modify the permissions.\n"
+ "InnoDB: 2) If the table is not needed,"
+ " or you can restore it from a backup,\n"
+ "InnoDB: then you can remove the .ibd file,"
+ " and InnoDB will do a normal\n"
+ "InnoDB: crash recovery and ignore that table.\n"
+ "InnoDB: 3) If the file system or the disk is broken,"
+ " and you cannot remove\n"
+ "InnoDB: the .ibd file, you can set"
+ " innodb_force_recovery > 0 in my.cnf\n"
+ "InnoDB: and force InnoDB to continue"
+ " crash recovery here.\n", filepath);
+
+ os_file_close(file);
+ mem_free(filepath);
+
+ if (srv_force_recovery > 0) {
+ fprintf(stderr,
+ "InnoDB: innodb_force_recovery"
+ " was set to %lu. Continuing crash recovery\n"
+ "InnoDB: even though we cannot access"
+ " the .ibd file of this table.\n",
+ srv_force_recovery);
+ return;
+ }
+
+ exit(1);
+ }
+
+ /* TODO: What to do in other cases where we cannot access an .ibd
+ file during a crash recovery? */
+
+ /* Every .ibd file is created >= 4 pages in size. Smaller files
+ cannot be ok. */
+
+ size = (((ib_uint64_t)size_high) << 32) + (ib_uint64_t)size_low;
+#ifndef UNIV_HOTBACKUP
+ if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: the size of single-table tablespace"
+ " file %s\n"
+ "InnoDB: is only %lu %lu, should be at least %lu!",
+ filepath,
+ (ulong) size_high,
+ (ulong) size_low, (ulong) (4 * UNIV_PAGE_SIZE));
+ os_file_close(file);
+ mem_free(filepath);
+
+ return;
+ }
+#endif
+ /* Read the first page of the tablespace if the size is big enough */
+
+ buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+ /* Align the memory for file i/o if we might have O_DIRECT set */
+ page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+ success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+ /* We have to read the tablespace id from the file */
+
+ space_id = fsp_header_get_space_id(page);
+ flags = fsp_header_get_flags(page);
+ } else {
+ space_id = ULINT_UNDEFINED;
+ flags = 0;
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace id %lu in file %s"
+ " is not sensible\n",
+ (ulong) space_id,
+ filepath);
+ goto func_exit;
+ }
+#else
+ if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
+ char* new_path;
+
+ fprintf(stderr,
+ "InnoDB: Renaming tablespace %s of id %lu,\n"
+ "InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+ "InnoDB: because its size %" PRId64 " is too small"
+ " (< 4 pages 16 kB each),\n"
+ "InnoDB: or the space id in the file header"
+ " is not sensible.\n"
+ "InnoDB: This can happen in an ibbackup run,"
+ " and is not dangerous.\n",
+ filepath, space_id, filepath, size);
+ os_file_close(file);
+
+ new_path = fil_make_ibbackup_old_name(filepath);
+ ut_a(os_file_rename(filepath, new_path));
+
+ ut_free(buf2);
+ mem_free(filepath);
+ mem_free(new_path);
+
+ return;
+ }
+
+ /* A backup may contain the same space several times, if the space got
+ renamed at a sensitive time. Since it is enough to have one version of
+ the space, we rename the file if a space with the same space id
+ already exists in the tablespace memory cache. We rather rename the
+ file than delete it, because if there is a bug, we do not want to
+ destroy valuable data. */
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (space) {
+ char* new_path;
+
+ fprintf(stderr,
+ "InnoDB: Renaming tablespace %s of id %lu,\n"
+ "InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+ "InnoDB: because space %s with the same id\n"
+ "InnoDB: was scanned earlier. This can happen"
+ " if you have renamed tables\n"
+ "InnoDB: during an ibbackup run.\n",
+ filepath, space_id, filepath,
+ space->name);
+ os_file_close(file);
+
+ new_path = fil_make_ibbackup_old_name(filepath);
+
+ mutex_exit(&fil_system->mutex);
+
+ ut_a(os_file_rename(filepath, new_path));
+
+ ut_free(buf2);
+ mem_free(filepath);
+ mem_free(new_path);
+
+ return;
+ }
+ mutex_exit(&fil_system->mutex);
+#endif
+ success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
+
+ if (!success) {
+
+ if (srv_force_recovery > 0) {
+ fprintf(stderr,
+ "InnoDB: innodb_force_recovery"
+ " was set to %lu. Continuing crash recovery\n"
+ "InnoDB: even though the tablespace creation"
+ " of this table failed.\n",
+ srv_force_recovery);
+ goto func_exit;
+ }
+
+ exit(1);
+ }
+
+ /* We do not use the size information we have about the file, because
+ the rounding formula for extents and pages is somewhat complex; we
+	let fil_node_open_file() do that task. */
+
+ fil_node_create(filepath, 0, space_id, FALSE);
+func_exit:
+ os_file_close(file);
+ ut_free(buf2);
+ mem_free(filepath);
+}
+
+/***********************************************************************//**
+A fault-tolerant function that tries to read the next file name in the
+directory. We retry 100 times if os_file_readdir_next_file() returns -1. The
+idea is to read as much good data as we can and jump over bad data.
+@return 0 if ok, -1 if error even after the retries, 1 if at the end
+of the directory */
+static
+int
+fil_file_readdir_next_file(
+/*=======================*/
+ ulint* err, /*!< out: this is set to DB_ERROR if an error
+ was encountered, otherwise not changed */
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
+{
+ ulint i;
+ int ret;
+
+ for (i = 0; i < 100; i++) {
+ ret = os_file_readdir_next_file(dirname, dir, info);
+
+ if (ret != -1) {
+
+ return(ret);
+ }
+
+ fprintf(stderr,
+ "InnoDB: Error: os_file_readdir_next_file()"
+ " returned -1 in\n"
+ "InnoDB: directory %s\n"
+ "InnoDB: Crash recovery may have failed"
+ " for some .ibd files!\n", dirname);
+
+ *err = DB_ERROR;
+ }
+
+ return(-1);
+}
+
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, and also so that we know where to apply log
+records whose space id is != 0.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_load_single_table_tablespaces(void)
+/*===================================*/
+{
+ int ret;
+ char* dbpath = NULL;
+ ulint dbpath_len = 100;
+ os_file_dir_t dir;
+ os_file_dir_t dbdir;
+ os_file_stat_t dbinfo;
+ os_file_stat_t fileinfo;
+ ulint err = DB_SUCCESS;
+
+ /* The datadir of MySQL is always the default directory of mysqld */
+
+ dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE);
+
+ if (dir == NULL) {
+
+ return(DB_ERROR);
+ }
+
+ dbpath = mem_alloc(dbpath_len);
+
+ /* Scan all directories under the datadir. They are the database
+ directories of MySQL. */
+
+ ret = fil_file_readdir_next_file(&err, fil_path_to_mysql_datadir, dir,
+ &dbinfo);
+ while (ret == 0) {
+ ulint len;
+ /* printf("Looking at %s in datadir\n", dbinfo.name); */
+
+ if (dbinfo.type == OS_FILE_TYPE_FILE
+ || dbinfo.type == OS_FILE_TYPE_UNKNOWN) {
+
+ goto next_datadir_item;
+ }
+
+		/* We found a symlink or a directory; try opening it to see
+		whether a symlink points to a directory */
+
+ len = strlen(fil_path_to_mysql_datadir)
+ + strlen (dbinfo.name) + 2;
+ if (len > dbpath_len) {
+ dbpath_len = len;
+
+ if (dbpath) {
+ mem_free(dbpath);
+ }
+
+ dbpath = mem_alloc(dbpath_len);
+ }
+ sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir,
+ dbinfo.name);
+ srv_normalize_path_for_win(dbpath);
+
+ dbdir = os_file_opendir(dbpath, FALSE);
+
+ if (dbdir != NULL) {
+ /* printf("Opened dir %s\n", dbinfo.name); */
+
+ /* We found a database directory; loop through it,
+ looking for possible .ibd files in it */
+
+ ret = fil_file_readdir_next_file(&err, dbpath, dbdir,
+ &fileinfo);
+ while (ret == 0) {
+ /* printf(
+ " Looking at file %s\n", fileinfo.name); */
+
+ if (fileinfo.type == OS_FILE_TYPE_DIR) {
+
+ goto next_file_item;
+ }
+
+ /* We found a symlink or a file */
+ if (strlen(fileinfo.name) > 4
+ && 0 == strcmp(fileinfo.name
+ + strlen(fileinfo.name) - 4,
+ ".ibd")) {
+ /* The name ends in .ibd; try opening
+ the file */
+ fil_load_single_table_tablespace(
+ dbinfo.name, fileinfo.name);
+ }
+next_file_item:
+ ret = fil_file_readdir_next_file(&err,
+ dbpath, dbdir,
+ &fileinfo);
+ }
+
+ if (0 != os_file_closedir(dbdir)) {
+ fputs("InnoDB: Warning: could not"
+ " close database directory ", stderr);
+ ut_print_filename(stderr, dbpath);
+ putc('\n', stderr);
+
+ err = DB_ERROR;
+ }
+ }
+
+next_datadir_item:
+ ret = fil_file_readdir_next_file(&err,
+ fil_path_to_mysql_datadir,
+ dir, &dbinfo);
+ }
+
+ mem_free(dbpath);
+
+ if (0 != os_file_closedir(dir)) {
+ fprintf(stderr,
+ "InnoDB: Error: could not close MySQL datadir\n");
+
+ return(DB_ERROR);
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return TRUE if it does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ ulint id, /*!< in: space id */
+ ib_int64_t version)/*!< in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ if (space == NULL || space->is_being_deleted) {
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ if (version != ((ib_int64_t)-1)
+ && space->tablespace_version != version) {
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(space != NULL);
+}
+
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ ulint id, /*!< in: space id */
+ const char* name, /*!< in: table name in the standard
+ 'databasename/tablename' format or
+ the dir path to a temp table */
+ ibool is_temp, /*!< in: TRUE if created with CREATE
+ TEMPORARY TABLE */
+ ibool mark_space, /*!< in: in crash recovery, at database
+ startup we mark all spaces which have
+ an associated table in the InnoDB
+ data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist)
+ /*!< in: print detailed error
+ information to the .err log if a
+ matching tablespace is not found from
+ memory */
+{
+ fil_space_t* namespace;
+ fil_space_t* space;
+ char* path;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ path = fil_make_ibd_name(name, is_temp);
+
+ /* Look if there is a space with the same id */
+
+ space = fil_space_get_by_id(id);
+
+ /* Look if there is a space with the same name; the name is the
+ directory path from the datadir to the file */
+
+ namespace = fil_space_get_by_name(path);
+ if (space && space == namespace) {
+ /* Found */
+
+ if (mark_space) {
+ space->mark = TRUE;
+ }
+
+ mem_free(path);
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ if (!print_error_if_does_not_exist) {
+
+ mem_free(path);
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ if (space == NULL) {
+ if (namespace == NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary"
+ " has tablespace id %lu,\n"
+ "InnoDB: but tablespace with that id"
+ " or name does not exist. Have\n"
+ "InnoDB: you deleted or moved .ibd files?\n"
+ "InnoDB: This may also be a table created with"
+ " CREATE TEMPORARY TABLE\n"
+ "InnoDB: whose .ibd and .frm files"
+ " MySQL automatically removed, but the\n"
+ "InnoDB: table still exists in the"
+ " InnoDB internal data dictionary.\n",
+ (ulong) id);
+ } else {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary has"
+ " tablespace id %lu,\n"
+ "InnoDB: but a tablespace with that id"
+ " does not exist. There is\n"
+ "InnoDB: a tablespace of name %s and id %lu,"
+ " though. Have\n"
+ "InnoDB: you deleted or moved .ibd files?\n",
+ (ulong) id, namespace->name,
+ (ulong) namespace->id);
+ }
+error_exit:
+ fputs("InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+ "InnoDB: for how to resolve the issue.\n", stderr);
+
+ mem_free(path);
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+ }
+
+ if (0 != strcmp(space->name, path)) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_filename(stderr, name);
+ fprintf(stderr, "\n"
+ "InnoDB: in InnoDB data dictionary has"
+ " tablespace id %lu,\n"
+ "InnoDB: but the tablespace with that id"
+ " has name %s.\n"
+ "InnoDB: Have you deleted or moved .ibd files?\n",
+ (ulong) id, space->name);
+
+ if (namespace != NULL) {
+ fputs("InnoDB: There is a tablespace"
+ " with the right name\n"
+ "InnoDB: ", stderr);
+ ut_print_filename(stderr, namespace->name);
+ fprintf(stderr, ", but its id is %lu.\n",
+ (ulong) namespace->id);
+ }
+
+ goto error_exit;
+ }
+
+ mem_free(path);
+ mutex_exit(&fil_system->mutex);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return space id, ULINT_UNDEFINED if not found */
+static
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+ const char* name) /*!< in: table name in the standard
+ 'databasename/tablename' format */
+{
+ fil_space_t* namespace;
+ ulint id = ULINT_UNDEFINED;
+ char* path;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ path = fil_make_ibd_name(name, FALSE);
+
+ /* Look if there is a space with the same name; the name is the
+ directory path to the file */
+
+ namespace = fil_space_get_by_name(path);
+
+ if (namespace) {
+ id = namespace->id;
+ }
+
+ mem_free(path);
+
+ mutex_exit(&fil_system->mutex);
+
+ return(id);
+}
+
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ ulint* actual_size, /*!< out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /*!< in: space id */
+ ulint size_after_extend)/*!< in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+{
+ fil_node_t* node;
+ fil_space_t* space;
+ byte* buf2;
+ byte* buf;
+ ulint buf_size;
+ ulint start_page_no;
+ ulint file_start_page_no;
+ ulint offset_high;
+ ulint offset_low;
+ ulint page_size;
+ ibool success = TRUE;
+
+ fil_mutex_enter_and_prepare_for_io(space_id);
+
+ space = fil_space_get_by_id(space_id);
+ ut_a(space);
+
+ if (space->size >= size_after_extend) {
+ /* Space already big enough */
+
+ *actual_size = space->size;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+ }
+
+ page_size = dict_table_flags_to_zip_size(space->flags);
+ if (!page_size) {
+ page_size = UNIV_PAGE_SIZE;
+ }
+
+ node = UT_LIST_GET_LAST(space->chain);
+
+ fil_node_prepare_for_io(node, fil_system, space);
+
+ start_page_no = space->size;
+ file_start_page_no = space->size - node->size;
+
+ /* Extend at most 64 pages at a time */
+ buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
+ buf2 = mem_alloc(buf_size + page_size);
+ buf = ut_align(buf2, page_size);
+
+ memset(buf, 0, buf_size);
+
+ while (start_page_no < size_after_extend) {
+ ulint n_pages = ut_min(buf_size / page_size,
+ size_after_extend - start_page_no);
+
+ offset_high = (start_page_no - file_start_page_no)
+ / (4096 * ((1024 * 1024) / page_size));
+ offset_low = ((start_page_no - file_start_page_no)
+ % (4096 * ((1024 * 1024) / page_size)))
+ * page_size;
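+
+		/* The two expressions above split the 64-bit byte offset of
+		the page within the file (page offset * page_size) into two
+		32-bit halves: 4096 * (1024 * 1024 / page_size) pages span
+		exactly 4 GiB. For example, with a 16 kB page size, page
+		300000 of the file starts at byte 300000 * 16384
+		= 4915200000, giving offset_high = 1 and
+		offset_low = 620232704. */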
+#ifdef UNIV_HOTBACKUP
+ success = os_file_write(node->name, node->handle, buf,
+ offset_low, offset_high,
+ page_size * n_pages);
+#else
+ success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
+ node->name, node->handle, buf,
+ offset_low, offset_high,
+ page_size * n_pages,
+ NULL, NULL, NULL);
+#endif
+ if (success) {
+ node->size += n_pages;
+ space->size += n_pages;
+
+ os_has_said_disk_full = FALSE;
+ } else {
+ /* Let us measure the size of the file to determine
+ how much we were able to extend it */
+
+ n_pages = ((ulint)
+ (os_file_get_size_as_iblonglong(
+ node->handle)
+ / page_size)) - node->size;
+
+ node->size += n_pages;
+ space->size += n_pages;
+
+ break;
+ }
+
+ start_page_no += n_pages;
+ }
+
+ mem_free(buf2);
+
+ fil_node_complete_io(node, fil_system, OS_FILE_WRITE);
+
+ *actual_size = space->size;
+
+#ifndef UNIV_HOTBACKUP
+ if (space_id == 0) {
+ ulint pages_per_mb = (1024 * 1024) / page_size;
+
+ /* Keep the last data file size info up to date, rounded to
+ full megabytes */
+
+ srv_data_file_sizes[srv_n_data_files - 1]
+ = (node->size / pages_per_mb) * pages_per_mb;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /*
+ printf("Extended %s to %lu, actual size %lu pages\n", space->name,
+ size_after_extend, *actual_size); */
+ mutex_exit(&fil_system->mutex);
+
+ fil_flush(space_id);
+
+ return(success);
+}
+
+#ifdef UNIV_HOTBACKUP
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void)
+/*======================================*/
+{
+ fil_space_t* space;
+ byte* buf;
+ ulint actual_size;
+ ulint size_in_header;
+ ulint error;
+ ibool success;
+
+ buf = mem_alloc(UNIV_PAGE_SIZE);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+ while (space) {
+ ut_a(space->purpose == FIL_TABLESPACE);
+
+ mutex_exit(&fil_system->mutex); /* no need to protect with a
+ mutex, because this is a
+ single-threaded operation */
+ error = fil_read(TRUE, space->id,
+ dict_table_flags_to_zip_size(space->flags),
+ 0, 0, UNIV_PAGE_SIZE, buf, NULL);
+ ut_a(error == DB_SUCCESS);
+
+ size_in_header = fsp_get_size_low(buf);
+
+ success = fil_extend_space_to_desired_size(
+ &actual_size, space->id, size_in_header);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: could not extend the"
+ " tablespace of %s\n"
+ "InnoDB: to the size stored in header,"
+ " %lu pages;\n"
+ "InnoDB: size after extension %lu pages\n"
+ "InnoDB: Check that you have free disk space"
+ " and retry!\n",
+ space->name, size_in_header, actual_size);
+ exit(1);
+ }
+
+ mutex_enter(&fil_system->mutex);
+
+ space = UT_LIST_GET_NEXT(space_list, space);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ mem_free(buf);
+}
+#endif
+
+/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/
+
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return TRUE if succeed */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_free_now, /*!< in: number of free extents now */
+ ulint n_to_reserve) /*!< in: how many one wants to reserve */
+{
+ fil_space_t* space;
+ ibool success;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ if (space->n_reserved_extents + n_to_reserve > n_free_now) {
+ success = FALSE;
+ } else {
+ space->n_reserved_extents += n_to_reserve;
+ success = TRUE;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(success);
+}
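+
+/* A caller typically reserves extents before starting an operation that may
+need several new pages (such as a B-tree page split), performs the operation,
+and then returns the reservation with fil_space_release_free_extents(), so
+that concurrent operations cannot each count on the same free extents. */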
+
+/*******************************************************************//**
+Releases free extents in a file space. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_reserved) /*!< in: how many one reserved */
+{
+ fil_space_t* space;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+ ut_a(space->n_reserved_extents >= n_reserved);
+
+ space->n_reserved_extents -= n_reserved;
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+ ulint id) /*!< in: space id */
+{
+ fil_space_t* space;
+ ulint n;
+
+ ut_ad(fil_system);
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(id);
+
+ ut_a(space);
+
+ n = space->n_reserved_extents;
+
+ mutex_exit(&fil_system->mutex);
+
+ return(n);
+}
+
+/*============================ FILE I/O ================================*/
+
+/********************************************************************//**
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex. */
+static
+void
+fil_node_prepare_for_io(
+/*====================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ fil_space_t* space) /*!< in: space */
+{
+ ut_ad(node && system && space);
+ ut_ad(mutex_own(&(system->mutex)));
+
+ if (system->n_open > system->max_n_open + 5) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: open files %lu"
+ " exceeds the limit %lu\n",
+ (ulong) system->n_open,
+ (ulong) system->max_n_open);
+ }
+
+ if (node->open == FALSE) {
+ /* File is closed: open it */
+ ut_a(node->n_pending == 0);
+
+ fil_node_open_file(node, system, space);
+ }
+
+ if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE
+ && !trx_sys_sys_space(space->id)) {
+ /* The node is in the LRU list, remove it */
+
+ ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+ UT_LIST_REMOVE(LRU, system->LRU, node);
+ }
+
+ node->n_pending++;
+}
+
+/********************************************************************//**
+Updates the data structures when an i/o operation finishes. Updates the
+pending i/o's field in the node appropriately. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+ fil_node_t* node, /*!< in: file node */
+ fil_system_t* system, /*!< in: tablespace memory cache */
+ ulint type) /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
+ the node as modified if
+ type == OS_FILE_WRITE */
+{
+ ut_ad(node);
+ ut_ad(system);
+ ut_ad(mutex_own(&(system->mutex)));
+
+ ut_a(node->n_pending > 0);
+
+ node->n_pending--;
+
+ if (type == OS_FILE_WRITE) {
+ system->modification_counter++;
+ node->modification_counter = system->modification_counter;
+
+ if (!node->space->is_in_unflushed_spaces) {
+
+ node->space->is_in_unflushed_spaces = TRUE;
+ UT_LIST_ADD_FIRST(unflushed_spaces,
+ system->unflushed_spaces,
+ node->space);
+ }
+ }
+
+ if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE
+ && !trx_sys_sys_space(node->space->id)) {
+ /* The node must be put back to the LRU list */
+ UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+ }
+}
+
+/********************************************************************//**
+Report information about an invalid page access. */
+static
+void
+fil_report_invalid_page_access(
+/*===========================*/
+ ulint block_offset, /*!< in: block offset */
+ ulint space_id, /*!< in: space id */
+ const char* space_name, /*!< in: space name */
+ ulint byte_offset, /*!< in: byte offset */
+ ulint len, /*!< in: I/O length */
+ ulint type) /*!< in: I/O type */
+{
+ fprintf(stderr,
+ "InnoDB: Error: trying to access page number %lu"
+ " in space %lu,\n"
+ "InnoDB: space name %s,\n"
+ "InnoDB: which is outside the tablespace bounds.\n"
+ "InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n"
+ "InnoDB: If you get this error at mysqld startup,"
+ " please check that\n"
+ "InnoDB: your my.cnf matches the ibdata files"
+ " that you have in the\n"
+ "InnoDB: MySQL server.\n",
+ (ulong) block_offset, (ulong) space_id, space_name,
+ (ulong) byte_offset, (ulong) len, (ulong) type);
+}
+
+/********************************************************************//**
+Reads or writes data. This operation is asynchronous (aio).
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INTERN
+ulint
+_fil_io(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+ void* buf, /*!< in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message, /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+ trx_t* trx)
+{
+ ulint mode;
+ fil_space_t* space;
+ fil_node_t* node;
+ ulint offset_high;
+ ulint offset_low;
+ ibool ret;
+ ulint is_log;
+ ulint wake_later;
+
+ is_log = type & OS_FILE_LOG;
+ type = type & ~OS_FILE_LOG;
+
+ wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
+ type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+ ut_ad(byte_offset < UNIV_PAGE_SIZE);
+ ut_ad(!zip_size || !byte_offset);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(buf);
+ ut_ad(len > 0);
+//#if (1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE
+//# error "(1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE"
+//#endif
+ ut_ad(fil_validate());
+#ifndef UNIV_HOTBACKUP
+# ifndef UNIV_LOG_DEBUG
+ /* ibuf bitmap pages must be read in the sync aio mode: */
+ ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE)
+ || !ibuf_bitmap_page(zip_size, block_offset)
+ || sync || is_log);
+ ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE)
+ || ibuf_page(space_id, zip_size, block_offset, NULL));
+# endif /* UNIV_LOG_DEBUG */
+ if (sync) {
+ mode = OS_AIO_SYNC;
+ } else if (is_log) {
+ mode = OS_AIO_LOG;
+ } else if (type == OS_FILE_READ
+ && !recv_no_ibuf_operations
+ && ibuf_page(space_id, zip_size, block_offset, NULL)) {
+ mode = OS_AIO_IBUF;
+ } else {
+ mode = OS_AIO_NORMAL;
+ }
+#else /* !UNIV_HOTBACKUP */
+ ut_a(sync);
+ mode = OS_AIO_SYNC;
+#endif /* !UNIV_HOTBACKUP */
+
+ if (type == OS_FILE_READ) {
+ srv_data_read+= len;
+ } else if (type == OS_FILE_WRITE) {
+ srv_data_written+= len;
+ }
+
+ /* Reserve the fil_system mutex and make sure that we can open at
+ least one file while holding it, if the file is not already open */
+
+ fil_mutex_enter_and_prepare_for_io(space_id);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (!space) {
+ mutex_exit(&fil_system->mutex);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: trying to do i/o"
+ " to a tablespace which does not exist.\n"
+ "InnoDB: i/o type %lu, space id %lu,"
+ " page no. %lu, i/o length %lu bytes\n",
+ (ulong) type, (ulong) space_id, (ulong) block_offset,
+ (ulong) len);
+
+ return(DB_TABLESPACE_DELETED);
+ }
+
+ ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(node == NULL)) {
+ fil_report_invalid_page_access(
+ block_offset, space_id, space->name,
+ byte_offset, len, type);
+
+ ut_error;
+ }
+
+ if (space->id != 0 && node->size == 0) {
+ /* We do not know the size of a single-table tablespace
+ before we open the file */
+
+ break;
+ }
+
+ if (node->size > block_offset) {
+ /* Found! */
+ break;
+ } else {
+ block_offset -= node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+
+ /* Open file if closed */
+ fil_node_prepare_for_io(node, fil_system, space);
+
+ /* Check that at least the start offset is within the bounds of a
+ single-table tablespace */
+ if (UNIV_UNLIKELY(node->size <= block_offset)
+ && space->id != 0 && space->purpose == FIL_TABLESPACE) {
+
+ fil_report_invalid_page_access(
+ block_offset, space_id, space->name, byte_offset,
+ len, type);
+
+ ut_error;
+ }
+
+ /* Now we have made the changes in the data structures of fil_system */
+ mutex_exit(&fil_system->mutex);
+
+ /* Calculate the low 32 bits and the high 32 bits of the file offset */
+
+ if (!zip_size) {
+ offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
+ offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT)
+ & 0xFFFFFFFFUL) + byte_offset;
+
+ ut_a(node->size - block_offset
+ >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
+ / UNIV_PAGE_SIZE));
+ } else {
+ ulint zip_size_shift;
+ switch (zip_size) {
+ case 1024: zip_size_shift = 10; break;
+ case 2048: zip_size_shift = 11; break;
+ case 4096: zip_size_shift = 12; break;
+ case 8192: zip_size_shift = 13; break;
+ case 16384: zip_size_shift = 14; break;
+ default: ut_error;
+ }
+ offset_high = block_offset >> (32 - zip_size_shift);
+ offset_low = (block_offset << zip_size_shift & 0xFFFFFFFFUL)
+ + byte_offset;
+ ut_a(node->size - block_offset
+ >= (len + (zip_size - 1)) / zip_size);
+ }
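+
+	/* Both branches split the 64-bit file offset (block_offset times the
+	page or zip size, plus byte_offset) into two 32-bit halves for the
+	file i/o routines. For example, with zip_size = 8192 (shift 13),
+	block_offset 1000000 maps to byte offset 8192000000, i.e.
+	offset_high = 1 and offset_low = 3897032704. */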
+
+ /* Do aio */
+
+ ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+
+ if (srv_pass_corrupt_table && space->is_corrupt) {
+ /* should ignore i/o for the crashed space */
+ mutex_enter(&fil_system->mutex);
+ fil_node_complete_io(node, fil_system, type);
+ mutex_exit(&fil_system->mutex);
+ if (mode == OS_AIO_NORMAL) {
+ ut_a(space->purpose == FIL_TABLESPACE);
+ buf_page_io_complete(message, trx);
+ }
+ if (type == OS_FILE_READ) {
+ return(DB_TABLESPACE_DELETED);
+ } else {
+ return(DB_SUCCESS);
+ }
+ } else {
+ ut_a(!space->is_corrupt);
+#ifdef UNIV_HOTBACKUP
+ /* In ibbackup do normal i/o, not aio */
+ if (type == OS_FILE_READ) {
+ ret = os_file_read(node->handle, buf, offset_low, offset_high,
+ len);
+ } else {
+ ret = os_file_write(node->name, node->handle, buf,
+ offset_low, offset_high, len);
+ }
+#else
+ /* Queue the aio request */
+ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
+ offset_low, offset_high, len, node, message, trx);
+#endif
+	}
+
+ ut_a(ret);
+
+ if (mode == OS_AIO_SYNC) {
+ /* The i/o operation is already completed when we return from
+ os_aio: */
+
+ mutex_enter(&fil_system->mutex);
+
+ fil_node_complete_io(node, fil_system, type);
+
+ mutex_exit(&fil_system->mutex);
+
+ ut_ad(fil_validate());
+ }
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Confirms whether the parameters are valid, i.e. whether the tablespace
+exists and the given area lies within its bounds */
+UNIV_INTERN
+ibool
+fil_area_is_exist(
+/*==============*/
+ ulint space_id, /*!< in: space id */
+ ulint zip_size __attribute__((unused)),
+ /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset __attribute__((unused)),
+ /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len __attribute__((unused)))
+ /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+
+ /* Reserve the fil_system mutex and make sure that we can open at
+ least one file while holding it, if the file is not already open */
+
+ fil_mutex_enter_and_prepare_for_io(space_id);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (!space) {
+ mutex_exit(&fil_system->mutex);
+ return(FALSE);
+ }
+
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(node == NULL)) {
+ mutex_exit(&fil_system->mutex);
+ return(FALSE);
+ }
+
+ if (space->id != 0 && node->size == 0) {
+ /* We do not know the size of a single-table tablespace
+ before we open the file */
+
+ break;
+ }
+
+ if (node->size > block_offset) {
+ /* Found! */
+ break;
+ } else {
+ block_offset -= node->size;
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+ }
+
+ /* Open file if closed */
+ fil_node_prepare_for_io(node, fil_system, space);
+ fil_node_complete_io(node, fil_system, OS_FILE_READ);
+
+ /* Check that at least the start offset is within the bounds of a
+ single-table tablespace */
+ if (UNIV_UNLIKELY(node->size <= block_offset)
+ && space->id != 0 && space->purpose == FIL_TABLESPACE) {
+ mutex_exit(&fil_system->mutex);
+ return(FALSE);
+ }
+
+ mutex_exit(&fil_system->mutex);
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is used by the i/o
+handler threads to serve completed requests. The aio array of pending
+requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. */
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment) /*!< in: the number of the segment in the aio
+ array to wait for */
+{
+ ibool ret;
+ fil_node_t* fil_node;
+ void* message;
+ ulint type;
+
+ ut_ad(fil_validate());
+
+ if (os_aio_use_native_aio) {
+ srv_set_io_thread_op_info(segment, "native aio handle");
+#ifdef WIN_ASYNC_IO
+ ret = os_aio_windows_handle(segment, 0, &fil_node,
+ &message, &type);
+#else
+ ret = 0; /* Eliminate compiler warning */
+ ut_error;
+#endif
+ } else {
+ srv_set_io_thread_op_info(segment, "simulated aio handle");
+
+ ret = os_aio_simulated_handle(segment, &fil_node,
+ &message, &type);
+ }
+
+ ut_a(ret);
+
+ srv_set_io_thread_op_info(segment, "complete io for fil node");
+
+ mutex_enter(&fil_system->mutex);
+
+ fil_node_complete_io(fil_node, fil_system, type);
+
+ mutex_exit(&fil_system->mutex);
+
+ ut_ad(fil_validate());
+
+ /* Do the i/o handling */
+ /* IMPORTANT: since i/o handling for reads will read also the insert
+ buffer in tablespace 0, you have to be very careful not to introduce
+ deadlocks in the i/o system. We keep tablespace 0 data files always
+ open, and use a special i/o thread to serve insert buffer requests. */
+
+ if (fil_node->space->purpose == FIL_TABLESPACE) {
+ srv_set_io_thread_op_info(segment, "complete io for buf page");
+ buf_page_io_complete(message, NULL);
+ } else {
+ srv_set_io_thread_op_info(segment, "complete io for log");
+ log_io_complete(message);
+ }
+}
+#endif /* UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+ ulint space_id) /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+{
+ fil_space_t* space;
+ fil_node_t* node;
+ os_file_t file;
+ ib_int64_t old_mod_counter;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (!space || space->is_being_deleted) {
+ mutex_exit(&fil_system->mutex);
+
+ return;
+ }
+
+ space->n_pending_flushes++; /*!< prevent dropping of the space while
+ we are flushing */
+ node = UT_LIST_GET_FIRST(space->chain);
+
+ while (node) {
+ if (node->modification_counter > node->flush_counter) {
+ ut_a(node->open);
+
+ /* We want to flush the changes at least up to
+ old_mod_counter */
+ old_mod_counter = node->modification_counter;
+
+ if (space->purpose == FIL_TABLESPACE) {
+ fil_n_pending_tablespace_flushes++;
+ } else {
+ fil_n_pending_log_flushes++;
+ fil_n_log_flushes++;
+ }
+#ifdef __WIN__
+ if (node->is_raw_disk) {
+
+ goto skip_flush;
+ }
+#endif
+retry:
+ if (node->n_pending_flushes > 0) {
+ /* We want to avoid calling os_file_flush() on
+ the file twice at the same time, because we do
+ not know what bugs OS's may contain in file
+ i/o; sleep for a while */
+
+ mutex_exit(&fil_system->mutex);
+
+ os_thread_sleep(20000);
+
+ mutex_enter(&fil_system->mutex);
+
+ if (node->flush_counter >= old_mod_counter) {
+
+ goto skip_flush;
+ }
+
+ goto retry;
+ }
+
+ ut_a(node->open);
+ file = node->handle;
+ node->n_pending_flushes++;
+
+ mutex_exit(&fil_system->mutex);
+
+ /* fprintf(stderr, "Flushing to file %s\n",
+ node->name); */
+
+ os_file_flush(file);
+
+ mutex_enter(&fil_system->mutex);
+
+ node->n_pending_flushes--;
+skip_flush:
+ if (node->flush_counter < old_mod_counter) {
+ node->flush_counter = old_mod_counter;
+
+ if (space->is_in_unflushed_spaces
+ && fil_space_is_flushed(space)) {
+
+ space->is_in_unflushed_spaces = FALSE;
+
+ UT_LIST_REMOVE(
+ unflushed_spaces,
+ fil_system->unflushed_spaces,
+ space);
+ }
+ }
+
+ if (space->purpose == FIL_TABLESPACE) {
+ fil_n_pending_tablespace_flushes--;
+ } else {
+ fil_n_pending_log_flushes--;
+ }
+ }
+
+ node = UT_LIST_GET_NEXT(chain, node);
+ }
+
+ space->n_pending_flushes--;
+
+ mutex_exit(&fil_system->mutex);
+}
+
+/**********************************************************************//**
+Flushes to disk the writes in file spaces of the given type possibly cached by
+the OS. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+ ulint purpose) /*!< in: FIL_TABLESPACE, FIL_LOG */
+{
+ fil_space_t* space;
+ ulint* space_ids;
+ ulint n_space_ids;
+ ulint i;
+
+ mutex_enter(&fil_system->mutex);
+
+ n_space_ids = UT_LIST_GET_LEN(fil_system->unflushed_spaces);
+ if (n_space_ids == 0) {
+
+ mutex_exit(&fil_system->mutex);
+ return;
+ }
+
+ /* Assemble a list of space ids to flush. Previously, we
+ traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
+ on a space that was just removed from the list by fil_flush().
+ Thus, the space could be dropped and the memory overwritten. */
+ space_ids = mem_alloc(n_space_ids * sizeof *space_ids);
+
+ n_space_ids = 0;
+
+ for (space = UT_LIST_GET_FIRST(fil_system->unflushed_spaces);
+ space;
+ space = UT_LIST_GET_NEXT(unflushed_spaces, space)) {
+
+ if (space->purpose == purpose && !space->is_being_deleted) {
+
+ space_ids[n_space_ids++] = space->id;
+ }
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ /* Flush the spaces. It will not hurt to call fil_flush() on
+ a non-existing space id. */
+ for (i = 0; i < n_space_ids; i++) {
+
+ fil_flush(space_ids[i]);
+ }
+
+ mem_free(space_ids);
+}
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void)
+/*==============*/
+{
+ fil_space_t* space;
+ fil_node_t* fil_node;
+ ulint n_open = 0;
+ ulint i;
+
+ mutex_enter(&fil_system->mutex);
+
+ /* Look for spaces in the hash table */
+
+ for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
+
+ space = HASH_GET_FIRST(fil_system->spaces, i);
+
+ while (space != NULL) {
+ UT_LIST_VALIDATE(chain, fil_node_t, space->chain,
+ ut_a(ut_list_node_313->open
+ || !ut_list_node_313->n_pending));
+
+ fil_node = UT_LIST_GET_FIRST(space->chain);
+
+ while (fil_node != NULL) {
+ if (fil_node->n_pending > 0) {
+ ut_a(fil_node->open);
+ }
+
+ if (fil_node->open) {
+ n_open++;
+ }
+ fil_node = UT_LIST_GET_NEXT(chain, fil_node);
+ }
+ space = HASH_GET_NEXT(hash, space);
+ }
+ }
+
+ ut_a(fil_system->n_open == n_open);
+
+ UT_LIST_VALIDATE(LRU, fil_node_t, fil_system->LRU, (void) 0);
+
+ fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+
+ while (fil_node != NULL) {
+ ut_a(fil_node->n_pending == 0);
+ ut_a(fil_node->open);
+ ut_a(fil_node->space->purpose == FIL_TABLESPACE);
+ ut_a(!trx_sys_sys_space(fil_node->space->id));
+
+ fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+ fil_addr_t addr) /*!< in: address */
+{
+ return(addr.page == FIL_NULL);
+}
+
+/********************************************************************//**
+Get the predecessor of a file page.
+@return FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+ const byte* page) /*!< in: file page */
+{
+ return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************************//**
+Get the successor of a file page.
+@return FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+ const byte* page) /*!< in: file page */
+{
+ return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type) /*!< in: type */
+{
+ ut_ad(page);
+
+ mach_write_to_2(page + FIL_PAGE_TYPE, type);
+}
+
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to the page, the
+return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+ const byte* page) /*!< in: file page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_2(page + FIL_PAGE_TYPE));
+}
+
+/********************************************************************
+Closes and frees the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_close(void)
+/*===========*/
+{
+#ifndef UNIV_HOTBACKUP
+ /* The mutex should already have been freed. */
+ ut_ad(fil_system->mutex.magic_n == 0);
+#endif /* !UNIV_HOTBACKUP */
+
+ hash_table_free(fil_system->spaces);
+
+ hash_table_free(fil_system->name_hash);
+
+ ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0);
+ ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0);
+ ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0);
+
+ mem_free(fil_system);
+
+ fil_system = NULL;
+}
+
+/*************************************************************************
+Return local hash table information. */
+
+ulint
+fil_system_hash_cells(void)
+/*=======================*/
+{
+ if (fil_system) {
+ return (fil_system->spaces->n_cells
+ + fil_system->name_hash->n_cells);
+ } else {
+ return 0;
+ }
+}
+
+ulint
+fil_system_hash_nodes(void)
+/*=======================*/
+{
+ if (fil_system) {
+ return (UT_LIST_GET_LEN(fil_system->space_list)
+ * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
+ } else {
+ return 0;
+ }
+}
+
+/*************************************************************************
+Functions to access the is_corrupt flag of fil_space_t. */
+
+ibool
+fil_space_is_corrupt(
+/*=================*/
+ ulint space_id)
+{
+ fil_space_t* space;
+ ibool ret = FALSE;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (space && space->is_corrupt) {
+ ret = TRUE;
+ }
+
+ mutex_exit(&fil_system->mutex);
+
+ return(ret);
+}
+
+void
+fil_space_set_corrupt(
+/*==================*/
+ ulint space_id)
+{
+ fil_space_t* space;
+
+ mutex_enter(&fil_system->mutex);
+
+ space = fil_space_get_by_id(space_id);
+
+ if (space) {
+ space->is_corrupt = TRUE;
+ }
+
+ mutex_exit(&fil_system->mutex);
+}
+
diff --git a/storage/xtradb/fsp/fsp0fsp.c b/storage/xtradb/fsp/fsp0fsp.c
new file mode 100644
index 00000000000..cd28186109f
--- /dev/null
+++ b/storage/xtradb/fsp/fsp0fsp.c
@@ -0,0 +1,4346 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fsp/fsp0fsp.c
+File space management
+
+Created 11/29/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0fsp.h"
+
+#ifdef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "page0page.h"
+#include "page0zip.h"
+#ifdef UNIV_HOTBACKUP
+# include "fut0lst.h"
+#else /* UNIV_HOTBACKUP */
+# include "sync0sync.h"
+# include "fut0fut.h"
+# include "srv0srv.h"
+# include "ibuf0ibuf.h"
+# include "btr0btr.h"
+# include "btr0sea.h"
+# include "dict0boot.h"
+# include "log0log.h"
+#endif /* UNIV_HOTBACKUP */
+#include "dict0mem.h"
+#include "trx0sys.h"
+
+#define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header
+ within a file page */
+
+/* The data structures in files are defined just as byte strings in C */
+typedef byte fsp_header_t;
+typedef byte xdes_t;
+
+/* SPACE HEADER
+ ============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID 0 /* space id */
+#define FSP_NOT_USED 4 /* this field contained a value up to
+ which we know that the modifications
+ in the database have been flushed to
+ the file space; not used now */
+#define FSP_SIZE 8 /* Current size of the space in
+ pages */
+#define FSP_FREE_LIMIT 12 /* Minimum page number for which the
+ free list has not been initialized:
+ the pages >= this limit are, by
+ definition, free; note that in a
+ single-table tablespace where size
+ < 64 pages, this number is 64, i.e.,
+ we have initialized the space
+ about the first extent, but have not
+				physically allocated those pages to the
+ file */
+#define FSP_SPACE_FLAGS 16 /* table->flags & ~DICT_TF_COMPACT */
+#define FSP_FRAG_N_USED 20 /* number of used pages in the
+ FSP_FREE_FRAG list */
+#define FSP_FREE 24 /* list of free extents */
+#define FSP_FREE_FRAG (24 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents not
+ belonging to any segment */
+#define FSP_FULL_FRAG (24 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents not belonging
+ to any segment */
+#define FSP_SEG_ID (24 + 3 * FLST_BASE_NODE_SIZE)
+ /* 8 bytes which give the first unused
+ segment id */
+#define FSP_SEG_INODES_FULL (32 + 3 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where all the segment inode
+ slots are reserved */
+#define FSP_SEG_INODES_FREE (32 + 4 * FLST_BASE_NODE_SIZE)
+ /* list of pages containing segment
+ headers, where not all the segment
+ header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define FSP_HEADER_SIZE (32 + 5 * FLST_BASE_NODE_SIZE)
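+
+/* With FLST_BASE_NODE_SIZE being 16 bytes (a 4-byte list length plus two
+6-byte file addresses), the header above works out to 32 + 5 * 16
+= 112 bytes, stored at offset FSP_HEADER_OFFSET of the first page of the
+space. */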
+
+#define FSP_FREE_ADD 4 /* this many free extents are added
+ to the free list from above
+ FSP_FREE_LIMIT at a time */
+
+/* FILE SEGMENT INODE
+ ==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef byte fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE FSEG_PAGE_DATA
+ /* the list node for linking
+ segment inode pages */
+
+#define FSEG_ARR_OFFSET (FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define FSEG_ID 0 /* 8 bytes of segment id: if this is
+ ut_dulint_zero, it means that the
+ header is unused */
+#define FSEG_NOT_FULL_N_USED 8
+ /* number of used segment pages in
+ the FSEG_NOT_FULL list */
+#define FSEG_FREE 12
+ /* list of free extents of this
+ segment */
+#define FSEG_NOT_FULL (12 + FLST_BASE_NODE_SIZE)
+ /* list of partially free extents */
+#define FSEG_FULL (12 + 2 * FLST_BASE_NODE_SIZE)
+ /* list of full extents */
+#define FSEG_MAGIC_N (12 + 3 * FLST_BASE_NODE_SIZE)
+ /* magic number used in debugging */
+#define FSEG_FRAG_ARR (16 + 3 * FLST_BASE_NODE_SIZE)
+ /* array of individual pages
+ belonging to this segment in fsp
+ fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS (FSP_EXTENT_SIZE / 2)
+ /* number of slots in the array for
+ the fragment pages */
+#define FSEG_FRAG_SLOT_SIZE 4 /* a fragment page slot contains its
+ page number within space, FIL_NULL
+ means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE \
+ (16 + 3 * FLST_BASE_NODE_SIZE \
+ + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+#define FSP_SEG_INODES_PER_PAGE(zip_size) \
+ (((zip_size ? zip_size : UNIV_PAGE_SIZE) \
+ - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
+ /* Number of segment inodes which fit on a
+ single page */
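+
+/* For an uncompressed 16 kB page this evaluates to 85 inodes per page,
+assuming the usual sizes: FSEG_INODE_SIZE is 16 + 3 * 16 + 32 * 4
+= 192 bytes, and (16384 - FSEG_ARR_OFFSET - 10) / 192 rounds down to 85. */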
+
+#define FSEG_MAGIC_N_VALUE 97937874
+
+#define FSEG_FILLFACTOR 8 /* If this value is x, then if
+ the number of unused but reserved
+ pages in a segment is less than
+ reserved pages * 1/x, and there are
+ at least FSEG_FRAG_LIMIT used pages,
+ then we allow a new empty extent to
+ be added to the segment in
+ fseg_alloc_free_page. Otherwise, we
+ use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT FSEG_FRAG_ARR_N_SLOTS
+ /* If the segment has >= this many
+ used pages, it may be expanded by
+ allocating extents to the segment;
+ until that only individual fragment
+ pages are allocated from the space */
+
+#define FSEG_FREE_LIST_LIMIT 40 /* If the reserved size of a segment
+ is at least this many extents, we
+ allow extents to be put to the free
+ list of the extent: at most
+ FSEG_FREE_LIST_MAX_LEN many */
+#define FSEG_FREE_LIST_MAX_LEN 4
+
+
+/* EXTENT DESCRIPTOR
+ =================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple versions to clean. */
+
+/*-------------------------------------*/
+#define XDES_ID 0 /* The identifier of the segment
+ to which this extent belongs */
+#define XDES_FLST_NODE 8 /* The list node data structure
+ for the descriptors */
+#define XDES_STATE (FLST_NODE_SIZE + 8)
+ /* contains state information
+ of the extent */
+#define XDES_BITMAP (FLST_NODE_SIZE + 12)
+ /* Descriptor bitmap of the pages
+ in the extent */
+/*-------------------------------------*/
+
+#define XDES_BITS_PER_PAGE 2 /* How many bits are there per page */
+#define XDES_FREE_BIT 0 /* Index of the bit which tells if
+ the page is free */
+#define XDES_CLEAN_BIT 1 /* NOTE: currently not used!
+ Index of the bit which tells if
+ there are old versions of tuples
+ on the page */
+/* States of a descriptor */
+#define XDES_FREE 1 /* extent is in free list of space */
+#define XDES_FREE_FRAG 2 /* extent is in free fragment list of
+ space */
+#define XDES_FULL_FRAG 3 /* extent is in full fragment list of
+ space */
+#define XDES_FSEG 4 /* extent belongs to a segment */
+
+/* File extent data structure size in bytes. */
+#define XDES_SIZE \
+ (XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
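+
+/* With FSP_EXTENT_SIZE = 64 pages and XDES_BITS_PER_PAGE = 2, the bitmap
+takes 16 bytes, so each extent descriptor occupies XDES_SIZE = 24 + 16
+= 40 bytes (XDES_BITMAP being 24 with a 12-byte FLST_NODE_SIZE). */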
+
+/* Offset of the descriptor array on a descriptor page */
+#define XDES_ARR_OFFSET (FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+#ifndef UNIV_HOTBACKUP
+/* Flag to indicate if we have printed the tablespace full error. */
+static ibool fsp_tbs_full_error_printed = FALSE;
+
+/**********************************************************************//**
+Returns an extent to the free list of a space. */
+static
+void
+fsp_free_extent(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset in the extent */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Frees an extent of a segment to the space free list. */
+static
+void
+fseg_free_extent(
+/*=============*/
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset in the extent */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how
+many pages are currently used.
+@return number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+ fseg_inode_t* header, /*!< in: segment inode */
+ ulint* used, /*!< out: number of pages used (not
+ more than reserved) */
+ mtr_t* mtr); /*!< in: mtr handle */
+/********************************************************************//**
+Marks a page used. The page must reside within the extents of the given
+segment. */
+static
+void
+fseg_mark_page_used(
+/*================*/
+ fseg_inode_t* seg_inode,/*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Returns the first extent descriptor for a segment. We think of the extent
+lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL
+-> FSEG_FREE.
+@return the first extent descriptor, or NULL if none */
+static
+xdes_t*
+fseg_get_first_extent(
+/*==================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Puts new extents to the free list if
+there are free extents above the free limit. If an extent happens
+to contain an extent descriptor page, the extent is put to
+the FSP_FREE_FRAG list with the page marked as used. */
+static
+void
+fsp_fill_free_list(
+/*===============*/
+ ibool init_space, /*!< in: TRUE if this is a single-table
+ tablespace and we are only initing
+ the tablespace's first extent
+ descriptor page and ibuf bitmap page;
+ then we do not allocate more extents */
+ ulint space, /*!< in: space */
+ fsp_header_t* header, /*!< in: space header */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return the allocated page number, FIL_NULL if no page could be allocated */
+static
+ulint
+fseg_alloc_free_page_low(
+/*=====================*/
+ ulint space, /*!< in: space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction, /*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr); /*!< in: mtr handle */
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+ page_t* page) /*!< in: header page (page 0 in the tablespace) */
+{
+ return(mach_read_from_4(page + FSP_HEADER_OFFSET + FSP_SIZE));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets a pointer to the space header and x-locks its page.
+@return pointer to the space header, page x-locked */
+UNIV_INLINE
+fsp_header_t*
+fsp_get_space_header(
+/*=================*/
+ ulint id, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ fsp_header_t* header;
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+ ut_ad(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+ ut_ad(id || !zip_size);
+
+ block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(0);
+ }
+ ut_a(block);
+
+ header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ ut_ad(id == mach_read_from_4(FSP_SPACE_ID + header));
+ ut_ad(zip_size == dict_table_flags_to_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + header)));
+ return(header);
+}
+
+/**********************************************************************//**
+Gets a descriptor bit of a page.
+@return TRUE if free */
+UNIV_INLINE
+ibool
+xdes_get_bit(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset, /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint index;
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
+ ut_ad(offset < FSP_EXTENT_SIZE);
+
+ index = bit + XDES_BITS_PER_PAGE * offset;
+
+ byte_index = index / 8;
+ bit_index = index % 8;
+
+ return(ut_bit_get_nth(mtr_read_ulint(descr + XDES_BITMAP + byte_index,
+ MLOG_1BYTE, mtr),
+ bit_index));
+}
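+
+/* The bitmap packs XDES_BITS_PER_PAGE (2) bits per page, so, for example,
+the XDES_FREE_BIT of page offset 5 within the extent lives at overall bit
+index 0 + 2 * 5 = 10, i.e. byte 1, bit 2 of the XDES_BITMAP array. */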
+
+/**********************************************************************//**
+Sets a descriptor bit of a page. */
+UNIV_INLINE
+void
+xdes_set_bit(
+/*=========*/
+ xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ulint offset, /*!< in: page offset within extent:
+ 0 ... FSP_EXTENT_SIZE - 1 */
+ ibool val, /*!< in: bit value */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint index;
+ ulint byte_index;
+ ulint bit_index;
+ ulint descr_byte;
+
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
+ ut_ad(offset < FSP_EXTENT_SIZE);
+
+ index = bit + XDES_BITS_PER_PAGE * offset;
+
+ byte_index = index / 8;
+ bit_index = index % 8;
+
+ descr_byte = mtr_read_ulint(descr + XDES_BITMAP + byte_index,
+ MLOG_1BYTE, mtr);
+ descr_byte = ut_bit_set_nth(descr_byte, bit_index, val);
+
+ mlog_write_ulint(descr + XDES_BITMAP + byte_index, descr_byte,
+ MLOG_1BYTE, mtr);
+}
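+
+/* Example of the bitmap layout used above, assuming the usual
+definitions XDES_FREE_BIT == 0, XDES_CLEAN_BIT == 1 and
+XDES_BITS_PER_PAGE == 2: for page offset 5 within the extent the free
+bit is at index == 0 + 2 * 5 == 10, i.e. byte 1, bit 2 of the
+XDES_BITMAP array, and the clean bit of the same page immediately
+follows at index 11. */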
+
+/**********************************************************************//**
+Looks for a descriptor bit having the desired value. Starts from hint
+and scans upward; at the end of the extent the search is wrapped to
+the start of the extent.
+@return bit index of the bit, ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+xdes_find_bit(
+/*==========*/
+ xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ibool val, /*!< in: desired bit value */
+ ulint hint, /*!< in: hint of which bit position would be desirable */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+
+ ut_ad(descr && mtr);
+ ut_ad(val <= TRUE);
+ ut_ad(hint < FSP_EXTENT_SIZE);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ for (i = hint; i < FSP_EXTENT_SIZE; i++) {
+ if (val == xdes_get_bit(descr, bit, i, mtr)) {
+
+ return(i);
+ }
+ }
+
+ for (i = 0; i < hint; i++) {
+ if (val == xdes_get_bit(descr, bit, i, mtr)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
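+
+/* For instance, with FSP_EXTENT_SIZE == 64 and hint == 60 the scan
+order above is 60, 61, 62, 63, then 0, 1, ..., 59.
+xdes_find_bit_downward() below instead scans from the hint toward the
+start of the extent first and then wraps to the top of the extent. */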
+
+/**********************************************************************//**
+Looks for a descriptor bit having the desired value. Scans the extent in
+a direction opposite to xdes_find_bit.
+@return bit index of the bit, ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+xdes_find_bit_downward(
+/*===================*/
+ xdes_t* descr, /*!< in: descriptor */
+ ulint bit, /*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+ ibool val, /*!< in: desired bit value */
+ ulint hint, /*!< in: hint of which bit position would be desirable */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+
+ ut_ad(descr && mtr);
+ ut_ad(val <= TRUE);
+ ut_ad(hint < FSP_EXTENT_SIZE);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ for (i = hint + 1; i > 0; i--) {
+ if (val == xdes_get_bit(descr, bit, i - 1, mtr)) {
+
+ return(i - 1);
+ }
+ }
+
+ for (i = FSP_EXTENT_SIZE - 1; i > hint; i--) {
+ if (val == xdes_get_bit(descr, bit, i, mtr)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Returns the number of used pages in a descriptor.
+@return number of pages used */
+UNIV_INLINE
+ulint
+xdes_get_n_used(
+/*============*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+ ulint count = 0;
+
+ ut_ad(descr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ for (i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**********************************************************************//**
+Returns true if extent contains no used pages.
+@return TRUE if totally free */
+UNIV_INLINE
+ibool
+xdes_is_free(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (0 == xdes_get_n_used(descr, mtr)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Returns true if extent contains no free pages.
+@return TRUE if full */
+UNIV_INLINE
+ibool
+xdes_is_full(
+/*=========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (FSP_EXTENT_SIZE == xdes_get_n_used(descr, mtr)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Sets the state of an xdes. */
+UNIV_INLINE
+void
+xdes_set_state(
+/*===========*/
+ xdes_t* descr, /*!< in/out: descriptor */
+ ulint state, /*!< in: state to set */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ut_ad(descr && mtr);
+ ut_ad(state >= XDES_FREE);
+ ut_ad(state <= XDES_FSEG);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+ mlog_write_ulint(descr + XDES_STATE, state, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes.
+@return state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+ const xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint state;
+
+ ut_ad(descr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+ state = mtr_read_ulint(descr + XDES_STATE, MLOG_4BYTES, mtr);
+ ut_ad(state - 1 < XDES_FSEG);
+ return(state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state. */
+UNIV_INLINE
+void
+xdes_init(
+/*======*/
+ xdes_t* descr, /*!< in: descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+
+ ut_ad(descr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0);
+
+ for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) {
+ mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr);
+ }
+
+ xdes_set_state(descr, XDES_FREE, mtr);
+}
+
+/********************************************************************//**
+Calculates the page where the descriptor of a page resides.
+@return descriptor page offset */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_page(
+/*======================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint offset) /*!< in: page offset */
+{
+#ifndef DOXYGEN /* Doxygen gets confused by these */
+# if PAGE_ZIP_MIN_SIZE <= XDES_ARR_OFFSET \
+ + (PAGE_ZIP_MIN_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE
+# error
+# endif
+#endif /* !DOXYGEN */
+ ut_a(UNIV_PAGE_SIZE > XDES_ARR_OFFSET + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE);
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(ut_2pow_round(offset, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(zip_size > XDES_ARR_OFFSET
+ + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+ return(ut_2pow_round(offset, zip_size));
+ }
+}
+
+/********************************************************************//**
+Calculates the descriptor index within a descriptor page.
+@return descriptor index */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint offset) /*!< in: page offset */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE)
+ / FSP_EXTENT_SIZE);
+ } else {
+ return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
+ }
+}
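+
+/* Worked example, assuming the default 16 kB UNIV_PAGE_SIZE (so that
+FSP_EXTENT_SIZE == 64 pages) and an uncompressed tablespace: the
+descriptor of page offset 20000 resides on descriptor page
+ut_2pow_round(20000, 16384) == 16384, in slot
+ut_2pow_remainder(20000, 16384) / 64 == 3616 / 64 == 56 of the
+descriptor array on that page. */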
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor of a page. The page where the extent
+descriptor resides is x-locked. If the page offset is equal to the free limit
+of the space, adds new extents from above the free limit to the space free
+list, unless the free limit equals the space size. This adding is necessary to
+make the descriptor defined, as the descriptors above the free limit are
+uninitialized.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds the free limit */
+UNIV_INLINE
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+/*===============================*/
+ fsp_header_t* sp_header,/*!< in/out: space header, x-latched */
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: page offset;
+ if equal to the free limit,
+ we try to add new extents to
+ the space free list */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint limit;
+ ulint size;
+ ulint zip_size;
+ ulint descr_page_no;
+ page_t* descr_page;
+
+ ut_ad(mtr);
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+ ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
+ /* Read free limit and space size */
+ limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
+ size = mach_read_from_4(sp_header + FSP_SIZE);
+ zip_size = dict_table_flags_to_zip_size(
+ mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
+
+ /* If offset is >= size or > limit, return NULL */
+
+ if ((offset >= size) || (offset > limit)) {
+
+ return(NULL);
+ }
+
+ /* If offset is == limit, fill free list of the space. */
+
+ if (offset == limit) {
+ fsp_fill_free_list(FALSE, space, sp_header, mtr);
+ }
+
+ descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+ if (descr_page_no == 0) {
+ /* It is on the space header page */
+
+ descr_page = page_align(sp_header);
+ } else {
+ buf_block_t* block;
+
+ block = buf_page_get(space, zip_size, descr_page_no,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ descr_page = buf_block_get_frame(block);
+ }
+
+ return(descr_page + XDES_ARR_OFFSET
+ + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset));
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor of a page. The page where the
+extent descriptor resides is x-locked. If the page offset is equal to
+the free limit of the space, adds new extents from above the free limit
+to the space free list, unless the free limit equals the space size.
+This adding is necessary to make the descriptor defined, as the
+descriptors above the free limit are uninitialized.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds the free limit */
+static
+xdes_t*
+xdes_get_descriptor(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page offset; if equal to the free limit,
+ we try to add new extents to the space free list */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ buf_block_t* block;
+ fsp_header_t* sp_header;
+
+ block = buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(0);
+ }
+ ut_a(block);
+
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ sp_header = FSP_HEADER_OFFSET + buf_block_get_frame(block);
+ return(xdes_get_descriptor_with_space_hdr(sp_header, space, offset,
+ mtr));
+}
+
+/********************************************************************//**
+Gets a pointer to the extent descriptor if the file address
+of the descriptor list node is known. The page where the
+extent descriptor resides is x-locked.
+@return pointer to the extent descriptor */
+UNIV_INLINE
+xdes_t*
+xdes_lst_get_descriptor(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t lst_node,/*!< in: file address of the list node
+ contained in the descriptor */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ xdes_t* descr;
+
+ ut_ad(mtr);
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+ descr = fut_get_ptr(space, zip_size, lst_node, RW_X_LATCH, mtr)
+ - XDES_FLST_NODE;
+
+ return(descr);
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+@return offset of the first page in extent */
+UNIV_INLINE
+ulint
+xdes_get_offset(
+/*============*/
+ xdes_t* descr) /*!< in: extent descriptor */
+{
+ ut_ad(descr);
+
+ return(page_get_page_no(page_align(descr))
+ + ((page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE)
+ * FSP_EXTENT_SIZE);
+}
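+
+/* Continuing the worked example above (default 16 kB pages): the
+descriptor in slot 56 of descriptor page 16384 describes the extent
+that starts at page 16384 + 56 * 64 == 19968, which indeed contains
+page 20000. */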
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored. */
+static
+void
+fsp_init_file_page_low(
+/*===================*/
+ buf_block_t* block) /*!< in: pointer to a page */
+{
+ page_t* page = buf_block_get_frame(block);
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+ block->check_index_page_at_flush = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ memset(page, 0, UNIV_PAGE_SIZE);
+ memset(page_zip->data, 0, page_zip_get_size(page_zip));
+ mach_write_to_4(page + FIL_PAGE_OFFSET,
+ buf_block_get_page_no(block));
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ buf_block_get_space(block));
+ memcpy(page_zip->data + FIL_PAGE_OFFSET,
+ page + FIL_PAGE_OFFSET, 4);
+ memcpy(page_zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+ return;
+ }
+
+ memset(page, 0, UNIV_PAGE_SIZE);
+ mach_write_to_4(page + FIL_PAGE_OFFSET, buf_block_get_page_no(block));
+ mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ buf_block_get_space(block));
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored. */
+static
+void
+fsp_init_file_page(
+/*===============*/
+ buf_block_t* block, /*!< in: pointer to a page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fsp_init_file_page_low(block);
+
+ mlog_write_initial_log_record(buf_block_get_frame(block),
+ MLOG_INIT_FILE_PAGE, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ buf_block_t* block) /*!< in: block or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (block) {
+ fsp_init_file_page_low(block);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Initializes the fsp system. */
+UNIV_INTERN
+void
+fsp_init(void)
+/*==========*/
+{
+ /* Does nothing at the moment */
+}
+
+/**********************************************************************//**
+Writes the space id and compressed page size to a tablespace header.
+This function is used, bypassing the buffer pool, when we create
+a new single-table tablespace in fil0fil.c. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags) /*!< in: tablespace flags (FSP_SPACE_FLAGS):
+ 0, or table->flags if newer than COMPACT */
+{
+ /* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+ ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+ ROW_FORMAT=REDUNDANT (table->flags == 0). For any other
+ format, the tablespace flags should equal table->flags. */
+ ut_a(flags != DICT_TF_COMPACT);
+
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page,
+ space_id);
+ mach_write_to_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page,
+ flags);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint size, /*!< in: current size in blocks */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fsp_header_t* header;
+ buf_block_t* block;
+ page_t* page;
+ ulint flags;
+ ulint zip_size;
+
+ ut_ad(mtr);
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+ zip_size = dict_table_flags_to_zip_size(flags);
+ block = buf_page_create(space, 0, zip_size, mtr);
+ buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ /* The prior contents of the file page should be ignored */
+
+ fsp_init_file_page(block, mtr);
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR,
+ MLOG_2BYTES, mtr);
+
+ header = FSP_HEADER_OFFSET + page;
+
+ mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_SPACE_FLAGS, flags,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_FRAG_N_USED, 0, MLOG_4BYTES, mtr);
+
+ flst_init(header + FSP_FREE, mtr);
+ flst_init(header + FSP_FREE_FRAG, mtr);
+ flst_init(header + FSP_FULL_FRAG, mtr);
+ flst_init(header + FSP_SEG_INODES_FULL, mtr);
+ flst_init(header + FSP_SEG_INODES_FREE, mtr);
+
+ mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), mtr);
+ if (space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE) {
+ fsp_fill_free_list(FALSE, space, header, mtr);
+ btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
+ space, 0, ut_dulint_add(DICT_IBUF_ID_MIN, space),
+ dict_ind_redundant, mtr);
+ } else {
+ fsp_fill_free_list(TRUE, space, header, mtr);
+ }
+}
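+
+/* Minimal usage sketch (illustrative only; "space_id" and
+"size_in_pages" are hypothetical variables and error handling is
+omitted):
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+	fsp_header_init(space_id, size_in_pages, &mtr);
+	mtr_commit(&mtr);
+*/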
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ const page_t* page) /*!< in: first page of a tablespace */
+{
+ ulint fsp_id;
+ ulint id;
+
+ fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
+
+ id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ if (id != fsp_id) {
+ fprintf(stderr,
+ "InnoDB: Error: space id in fsp header %lu,"
+ " but in the page header %lu\n",
+ (ulong) fsp_id, (ulong) id);
+
+ return(ULINT_UNDEFINED);
+ }
+
+ return(id);
+}
+
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+ const page_t* page) /*!< in: first page of a tablespace */
+{
+ ut_ad(!page_offset(page));
+
+ return(mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page));
+}
+
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+ const page_t* page) /*!< in: first page of a tablespace */
+{
+ ulint flags = fsp_header_get_flags(page);
+
+ return(dict_table_flags_to_zip_size(flags));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint size_inc,/*!< in: size increment in pages */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fsp_header_t* header;
+ ulint size;
+ ulint flags;
+
+ ut_ad(mtr);
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+ header = fsp_get_space_header(space,
+ dict_table_flags_to_zip_size(flags),
+ mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES,
+ mtr);
+}
+
+/**********************************************************************//**
+Gets the current free limit of the system tablespace. The free limit
+means the place of the first page which has never been put to the
+free list for allocation. The space above that address is initialized
+to zero. Also sets the global variable log_fsp_current_free_limit.
+@return free limit in megabytes */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void)
+/*===========================*/
+{
+ fsp_header_t* header;
+ ulint limit;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+ header = fsp_get_space_header(0, 0, &mtr);
+
+ limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, &mtr);
+
+ limit /= ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+ log_fsp_current_free_limit_set_and_checkpoint(limit);
+
+ mtr_commit(&mtr);
+
+ return(limit);
+}
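+
+/* Worked example of the conversion above: with the default 16 kB
+UNIV_PAGE_SIZE there are (1024 * 1024) / 16384 == 64 pages per
+megabyte, so a stored FSP_FREE_LIMIT of 6400 pages is returned as
+6400 / 64 == 100 megabytes. */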
+
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller.
+@return size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void)
+/*================================*/
+{
+ fsp_header_t* header;
+ ulint size;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+ header = fsp_get_space_header(0, 0, &mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+ mtr_commit(&mtr);
+
+ return(size);
+}
+
+/***********************************************************************//**
+Tries to extend a single-table tablespace so that a page would fit in the
+data file.
+@return TRUE if success */
+static
+ibool
+fsp_try_extend_data_file_with_pages(
+/*================================*/
+ ulint space, /*!< in: space */
+ ulint page_no, /*!< in: page number */
+ fsp_header_t* header, /*!< in: space header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool success;
+ ulint actual_size;
+ ulint size;
+
+ ut_a(space != 0);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ ut_a(page_no >= size);
+
+ success = fil_extend_space_to_desired_size(&actual_size, space,
+ page_no + 1);
+ /* actual_size now has the space size in pages; it may be less than
+ we wanted if we ran out of disk space */
+
+ mlog_write_ulint(header + FSP_SIZE, actual_size, MLOG_4BYTES, mtr);
+
+ return(success);
+}
+
+/***********************************************************************//**
+Tries to extend the last data file of a tablespace if it is auto-extending.
+@return FALSE if not auto-extending */
+static
+ibool
+fsp_try_extend_data_file(
+/*=====================*/
+ ulint* actual_increase,/*!< out: actual increase in pages, where
+ we measure the tablespace size from
+ what the header field says; it may be
+ the actual file size rounded down to
+ a full megabyte */
+ ulint space, /*!< in: space */
+ fsp_header_t* header, /*!< in: space header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint size;
+ ulint zip_size;
+ ulint new_size;
+ ulint old_size;
+ ulint size_increase;
+ ulint actual_size;
+ ibool success;
+
+ *actual_increase = 0;
+
+ if (space == 0 && !srv_auto_extend_last_data_file) {
+
+ /* We print the error message only once to avoid
+ spamming the error log. Note that we don't need
+ to reset the flag to FALSE as dealing with this
+ error requires server restart. */
+ if (fsp_tbs_full_error_printed == FALSE) {
+ fprintf(stderr,
+ "InnoDB: Error: Data file(s) ran"
+ " out of space.\n"
+ "Please add another data file or"
+ " use \'autoextend\' for the last"
+ " data file.\n");
+ fsp_tbs_full_error_printed = TRUE;
+ }
+ return(FALSE);
+ }
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ zip_size = dict_table_flags_to_zip_size(
+ mach_read_from_4(header + FSP_SPACE_FLAGS));
+
+ old_size = size;
+
+ if (space == 0) {
+ if (!srv_last_file_size_max) {
+ size_increase = SRV_AUTO_EXTEND_INCREMENT;
+ } else {
+ if (srv_last_file_size_max
+ < srv_data_file_sizes[srv_n_data_files - 1]) {
+
+ fprintf(stderr,
+ "InnoDB: Error: Last data file size"
+ " is %lu, max size allowed %lu\n",
+ (ulong) srv_data_file_sizes[
+ srv_n_data_files - 1],
+ (ulong) srv_last_file_size_max);
+ }
+
+ size_increase = srv_last_file_size_max
+ - srv_data_file_sizes[srv_n_data_files - 1];
+ if (size_increase > SRV_AUTO_EXTEND_INCREMENT) {
+ size_increase = SRV_AUTO_EXTEND_INCREMENT;
+ }
+ }
+ } else {
+ /* We extend single-table tablespaces first one extent
+ at a time, but bigger tablespaces by more. It is not
+ enough to always extend by one extent, because some
+ extents are frag page extents. */
+ ulint extent_size; /*!< one megabyte, in pages */
+
+ if (!zip_size) {
+ extent_size = FSP_EXTENT_SIZE;
+ } else {
+ extent_size = FSP_EXTENT_SIZE
+ * UNIV_PAGE_SIZE / zip_size;
+ }
+
+ if (size < extent_size) {
+ /* Let us first extend the file to extent_size */
+ success = fsp_try_extend_data_file_with_pages(
+ space, extent_size - 1, header, mtr);
+ if (!success) {
+ new_size = mtr_read_ulint(header + FSP_SIZE,
+ MLOG_4BYTES, mtr);
+
+ *actual_increase = new_size - old_size;
+
+ return(FALSE);
+ }
+
+ size = extent_size;
+ }
+
+ if (size < 32 * extent_size) {
+ size_increase = extent_size;
+ } else {
+ /* Below in fsp_fill_free_list() we assume
+ that we add at most FSP_FREE_ADD extents at
+ a time */
+ size_increase = FSP_FREE_ADD * extent_size;
+ }
+ }
+
+ if (size_increase == 0) {
+
+ return(TRUE);
+ }
+
+ success = fil_extend_space_to_desired_size(&actual_size, space,
+ size + size_increase);
+ /* We ignore any fragments of a full megabyte when storing the size
+ to the space header */
+
+ if (!zip_size) {
+ new_size = ut_calc_align_down(actual_size,
+ (1024 * 1024) / UNIV_PAGE_SIZE);
+ } else {
+ new_size = ut_calc_align_down(actual_size,
+ (1024 * 1024) / zip_size);
+ }
+ mlog_write_ulint(header + FSP_SIZE, new_size, MLOG_4BYTES, mtr);
+
+ *actual_increase = new_size - old_size;
+
+ return(TRUE);
+}
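+
+/* Illustrative growth schedule for an uncompressed .ibd file,
+assuming the default 16 kB UNIV_PAGE_SIZE (extent_size == 64 pages
+== 1 MB) and FSP_FREE_ADD == 4: below one extent the file is first
+extended page by page up to 64 pages; from there up to 32 * 64
+== 2048 pages it grows one extent (1 MB) at a time; beyond that it
+grows FSP_FREE_ADD extents (4 MB) at a time. */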
+
+/**********************************************************************//**
+Puts new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used. */
+static
+void
+fsp_fill_free_list(
+/*===============*/
+ ibool init_space, /*!< in: TRUE if this is a single-table
+ tablespace and we are only initing
+ the tablespace's first extent
+ descriptor page and ibuf bitmap page;
+ then we do not allocate more extents */
+ ulint space, /*!< in: space */
+ fsp_header_t* header, /*!< in/out: space header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint limit;
+ ulint size;
+ ulint zip_size;
+ xdes_t* descr;
+ ulint count = 0;
+ ulint frag_n_used;
+ ulint actual_increase;
+ ulint i;
+ mtr_t ibuf_mtr;
+
+ ut_ad(header && mtr);
+ ut_ad(page_offset(header) == FSP_HEADER_OFFSET);
+
+ /* Check if we can fill free list from above the free list limit */
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
+
+ zip_size = dict_table_flags_to_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + header));
+ ut_a(ut_is_2pow(zip_size));
+ ut_a(zip_size <= UNIV_PAGE_SIZE);
+ ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+
+ if (space == 0 && srv_auto_extend_last_data_file
+ && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+ /* Try to increase the last data file size */
+ fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ }
+
+ if (space != 0 && !init_space
+ && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+ /* Try to increase the .ibd file size */
+ fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+ }
+
+ i = limit;
+
+ while ((init_space && i < 1)
+ || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) {
+
+ ibool init_xdes;
+ if (zip_size) {
+ init_xdes = ut_2pow_remainder(i, zip_size) == 0;
+ } else {
+ init_xdes = ut_2pow_remainder(i, UNIV_PAGE_SIZE) == 0;
+ }
+
+ mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
+ MLOG_4BYTES, mtr);
+
+ /* Update the free limit info in the log system and make
+ a checkpoint */
+ if (space == 0) {
+ ut_a(!zip_size);
+ log_fsp_current_free_limit_set_and_checkpoint(
+ (i + FSP_EXTENT_SIZE)
+ / ((1024 * 1024) / UNIV_PAGE_SIZE));
+ }
+
+ if (UNIV_UNLIKELY(init_xdes)) {
+
+ buf_block_t* block;
+
+ /* We are going to initialize a new descriptor page
+ and a new ibuf bitmap page: the prior contents of the
+ pages should be ignored. */
+
+ if (i > 0) {
+ block = buf_page_create(
+ space, i, zip_size, mtr);
+ buf_page_get(space, zip_size, i,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block,
+ SYNC_FSP_PAGE);
+
+ fsp_init_file_page(block, mtr);
+ mlog_write_ulint(buf_block_get_frame(block)
+ + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_XDES,
+ MLOG_2BYTES, mtr);
+ }
+
+ /* Initialize the ibuf bitmap page in a separate
+ mini-transaction because it is low in the latching
+ order, and we must be able to release its latch
+ before returning from the fsp routine */
+
+ mtr_start(&ibuf_mtr);
+
+ block = buf_page_create(space,
+ i + FSP_IBUF_BITMAP_OFFSET,
+ zip_size, &ibuf_mtr);
+ buf_page_get(space, zip_size,
+ i + FSP_IBUF_BITMAP_OFFSET,
+ RW_X_LATCH, &ibuf_mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ fsp_init_file_page(block, &ibuf_mtr);
+
+ ibuf_bitmap_page_init(block, &ibuf_mtr);
+
+ mtr_commit(&ibuf_mtr);
+ }
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, i,
+ mtr);
+ xdes_init(descr, mtr);
+
+//#if UNIV_PAGE_SIZE % FSP_EXTENT_SIZE
+//# error "UNIV_PAGE_SIZE % FSP_EXTENT_SIZE != 0"
+//#endif
+//#if PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE
+//# error "PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE != 0"
+//#endif
+
+ if (UNIV_UNLIKELY(init_xdes)) {
+
+ /* The first page in the extent is a descriptor page
+ and the second is an ibuf bitmap page: mark them
+ used */
+
+ xdes_set_bit(descr, XDES_FREE_BIT, 0, FALSE, mtr);
+ xdes_set_bit(descr, XDES_FREE_BIT,
+ FSP_IBUF_BITMAP_OFFSET, FALSE, mtr);
+ xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+
+ flst_add_last(header + FSP_FREE_FRAG,
+ descr + XDES_FLST_NODE, mtr);
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED,
+ MLOG_4BYTES, mtr);
+ mlog_write_ulint(header + FSP_FRAG_N_USED,
+ frag_n_used + 2, MLOG_4BYTES, mtr);
+ } else {
+ flst_add_last(header + FSP_FREE,
+ descr + XDES_FLST_NODE, mtr);
+ count++;
+ }
+
+ i += FSP_EXTENT_SIZE;
+ }
+}
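+
+/* For example, with the default 16 kB UNIV_PAGE_SIZE the loop above
+hits an init_xdes boundary every 16384 pages: at such a boundary i the
+descriptor page i (except i == 0, where the space header page doubles
+as the descriptor page) and the ibuf bitmap page at
+i + FSP_IBUF_BITMAP_OFFSET are initialized and marked used, so that
+extent is placed on FSP_FREE_FRAG instead of FSP_FREE. */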
+
+/**********************************************************************//**
+Allocates a new free extent.
+@return extent descriptor, NULL if cannot be allocated */
+static
+xdes_t*
+fsp_alloc_free_extent(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hint, /*!< in: hint of which extent would be desirable: any
+ page offset in the extent goes; the hint must not
+ be > FSP_FREE_LIMIT */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fsp_header_t* header;
+ fil_addr_t first;
+ xdes_t* descr;
+
+ ut_ad(mtr);
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+ if (descr && (xdes_get_state(descr, mtr) == XDES_FREE)) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Take the first extent in the free list */
+ first = flst_get_first(header + FSP_FREE, mtr);
+
+ if (fil_addr_is_null(first)) {
+ fsp_fill_free_list(FALSE, space, header, mtr);
+
+ first = flst_get_first(header + FSP_FREE, mtr);
+ }
+
+ if (fil_addr_is_null(first)) {
+
+ return(NULL); /* No free extents left */
+ }
+
+ descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+ }
+
+ flst_remove(header + FSP_FREE, descr + XDES_FLST_NODE, mtr);
+
+ return(descr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a space. The page is marked as used.
+@return the page offset, FIL_NULL if no page could be allocated */
+static
+ulint
+fsp_alloc_free_page(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ fsp_header_t* header;
+ fil_addr_t first;
+ xdes_t* descr;
+ buf_block_t* block;
+ ulint free;
+ ulint frag_n_used;
+ ulint page_no;
+ ulint space_size;
+ ibool success;
+
+ ut_ad(mtr);
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ /* Get the hinted descriptor */
+ descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+ if (descr && (xdes_get_state(descr, mtr) == XDES_FREE_FRAG)) {
+ /* Ok, we can take this extent */
+ } else {
+ /* Else take the first extent in free_frag list */
+ first = flst_get_first(header + FSP_FREE_FRAG, mtr);
+
+ if (fil_addr_is_null(first)) {
+ /* There are no partially full fragments: allocate
+ a free extent and add it to the FREE_FRAG list. NOTE
+ that the allocation may have as a side-effect that an
+ extent containing a descriptor page is added to the
+ FREE_FRAG list. But we will allocate our page from the
+ free extent anyway. */
+
+ descr = fsp_alloc_free_extent(space, zip_size,
+ hint, mtr);
+
+ if (descr == NULL) {
+ /* No free space left */
+
+ return(FIL_NULL);
+ }
+
+ xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+ flst_add_last(header + FSP_FREE_FRAG,
+ descr + XDES_FLST_NODE, mtr);
+ } else {
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ first, mtr);
+ }
+
+ /* Reset the hint */
+ hint = 0;
+ }
+
+ /* Now we have in descr an extent with at least one free page. Look
+ for a free page in the extent. */
+
+ free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
+ hint % FSP_EXTENT_SIZE, mtr);
+ if (free == ULINT_UNDEFINED) {
+
+ ut_print_buf(stderr, ((byte*)descr) - 500, 1000);
+ putc('\n', stderr);
+
+ ut_error;
+ }
+
+ page_no = xdes_get_offset(descr) + free;
+
+ space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ if (space_size <= page_no) {
+ /* It must be that we are extending a single-table tablespace
+ whose size is still < 64 pages */
+
+ ut_a(space != 0);
+ if (page_no >= FSP_EXTENT_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to extend a"
+ " single-table tablespace %lu\n"
+ "InnoDB: by single page(s) though the"
+ " space size %lu. Page no %lu.\n",
+ (ulong) space, (ulong) space_size,
+ (ulong) page_no);
+ return(FIL_NULL);
+ }
+ success = fsp_try_extend_data_file_with_pages(space, page_no,
+ header, mtr);
+ if (!success) {
+ /* No disk space left */
+ return(FIL_NULL);
+ }
+ }
+
+ xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
+
+ /* Update the FRAG_N_USED field */
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+ mtr);
+ frag_n_used++;
+ mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
+ mtr);
+ if (xdes_is_full(descr, mtr)) {
+ /* The fragment is full: move it to another list */
+ flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ xdes_set_state(descr, XDES_FULL_FRAG, mtr);
+
+ flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ mlog_write_ulint(header + FSP_FRAG_N_USED,
+ frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
+ mtr);
+ }
+
+ /* Initialize the allocated page to the buffer pool, so that it can
+ be obtained immediately with buf_page_get without need for a disk
+ read. */
+
+ buf_page_create(space, page_no, zip_size, mtr);
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ /* Prior contents of the page should be ignored */
+ fsp_init_file_page(block, mtr);
+
+ return(page_no);
+}
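+
+/* Note on the FSP_FRAG_N_USED bookkeeping above: the counter only
+covers extents on the FSP_FREE_FRAG list, so once an extent fills up
+and moves to FSP_FULL_FRAG its FSP_EXTENT_SIZE used pages are
+subtracted from the counter again; fsp_free_page() below performs the
+inverse adjustment. */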
+
+/**********************************************************************//**
+Frees a single page of a space. The page is marked as free and clean. */
+static
+void
+fsp_free_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ fsp_header_t* header;
+ xdes_t* descr;
+ ulint state;
+ ulint frag_n_used;
+
+ ut_ad(mtr);
+
+ /* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
+
+ state = xdes_get_state(descr, mtr);
+
+ if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) {
+ fprintf(stderr,
+ "InnoDB: Error: File space extent descriptor"
+ " of page %lu has state %lu\n",
+ (ulong) page,
+ (ulong) state);
+ fputs("InnoDB: Dump of descriptor: ", stderr);
+ ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+ putc('\n', stderr);
+
+ if (state == XDES_FREE) {
+ /* We put here some fault tolerance: if the page
+ is already free, return without doing anything! */
+
+ return;
+ }
+
+ ut_error;
+ }
+
+ if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) {
+ fprintf(stderr,
+ "InnoDB: Error: File space extent descriptor"
+ " of page %lu says it is free\n"
+ "InnoDB: Dump of descriptor: ", (ulong) page);
+ ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+ putc('\n', stderr);
+
+ /* We put here some fault tolerance: if the page
+ is already free, return without doing anything! */
+
+ return;
+ }
+
+ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+ xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+ mtr);
+ if (state == XDES_FULL_FRAG) {
+ /* The fragment was full: move it to another list */
+ flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+ flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ mlog_write_ulint(header + FSP_FRAG_N_USED,
+ frag_n_used + FSP_EXTENT_SIZE - 1,
+ MLOG_4BYTES, mtr);
+ } else {
+ ut_a(frag_n_used > 0);
+ mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used - 1,
+ MLOG_4BYTES, mtr);
+ }
+
+ if (xdes_is_free(descr, mtr)) {
+ /* The extent has become free: move it to another list */
+ flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+ mtr);
+ fsp_free_extent(space, zip_size, page, mtr);
+ }
+}
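+
+/* Note on the FSP_FRAG_N_USED arithmetic above: when a page is freed
+out of a formerly full extent, the extent re-enters FSP_FREE_FRAG with
+FSP_EXTENT_SIZE - 1 pages still in use, hence the counter grows by
+that amount; freeing a page from an extent already on FSP_FREE_FRAG
+simply decrements the counter by one. */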
+
+/**********************************************************************//**
+Returns an extent to the free list of a space. */
+static
+void
+fsp_free_extent(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset in the extent */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fsp_header_t* header;
+ xdes_t* descr;
+
+ ut_ad(mtr);
+
+ header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
+
+ if (xdes_get_state(descr, mtr) == XDES_FREE) {
+
+ ut_print_buf(stderr, (byte*)descr - 500, 1000);
+ putc('\n', stderr);
+
+ ut_error;
+ }
+
+ xdes_init(descr, mtr);
+
+ flst_add_last(header + FSP_FREE, descr + XDES_FLST_NODE, mtr);
+}
+
+/**********************************************************************//**
+Returns the nth inode slot on an inode page.
+@return segment inode */
+UNIV_INLINE
+fseg_inode_t*
+fsp_seg_inode_page_get_nth_inode(
+/*=============================*/
+ page_t* page, /*!< in: segment inode page */
+ ulint i, /*!< in: inode index on page */
+ ulint zip_size __attribute__((unused)),
+ /*!< in: compressed page size, or 0 */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size));
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+
+ return(page + FSEG_ARR_OFFSET + FSEG_INODE_SIZE * i);
+}
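+
+/* With the default 16 kB UNIV_PAGE_SIZE and the usual 192-byte
+FSEG_INODE_SIZE, FSP_SEG_INODES_PER_PAGE(0) evaluates to roughly 85,
+i.e. one inode page holds the inodes of about 85 file segments. */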
+
+/**********************************************************************//**
+Looks for a used segment inode on a segment inode page.
+@return segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(
+/*=========================*/
+ page_t* page, /*!< in: segment inode page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint i;
+ fseg_inode_t* inode;
+
+ for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+
+ inode = fsp_seg_inode_page_get_nth_inode(
+ page, i, zip_size, mtr);
+
+ if (!ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))) {
+ /* This is used */
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Looks for an unused segment inode on a segment inode page.
+@return segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_free(
+/*=========================*/
+ page_t* page, /*!< in: segment inode page */
+ ulint i, /*!< in: search forward starting from this index */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fseg_inode_t* inode;
+
+ if (srv_pass_corrupt_table && !page) {
+ return(ULINT_UNDEFINED);
+ }
+ ut_a(page);
+
+ for (; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+
+ inode = fsp_seg_inode_page_get_nth_inode(
+ page, i, zip_size, mtr);
+
+ if (ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))) {
+ /* This is unused */
+
+ return(i);
+ }
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Allocates a new file segment inode page.
+@return TRUE if could be allocated */
+static
+ibool
+fsp_alloc_seg_inode_page(
+/*=====================*/
+ fsp_header_t* space_header, /*!< in: space header */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fseg_inode_t* inode;
+ buf_block_t* block;
+ page_t* page;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint i;
+
+ ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
+
+ space = page_get_space_id(page_align(space_header));
+ zip_size = dict_table_flags_to_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+
+ page_no = fsp_alloc_free_page(space, zip_size, 0, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ block->check_index_page_at_flush = FALSE;
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_INODE,
+ MLOG_2BYTES, mtr);
+
+ for (i = 0; i < FSP_SEG_INODES_PER_PAGE(zip_size); i++) {
+
+ inode = fsp_seg_inode_page_get_nth_inode(page, i,
+ zip_size, mtr);
+
+ mlog_write_dulint(inode + FSEG_ID, ut_dulint_zero, mtr);
+ }
+
+ flst_add_last(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Allocates a new file segment inode.
+@return segment inode, or NULL if not enough space */
+static
+fseg_inode_t*
+fsp_alloc_seg_inode(
+/*================*/
+ fsp_header_t* space_header, /*!< in: space header */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint page_no;
+ buf_block_t* block;
+ page_t* page;
+ fseg_inode_t* inode;
+ ibool success;
+ ulint zip_size;
+ ulint n;
+
+ ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
+
+ if (flst_get_len(space_header + FSP_SEG_INODES_FREE, mtr) == 0) {
+ /* Allocate a new segment inode page */
+
+ success = fsp_alloc_seg_inode_page(space_header, mtr);
+
+ if (!success) {
+
+ return(NULL);
+ }
+ }
+
+ page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE, mtr).page;
+
+ zip_size = dict_table_flags_to_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+ block = buf_page_get(page_get_space_id(page_align(space_header)),
+ zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ page = buf_block_get_frame(block);
+
+ if (srv_pass_corrupt_table && !page) {
+ return(0);
+ }
+ ut_a(page);
+
+ n = fsp_seg_inode_page_find_free(page, 0, zip_size, mtr);
+
+ ut_a(n != ULINT_UNDEFINED);
+
+ inode = fsp_seg_inode_page_get_nth_inode(page, n, zip_size, mtr);
+
+ if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1,
+ zip_size, mtr)) {
+ /* There are no other unused headers left on the page: move it
+ to another list */
+
+ flst_remove(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ flst_add_last(space_header + FSP_SEG_INODES_FULL,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+ }
+
+ ut_ad(ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))
+ || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ return(inode);
+}
+
+/**********************************************************************//**
+Frees a file segment inode. */
+static
+void
+fsp_free_seg_inode(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ page_t* page;
+ fsp_header_t* space_header;
+
+ page = page_align(inode);
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ if (ULINT_UNDEFINED
+ == fsp_seg_inode_page_find_free(page, 0, zip_size, mtr)) {
+
+ /* Move the page to another list */
+
+ flst_remove(space_header + FSP_SEG_INODES_FULL,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ flst_add_last(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+ }
+
+ mlog_write_dulint(inode + FSEG_ID, ut_dulint_zero, mtr);
+ mlog_write_ulint(inode + FSEG_MAGIC_N, 0xfa051ce3, MLOG_4BYTES, mtr);
+
+ if (ULINT_UNDEFINED
+ == fsp_seg_inode_page_find_used(page, zip_size, mtr)) {
+
+ /* There are no other used headers left on the page: free it */
+
+ flst_remove(space_header + FSP_SEG_INODES_FREE,
+ page + FSEG_INODE_PAGE_NODE, mtr);
+
+ fsp_free_page(space, zip_size, page_get_page_no(page), mtr);
+ }
+}
+
+/**********************************************************************//**
+Returns the file segment inode, page x-latched.
+@return segment inode, page x-latched; NULL if the inode is free */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+/*===============*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ fil_addr_t inode_addr;
+ fseg_inode_t* inode;
+
+ inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
+ inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET);
+ ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE));
+
+ inode = fut_get_ptr(space, zip_size, inode_addr, RW_X_LATCH, mtr);
+
+ if (srv_pass_corrupt_table && !inode) {
+ return(0);
+ }
+ ut_a(inode);
+
+ if (UNIV_UNLIKELY
+ (ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID)))) {
+
+ inode = NULL;
+ } else {
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ }
+
+ return(inode);
+}
+
+/**********************************************************************//**
+Returns the file segment inode, page x-latched.
+@return segment inode, page x-latched */
+static
+fseg_inode_t*
+fseg_inode_get(
+/*===========*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ fseg_inode_t* inode
+ = fseg_inode_try_get(header, space, zip_size, mtr);
+ ut_a(srv_pass_corrupt_table || inode);
+ return(inode);
+}
+
+/**********************************************************************//**
+Gets the page number from the nth fragment page slot.
+@return page number, FIL_NULL if not in use */
+UNIV_INLINE
+ulint
+fseg_get_nth_frag_page_no(
+/*======================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint n, /*!< in: slot index */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mtr handle */
+{
+ ut_ad(inode && mtr);
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+ return(mach_read_from_4(inode + FSEG_FRAG_ARR
+ + n * FSEG_FRAG_SLOT_SIZE));
+}
+
+/**********************************************************************//**
+Sets the page number in the nth fragment page slot. */
+UNIV_INLINE
+void
+fseg_set_nth_frag_page_no(
+/*======================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint n, /*!< in: slot index */
+ ulint page_no,/*!< in: page number to set */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ut_ad(inode && mtr);
+ ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ mlog_write_ulint(inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE,
+ page_no, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is free.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint i;
+ ulint page_no;
+
+ ut_ad(inode && mtr);
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(inode, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Finds a fragment page slot which is used and last in the array.
+@return slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint i;
+ ulint page_no;
+
+ ut_ad(inode && mtr);
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ page_no = fseg_get_nth_frag_page_no(
+ inode, FSEG_FRAG_ARR_N_SLOTS - i - 1, mtr);
+
+ if (page_no != FIL_NULL) {
+
+ return(FSEG_FRAG_ARR_N_SLOTS - i - 1);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Calculates reserved fragment page slots.
+@return number of fragment pages */
+static
+ulint
+fseg_get_n_frag_pages(
+/*==================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint i;
+ ulint count = 0;
+
+ ut_ad(inode && mtr);
+
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ if (FIL_NULL != fseg_get_nth_frag_page_no(inode, i, mtr)) {
+ count++;
+ }
+ }
+
+ return(count);
+}
+
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, or NULL
+if the segment could not be created because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment;
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+ the inode and the other for the segment) then there is
+ no need to do the check for this individual
+ operation */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint flags;
+ ulint zip_size;
+ fsp_header_t* space_header;
+ fseg_inode_t* inode;
+ dulint seg_id;
+ buf_block_t* block = 0; /* remove warning */
+ fseg_header_t* header = 0; /* remove warning */
+ rw_lock_t* latch;
+ ibool success;
+ ulint n_reserved;
+ ulint i;
+
+ ut_ad(mtr);
+ ut_ad(byte_offset + FSEG_HEADER_SIZE
+ <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ if (page != 0) {
+ block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
+ header = byte_offset + buf_block_get_frame(block);
+ }
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ if (rw_lock_get_x_lock_count(latch) == 1) {
+ /* This thread did not own the latch before this call: free
+ excess pages from the insert buffer free list */
+
+ if (space == IBUF_SPACE_ID) {
+ ibuf_free_excess_pages();
+ }
+ }
+
+ if (!has_done_reservation) {
+ success = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (!success) {
+ return(NULL);
+ }
+ }
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+
+ inode = fsp_alloc_seg_inode(space_header, mtr);
+
+ if (inode == NULL) {
+
+ goto funct_exit;
+ }
+
+ /* Read the next segment id from space header and increment the
+ value in space header */
+
+ seg_id = mtr_read_dulint(space_header + FSP_SEG_ID, mtr);
+
+ mlog_write_dulint(space_header + FSP_SEG_ID, ut_dulint_add(seg_id, 1),
+ mtr);
+
+ mlog_write_dulint(inode + FSEG_ID, seg_id, mtr);
+ mlog_write_ulint(inode + FSEG_NOT_FULL_N_USED, 0, MLOG_4BYTES, mtr);
+
+ flst_init(inode + FSEG_FREE, mtr);
+ flst_init(inode + FSEG_NOT_FULL, mtr);
+ flst_init(inode + FSEG_FULL, mtr);
+
+ mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE,
+ MLOG_4BYTES, mtr);
+ for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+ fseg_set_nth_frag_page_no(inode, i, FIL_NULL, mtr);
+ }
+
+ if (page == 0) {
+ page = fseg_alloc_free_page_low(space, zip_size,
+ inode, 0, FSP_UP, mtr);
+
+ if (page == FIL_NULL) {
+
+ fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+ goto funct_exit;
+ }
+
+ block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
+ header = byte_offset + buf_block_get_frame(block);
+ mlog_write_ulint(header - byte_offset + FIL_PAGE_TYPE,
+ FIL_PAGE_TYPE_SYS, MLOG_2BYTES, mtr);
+ }
+
+ mlog_write_ulint(header + FSEG_HDR_OFFSET,
+ page_offset(inode), MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(header + FSEG_HDR_PAGE_NO,
+ page_get_page_no(page_align(inode)),
+ MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(header + FSEG_HDR_SPACE, space, MLOG_4BYTES, mtr);
+
+funct_exit:
+ if (!has_done_reservation) {
+
+ fil_space_release_free_extents(space, n_reserved);
+ }
+
+ return(block);
+}
+
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, or NULL
+if the segment could not be created because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment;
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ return(fseg_create_general(space, page, byte_offset, FALSE, mtr));
+}
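+
+/* Minimal usage sketch (illustrative only; "space_id" is a
+hypothetical variable, the byte offset FIL_PAGE_DATA is an arbitrary
+example, and the caller must check for a NULL return):
+
+	mtr_t		mtr;
+	buf_block_t*	block;
+
+	mtr_start(&mtr);
+	block = fseg_create(space_id, 0, FIL_PAGE_DATA, &mtr);
+	mtr_commit(&mtr);
+*/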
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint* used, /*!< out: number of pages used (not
+ more than reserved) */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint ret;
+
+ ut_ad(inode && used && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+
+ *used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr)
+ + fseg_get_n_frag_pages(inode, mtr);
+
+ ret = fseg_get_n_frag_pages(inode, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FREE, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_NOT_FULL, mtr)
+ + FSP_EXTENT_SIZE * flst_get_len(inode + FSEG_FULL, mtr);
+
+ return(ret);
+}
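+
+/* Worked example, with FSP_EXTENT_SIZE == 64: a segment owning 3
+fragment pages, one extent on FSEG_FULL, one extent on FSEG_NOT_FULL
+with 10 of its pages in use and nothing on FSEG_FREE reports
+*used == 10 + 64 + 3 == 77 and returns 3 + 0 + 64 + 64 == 131
+reserved pages. */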
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint ret;
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ rw_lock_t* latch;
+
+ space = page_get_space_id(page_align(header));
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ ret = fseg_n_reserved_pages_low(inode, used, mtr);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Tries to fill the free list of a segment with consecutive free extents.
+This happens if the segment is big enough to allow extents in the free list,
+the free list is empty, and the extents can be allocated consecutively from
+the hint onward. */
+static
+void
+fseg_fill_free_list(
+/*================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hint, /*!< in: hint which extent would be good as
+ the first extent */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ xdes_t* descr;
+ ulint i;
+ dulint seg_id;
+ ulint reserved;
+ ulint used;
+
+ ut_ad(inode && mtr);
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+
+ if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE) {
+
+ /* The segment is too small to allow extents in the free list */
+
+ return;
+ }
+
+ if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+ /* Free list is not empty */
+
+ return;
+ }
+
+ for (i = 0; i < FSEG_FREE_LIST_MAX_LEN; i++) {
+ descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+
+ if ((descr == NULL)
+ || (XDES_FREE != xdes_get_state(descr, mtr))) {
+
+ /* We cannot allocate the desired extent: stop */
+
+ return;
+ }
+
+ descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+ xdes_set_state(descr, XDES_FSEG, mtr);
+
+ seg_id = mtr_read_dulint(inode + FSEG_ID, mtr);
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ mlog_write_dulint(descr + XDES_ID, seg_id, mtr);
+
+ flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+ hint += FSP_EXTENT_SIZE;
+ }
+}
+
+/*********************************************************************//**
+Allocates a free extent for the segment: looks first in the free list of the
+segment, then tries to allocate from the space free list. NOTE that the extent
+returned still resides in the segment free list; it is not yet taken off it!
+@return allocated extent, still placed in the segment free list, or NULL
+if it could not be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+/*===================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ xdes_t* descr;
+ dulint seg_id;
+ fil_addr_t first;
+
+ ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+ /* Segment free list is not empty, allocate from it */
+
+ first = flst_get_first(inode + FSEG_FREE, mtr);
+
+ descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+ } else {
+ /* Segment free list was empty, allocate from space */
+ descr = fsp_alloc_free_extent(space, zip_size, 0, mtr);
+
+ if (descr == NULL) {
+
+ return(NULL);
+ }
+
+ seg_id = mtr_read_dulint(inode + FSEG_ID, mtr);
+
+ xdes_set_state(descr, XDES_FSEG, mtr);
+ mlog_write_dulint(descr + XDES_ID, seg_id, mtr);
+ flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+
+ /* Try to fill the segment free list */
+ fseg_fill_free_list(inode, space, zip_size,
+ xdes_get_offset(descr) + FSP_EXTENT_SIZE,
+ mtr);
+ }
+
+ return(descr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return the allocated page number, FIL_NULL if no page could be allocated */
+static
+ulint
+fseg_alloc_free_page_low(
+/*=====================*/
+ ulint space, /*!< in: space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction, /*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ fsp_header_t* space_header;
+ ulint space_size;
+ dulint seg_id;
+ ulint used;
+ ulint reserved;
+ xdes_t* descr; /*!< extent of the hinted page */
+ ulint ret_page; /*!< the allocated page offset, FIL_NULL
+ if could not be allocated */
+ xdes_t* ret_descr; /*!< the extent of the allocated page */
+ ibool frag_page_allocated = FALSE;
+ ibool success;
+ ulint n;
+
+ ut_ad(mtr);
+ ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr);
+
+ ut_ad(!ut_dulint_is_zero(seg_id));
+
+ reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr);
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+
+ descr = xdes_get_descriptor_with_space_hdr(space_header, space,
+ hint, mtr);
+ if (descr == NULL) {
+ /* Hint outside space or too high above free limit: reset
+ hint */
+ hint = 0;
+ descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+ }
+
+ /* In the big if-else below we look for ret_page and ret_descr */
+ /*-------------------------------------------------------------*/
+ if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+ && (0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID,
+ mtr), seg_id))
+ && (xdes_get_bit(descr, XDES_FREE_BIT,
+ hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
+
+ /* 1. We can take the hinted page
+ =================================*/
+ ret_descr = descr;
+ ret_page = hint;
+ /*-----------------------------------------------------------*/
+ } else if ((xdes_get_state(descr, mtr) == XDES_FREE)
+ && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+ && (used >= FSEG_FRAG_LIMIT)) {
+
+ /* 2. We allocate the free extent from space and can take
+ =========================================================
+ the hinted page
+ ===============*/
+ ret_descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+ ut_a(ret_descr == descr);
+
+ xdes_set_state(ret_descr, XDES_FSEG, mtr);
+ mlog_write_dulint(ret_descr + XDES_ID, seg_id, mtr);
+ flst_add_last(seg_inode + FSEG_FREE,
+ ret_descr + XDES_FLST_NODE, mtr);
+
+ /* Try to fill the segment free list */
+ fseg_fill_free_list(seg_inode, space, zip_size,
+ hint + FSP_EXTENT_SIZE, mtr);
+ ret_page = hint;
+ /*-----------------------------------------------------------*/
+ } else if ((direction != FSP_NO_DIR)
+ && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+ && (used >= FSEG_FRAG_LIMIT)
+ && (!!(ret_descr
+ = fseg_alloc_free_extent(seg_inode,
+ space, zip_size, mtr)))) {
+
+ /* 3. We take any free extent (which was already assigned above
+ ===============================================================
+ in the if-condition to ret_descr) and take the lowest or
+ ========================================================
+ highest page in it, depending on the direction
+ ==============================================*/
+ ret_page = xdes_get_offset(ret_descr);
+
+ if (direction == FSP_DOWN) {
+ ret_page += FSP_EXTENT_SIZE - 1;
+ }
+ /*-----------------------------------------------------------*/
+ } else if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+ && (0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID,
+ mtr), seg_id))
+ && (!xdes_is_full(descr, mtr))) {
+
+ /* 4. We can take the page from the same extent as the
+ ======================================================
+ hinted page (and the extent already belongs to the
+ ==================================================
+ segment)
+ ========*/
+ ret_descr = descr;
+ ret_page = xdes_get_offset(ret_descr)
+ + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+ hint % FSP_EXTENT_SIZE, mtr);
+ /*-----------------------------------------------------------*/
+ } else if (reserved - used > 0) {
+ /* 5. We take any unused page from the segment
+ ==============================================*/
+ fil_addr_t first;
+
+ if (flst_get_len(seg_inode + FSEG_NOT_FULL, mtr) > 0) {
+ first = flst_get_first(seg_inode + FSEG_NOT_FULL,
+ mtr);
+ } else if (flst_get_len(seg_inode + FSEG_FREE, mtr) > 0) {
+ first = flst_get_first(seg_inode + FSEG_FREE, mtr);
+ } else {
+ ut_error;
+ return(FIL_NULL);
+ }
+
+ ret_descr = xdes_lst_get_descriptor(space, zip_size,
+ first, mtr);
+ ret_page = xdes_get_offset(ret_descr)
+ + xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+ 0, mtr);
+ /*-----------------------------------------------------------*/
+ } else if (used < FSEG_FRAG_LIMIT) {
+ /* 6. We allocate an individual page from the space
+ ===================================================*/
+ ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr);
+ ret_descr = NULL;
+
+ frag_page_allocated = TRUE;
+
+ if (ret_page != FIL_NULL) {
+ /* Put the page in the fragment page array of the
+ segment */
+ n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+ ut_a(n != FIL_NULL);
+
+ fseg_set_nth_frag_page_no(seg_inode, n, ret_page,
+ mtr);
+ }
+ /*-----------------------------------------------------------*/
+ } else {
+ /* 7. We allocate a new extent and take its first page
+ ======================================================*/
+ ret_descr = fseg_alloc_free_extent(seg_inode,
+ space, zip_size, mtr);
+
+ if (ret_descr == NULL) {
+ ret_page = FIL_NULL;
+ } else {
+ ret_page = xdes_get_offset(ret_descr);
+ }
+ }
+
+ if (ret_page == FIL_NULL) {
+ /* Page could not be allocated */
+
+ return(FIL_NULL);
+ }
+
+ if (space != 0) {
+ space_size = fil_space_get_size(space);
+
+ if (space_size <= ret_page) {
+ /* It must be that we are extending a single-table
+ tablespace whose size is still < 64 pages */
+
+ if (ret_page >= FSP_EXTENT_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error (2): trying to extend"
+ " a single-table tablespace %lu\n"
+ "InnoDB: by single page(s) though"
+ " the space size %lu. Page no %lu.\n",
+ (ulong) space, (ulong) space_size,
+ (ulong) ret_page);
+ return(FIL_NULL);
+ }
+
+ success = fsp_try_extend_data_file_with_pages(
+ space, ret_page, space_header, mtr);
+ if (!success) {
+ /* No disk space left */
+ return(FIL_NULL);
+ }
+ }
+ }
+
+ if (!frag_page_allocated) {
+ /* Initialize the allocated page in the buffer pool, so that it
+ can be obtained immediately with buf_page_get without the need
+ for a disk read */
+ buf_block_t* block;
+ ulint zip_size = dict_table_flags_to_zip_size(
+ mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+
+ block = buf_page_create(space, ret_page, zip_size, mtr);
+ buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+ if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size,
+ ret_page, RW_X_LATCH,
+ mtr))) {
+ ut_error;
+ }
+
+ /* The prior contents of the page should be ignored */
+ fsp_init_file_page(block, mtr);
+
+ /* At this point we know the extent and the page offset.
+ The extent is still in the appropriate list (FSEG_NOT_FULL
+ or FSEG_FREE), and the page is not yet marked as used. */
+
+ ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr)
+ == ret_descr);
+ ut_ad(xdes_get_bit(ret_descr, XDES_FREE_BIT,
+ ret_page % FSP_EXTENT_SIZE, mtr) == TRUE);
+
+ fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr);
+ }
+
+ buf_reset_check_index_page_at_flush(space, ret_page);
+
+ return(ret_page);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in: segment header */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ rw_lock_t* latch;
+ ibool success;
+ ulint page_no;
+ ulint n_reserved;
+
+ space = page_get_space_id(page_align(seg_header));
+
+ latch = fil_space_get_latch(space, &flags);
+
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ if (rw_lock_get_x_lock_count(latch) == 1) {
+ /* This thread did not own the latch before this call: free
+ excess pages from the insert buffer free list */
+
+ if (space == IBUF_SPACE_ID) {
+ ibuf_free_excess_pages();
+ }
+ }
+
+ inode = fseg_inode_get(seg_header, space, zip_size, mtr);
+
+ if (!has_done_reservation) {
+ success = fsp_reserve_free_extents(&n_reserved, space, 2,
+ FSP_NORMAL, mtr);
+ if (!success) {
+ return(FIL_NULL);
+ }
+ }
+
+ page_no = fseg_alloc_free_page_low(space, zip_size,
+ inode, hint, direction, mtr);
+ if (!has_done_reservation) {
+ fil_space_release_free_extents(space, n_reserved);
+ }
+
+ return(page_no);
+}
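
The has_done_reservation flag supports the pattern where a caller reserves
extents once for a whole multi-page operation and releases them at the end,
instead of letting every allocation reserve on its own. A sketch of that
pattern, with seg_header and space assumed as in the earlier sketches and the
extent count of 2 chosen only for illustration:

	mtr_t	mtr;
	ulint	n_reserved;
	ulint	page_no;

	mtr_start(&mtr);

	if (fsp_reserve_free_extents(&n_reserved, space, 2, FSP_NORMAL, &mtr)) {

		page_no = fseg_alloc_free_page_general(
			seg_header, 0 /* no hint */, FSP_NO_DIR,
			TRUE /* reservation already done */, &mtr);

		if (page_no == FIL_NULL) {
			/* out of space in spite of the reservation check */
		}

		fil_space_release_free_extents(space, n_reserved);
	}

	mtr_commit(&mtr);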
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+ fseg_header_t* seg_header,/*!< in: segment header */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ return(fseg_alloc_free_page_general(seg_header, hint, direction,
+ FALSE, mtr));
+}
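
Without a prior reservation the simple wrapper is enough; it reserves and
releases 2 extents internally through the general variant above. A sketch,
with hint_page_no assumed to be a page near which the caller would like the
new page to land:

	mtr_t	mtr;
	ulint	page_no;

	mtr_start(&mtr);

	page_no = fseg_alloc_free_page(seg_header, hint_page_no, FSP_UP, &mtr);

	if (page_no == FIL_NULL) {
		/* the tablespace was full and could not be extended */
	}

	mtr_commit(&mtr);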
+
+/**********************************************************************//**
+Checks that we have at least 2 frag pages free in the first extent of a
+single-table tablespace, and that they are also physically initialized in the
+data file; that is, we have already extended the data file so that those pages
+are inside it. If not, this function extends the tablespace with pages.
+@return TRUE if there were enough free pages, or we were able to extend */
+static
+ibool
+fsp_reserve_free_pages(
+/*===================*/
+ ulint space, /*!< in: space id, must be != 0 */
+ fsp_header_t* space_header, /*!< in: header of that space,
+ x-latched */
+ ulint size, /*!< in: size of the tablespace in pages,
+ must be < FSP_EXTENT_SIZE / 2 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ xdes_t* descr;
+ ulint n_used;
+
+ ut_a(space != 0);
+ ut_a(size < FSP_EXTENT_SIZE / 2);
+
+ descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0,
+ mtr);
+ n_used = xdes_get_n_used(descr, mtr);
+
+ ut_a(n_used <= size);
+
+ if (size >= n_used + 2) {
+
+ return(TRUE);
+ }
+
+ return(fsp_try_extend_data_file_with_pages(space, n_used + 1,
+ space_header, mtr));
+}
+
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+ ulint* n_reserved,/*!< out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
+ ulint space, /*!< in: space id */
+ ulint n_ext, /*!< in: number of extents to reserve */
+ ulint alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fsp_header_t* space_header;
+ rw_lock_t* latch;
+ ulint n_free_list_ext;
+ ulint free_limit;
+ ulint size;
+ ulint flags;
+ ulint zip_size;
+ ulint n_free;
+ ulint n_free_up;
+ ulint reserve;
+ ibool success;
+ ulint n_pages_added;
+
+ ut_ad(mtr);
+ *n_reserved = n_ext;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ space_header = fsp_get_space_header(space, zip_size, mtr);
+try_again:
+ size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+ if (size < FSP_EXTENT_SIZE / 2) {
+ /* Use different rules for small single-table tablespaces */
+ *n_reserved = 0;
+ return(fsp_reserve_free_pages(space, space_header, size, mtr));
+ }
+
+ n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr);
+
+ free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+ MLOG_4BYTES, mtr);
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+
+ n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+ if (n_free_up > 0) {
+ n_free_up--;
+ if (!zip_size) {
+ n_free_up -= n_free_up
+ / (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+ } else {
+ n_free_up -= n_free_up
+ / (zip_size / FSP_EXTENT_SIZE);
+ }
+ }
+
+ n_free = n_free_list_ext + n_free_up;
+
+ if (alloc_type == FSP_NORMAL) {
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function below! */
+
+ reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ } else if (alloc_type == FSP_UNDO) {
+ /* We reserve 0.5 % of the space size to cleaning operations */
+
+ reserve = 1 + ((size / FSP_EXTENT_SIZE) * 1) / 200;
+
+ if (n_free <= reserve + n_ext) {
+
+ goto try_to_extend;
+ }
+ } else {
+ ut_a(alloc_type == FSP_CLEANING);
+ }
+
+ success = fil_space_reserve_free_extents(space, n_free, n_ext);
+
+ if (success) {
+ return(TRUE);
+ }
+try_to_extend:
+ success = fsp_try_extend_data_file(&n_pages_added, space,
+ space_header, mtr);
+ if (success && n_pages_added > 0) {
+
+ goto try_again;
+ }
+
+ return(FALSE);
+}
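
A worked example of the safety margins computed above, for a hypothetical
640,000-page tablespace (roughly 10 GB with the default 16 KB page size); the
numbers follow directly from the formulas in the function:

	/* size / FSP_EXTENT_SIZE = 640000 / 64 = 10000 extents in the space */
	ulint	size	= 640000;
	ulint	reserve	= 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;	/* = 102 */

	/* An FSP_NORMAL request for n_ext extents is satisfied from the free
	list only while n_free > reserve + n_ext, i.e. more than 102 + n_ext
	extents remain free; FSP_UNDO keeps roughly half of that margin
	(1 + 10000 / 200 = 51 extents) and FSP_CLEANING keeps none */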
+
+/**********************************************************************//**
+This function should be used to get information on how much data we can
+still insert into the tablespace without running out of space. Only free
+extents are taken into account, and we also subtract the safety margin
+required by the function fsp_reserve_free_extents above.
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+ ulint space) /*!< in: space id */
+{
+ fsp_header_t* space_header;
+ ulint n_free_list_ext;
+ ulint free_limit;
+ ulint size;
+ ulint flags;
+ ulint zip_size;
+ ulint n_free;
+ ulint n_free_up;
+ ulint reserve;
+ rw_lock_t* latch;
+ mtr_t mtr;
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ mtr_start(&mtr);
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ mtr_x_lock(latch, &mtr);
+
+ space_header = fsp_get_space_header(space, zip_size, &mtr);
+
+ size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+ n_free_list_ext = flst_get_len(space_header + FSP_FREE, &mtr);
+
+ free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ if (size < FSP_EXTENT_SIZE) {
+ ut_a(space != 0); /* This must be a single-table
+ tablespace */
+
+ return(0); /* TODO: count free frag pages and
+ return a value based on that */
+ }
+
+ /* Below we play safe when counting free extents above the free limit:
+ some of them will contain extent descriptor pages, and therefore
+ will not be free extents */
+
+ n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+ if (n_free_up > 0) {
+ n_free_up--;
+ if (!zip_size) {
+ n_free_up -= n_free_up
+ / (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+ } else {
+ n_free_up -= n_free_up
+ / (zip_size / FSP_EXTENT_SIZE);
+ }
+ }
+
+ n_free = n_free_list_ext + n_free_up;
+
+ /* We reserve 1 extent + 0.5 % of the space size to undo logs
+ and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+ code is duplicated in the function above! */
+
+ reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+ if (reserve > n_free) {
+ return(0);
+ }
+
+ if (!zip_size) {
+ return((ullint) (n_free - reserve)
+ * FSP_EXTENT_SIZE
+ * (UNIV_PAGE_SIZE / 1024));
+ } else {
+ return((ullint) (n_free - reserve)
+ * FSP_EXTENT_SIZE
+ * (zip_size / 1024));
+ }
+}
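
A sketch of reading the estimate; the function starts its own
mini-transaction, so the caller only needs the space id (assumed here) and
must not hold kernel_mutex:

	ullint	free_kb;

	free_kb = fsp_get_available_space_in_free_extents(space);

	/* Only whole free extents minus the reservation margin are counted,
	so the estimate can be 0 for a small or nearly full tablespace even
	though individual fragment pages are still free */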
+
+/********************************************************************//**
+Marks a page used. The page must reside within the extents of the given
+segment. */
+static
+void
+fseg_mark_page_used(
+/*================*/
+ fseg_inode_t* seg_inode,/*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ xdes_t* descr;
+ ulint not_full_n_used;
+
+ ut_ad(seg_inode && mtr);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+
+ descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+ ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr)
+ == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr));
+
+ if (xdes_is_free(descr, mtr)) {
+ /* We move the extent from the free list to the
+ NOT_FULL list */
+ flst_remove(seg_inode + FSEG_FREE, descr + XDES_FLST_NODE,
+ mtr);
+ flst_add_last(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ }
+
+ ut_ad(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
+ == TRUE);
+ /* We mark the page as used */
+ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, FALSE, mtr);
+
+ not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr);
+ not_full_n_used++;
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, not_full_n_used,
+ MLOG_4BYTES, mtr);
+ if (xdes_is_full(descr, mtr)) {
+ /* We move the extent from the NOT_FULL list to the
+ FULL list */
+ flst_remove(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ flst_add_last(seg_inode + FSEG_FULL,
+ descr + XDES_FLST_NODE, mtr);
+
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - FSP_EXTENT_SIZE,
+ MLOG_4BYTES, mtr);
+ }
+}
+
+/**********************************************************************//**
+Frees a single page of a segment. */
+static
+void
+fseg_free_page_low(
+/*===============*/
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ xdes_t* descr;
+ ulint not_full_n_used;
+ ulint state;
+ dulint descr_id;
+ dulint seg_id;
+ ulint i;
+
+ ut_ad(seg_inode && mtr);
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+ ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+ /* Drop search system page hash index if the page is found in
+ the pool and is hashed */
+
+ btr_search_drop_page_hash_when_freed(space, zip_size, page);
+
+ descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+ if (srv_pass_corrupt_table && !descr) {
+ /* The page may be corrupt; skip it. */
+ return;
+ }
+
+ ut_a(descr);
+ if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) {
+ fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+ stderr);
+ ut_print_buf(stderr, descr, 40);
+
+ fprintf(stderr, "\n"
+ "InnoDB: Serious error! InnoDB is trying to"
+ " free page %lu\n"
+ "InnoDB: though it is already marked as free"
+ " in the tablespace!\n"
+ "InnoDB: The tablespace free space info is corrupt.\n"
+ "InnoDB: You may need to dump your"
+ " InnoDB tables and recreate the whole\n"
+ "InnoDB: database!\n", (ulong) page);
+crash:
+ fputs("InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "forcing-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+ ut_error;
+ }
+
+ state = xdes_get_state(descr, mtr);
+
+ if (state != XDES_FSEG) {
+ /* The page is in the fragment pages of the segment */
+
+ for (i = 0;; i++) {
+ if (fseg_get_nth_frag_page_no(seg_inode, i, mtr)
+ == page) {
+
+ fseg_set_nth_frag_page_no(seg_inode, i,
+ FIL_NULL, mtr);
+ break;
+ }
+ }
+
+ fsp_free_page(space, zip_size, page, mtr);
+
+ return;
+ }
+
+ /* If we get here, the page is in some extent of the segment */
+
+ descr_id = mtr_read_dulint(descr + XDES_ID, mtr);
+ seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr);
+#if 0
+ fprintf(stderr,
+ "InnoDB: InnoDB is freeing space %lu page %lu,\n"
+ "InnoDB: which belongs to descr seg %lu %lu\n"
+ "InnoDB: segment %lu %lu.\n",
+ (ulong) space, (ulong) page,
+ (ulong) ut_dulint_get_high(descr_id),
+ (ulong) ut_dulint_get_low(descr_id),
+ (ulong) ut_dulint_get_high(seg_id),
+ (ulong) ut_dulint_get_low(seg_id));
+#endif /* 0 */
+ if (0 != ut_dulint_cmp(descr_id, seg_id)) {
+ fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+ stderr);
+ ut_print_buf(stderr, descr, 40);
+ fputs("\nInnoDB: Dump of the segment inode: ", stderr);
+ ut_print_buf(stderr, seg_inode, 40);
+ putc('\n', stderr);
+
+ fprintf(stderr,
+ "InnoDB: Serious error: InnoDB is trying to"
+ " free space %lu page %lu,\n"
+ "InnoDB: which does not belong to"
+ " segment %lu %lu but belongs\n"
+ "InnoDB: to segment %lu %lu.\n",
+ (ulong) space, (ulong) page,
+ (ulong) ut_dulint_get_high(descr_id),
+ (ulong) ut_dulint_get_low(descr_id),
+ (ulong) ut_dulint_get_high(seg_id),
+ (ulong) ut_dulint_get_low(seg_id));
+ goto crash;
+ }
+
+ not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr);
+ if (xdes_is_full(descr, mtr)) {
+ /* The extent was full: move it to the NOT_FULL list */
+ flst_remove(seg_inode + FSEG_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ flst_add_last(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used + FSP_EXTENT_SIZE - 1,
+ MLOG_4BYTES, mtr);
+ } else {
+ ut_a(not_full_n_used > 0);
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - 1, MLOG_4BYTES, mtr);
+ }
+
+ xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+ xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+
+ if (xdes_is_free(descr, mtr)) {
+ /* The extent has become free: free it to space */
+ flst_remove(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ fsp_free_extent(space, zip_size, page, mtr);
+ }
+}
+
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint flags;
+ ulint zip_size;
+ fseg_inode_t* seg_inode;
+ rw_lock_t* latch;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ seg_inode = fseg_inode_get(seg_header, space, zip_size, mtr);
+
+ fseg_free_page_low(seg_inode, space, zip_size, page, mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ buf_page_set_file_page_was_freed(space, page);
+#endif
+}
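
Freeing one page back to its segment is symmetric to allocation; a sketch,
assuming seg_header, space and page_no are known to the caller and that the
page is not accessed again after the call:

	mtr_t	mtr;

	mtr_start(&mtr);
	fseg_free_page(seg_header, space, page_no, &mtr);
	mtr_commit(&mtr);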
+
+/**********************************************************************//**
+Frees an extent of a segment to the space free list. */
+static
+void
+fseg_free_extent(
+/*=============*/
+ fseg_inode_t* seg_inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page, /*!< in: a page in the extent */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ ulint first_page_in_extent;
+ xdes_t* descr;
+ ulint not_full_n_used;
+ ulint descr_n_used;
+ ulint i;
+
+ ut_ad(seg_inode && mtr);
+
+ descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+ ut_a(xdes_get_state(descr, mtr) == XDES_FSEG);
+ ut_a(0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, mtr),
+ mtr_read_dulint(seg_inode + FSEG_ID, mtr)));
+ ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+ == FSEG_MAGIC_N_VALUE);
+
+ first_page_in_extent = page - (page % FSP_EXTENT_SIZE);
+
+ for (i = 0; i < FSP_EXTENT_SIZE; i++) {
+ if (FALSE == xdes_get_bit(descr, XDES_FREE_BIT, i, mtr)) {
+
+ /* Drop search system page hash index if the page is
+ found in the pool and is hashed */
+
+ btr_search_drop_page_hash_when_freed(
+ space, zip_size, first_page_in_extent + i);
+ }
+ }
+
+ if (xdes_is_full(descr, mtr)) {
+ flst_remove(seg_inode + FSEG_FULL,
+ descr + XDES_FLST_NODE, mtr);
+ } else if (xdes_is_free(descr, mtr)) {
+ flst_remove(seg_inode + FSEG_FREE,
+ descr + XDES_FLST_NODE, mtr);
+ } else {
+ flst_remove(seg_inode + FSEG_NOT_FULL,
+ descr + XDES_FLST_NODE, mtr);
+
+ not_full_n_used = mtr_read_ulint(
+ seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr);
+
+ descr_n_used = xdes_get_n_used(descr, mtr);
+ ut_a(not_full_n_used >= descr_n_used);
+ mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+ not_full_n_used - descr_n_used,
+ MLOG_4BYTES, mtr);
+ }
+
+ fsp_free_extent(space, zip_size, page, mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ for (i = 0; i < FSP_EXTENT_SIZE; i++) {
+
+ buf_page_set_file_page_was_freed(space,
+ first_page_in_extent + i);
+ }
+#endif
+}
+
+/**********************************************************************//**
+Frees part of a segment. A segment can be freed by calling this function
+repeatedly in different mini-transactions; doing all the freeing in a single
+mini-transaction might result in too big a mini-transaction.
+@return TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n;
+ ulint page;
+ xdes_t* descr;
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ ulint header_page;
+ rw_lock_t* latch;
+
+ space = page_get_space_id(page_align(header));
+ header_page = page_get_page_no(page_align(header));
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ descr = xdes_get_descriptor(space, zip_size, header_page, mtr);
+
+ if (srv_pass_corrupt_table && !descr) {
+ /* The page may be corrupt; skip it. */
+ return(TRUE);
+ }
+
+ /* Check that the header resides on a page which has not been
+ freed yet */
+
+ ut_a(descr);
+ ut_a(xdes_get_bit(descr, XDES_FREE_BIT,
+ header_page % FSP_EXTENT_SIZE, mtr) == FALSE);
+ inode = fseg_inode_try_get(header, space, zip_size, mtr);
+
+ if (UNIV_UNLIKELY(inode == NULL)) {
+ fprintf(stderr, "double free of inode from %u:%u\n",
+ (unsigned) space, (unsigned) header_page);
+ return(TRUE);
+ }
+
+ descr = fseg_get_first_extent(inode, space, zip_size, mtr);
+
+ if (descr != NULL) {
+ /* Free the extent held by the segment */
+ page = xdes_get_offset(descr);
+
+ fseg_free_extent(inode, space, zip_size, page, mtr);
+
+ return(FALSE);
+ }
+
+ /* Free a frag page */
+ n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+ return(TRUE);
+ }
+
+ fseg_free_page_low(inode, space, zip_size,
+ fseg_get_nth_frag_page_no(inode, n, mtr), mtr);
+
+ n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+ if (n == ULINT_UNDEFINED) {
+ /* Freeing completed: free the segment inode */
+ fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
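
The intended usage is a loop that frees one extent or one fragment page per
mini-transaction. A sketch, assuming an uncompressed tablespace and that the
page number and byte offset of the segment header are known; both are
hypothetical values here. The header page is re-fetched inside every
mini-transaction, since a frame pointer obtained in a committed
mini-transaction may no longer be valid:

	ulint	space = 0;		/* hypothetical space id */
	ulint	header_page_no = 3;	/* hypothetical page holding the header */
	ulint	byte_offset = 100;	/* hypothetical offset of the header */
	ibool	finished;

	do {
		mtr_t		mtr;
		buf_block_t*	block;
		fseg_header_t*	header;

		mtr_start(&mtr);

		block = buf_page_get(space, 0 /* zip_size: uncompressed */,
				     header_page_no, RW_X_LATCH, &mtr);
		header = buf_block_get_frame(block) + byte_offset;

		finished = fseg_free_step(header, &mtr);

		mtr_commit(&mtr);
	} while (!finished);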
+
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n;
+ ulint page;
+ xdes_t* descr;
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+ ulint page_no;
+ rw_lock_t* latch;
+
+ space = page_get_space_id(page_align(header));
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ ut_ad(!mutex_own(&kernel_mutex)
+ || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+ mtr_x_lock(latch, mtr);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ if (srv_pass_corrupt_table && !inode) {
+ /* ignore the corruption */
+ return(TRUE);
+ }
+ ut_a(inode);
+
+ descr = fseg_get_first_extent(inode, space, zip_size, mtr);
+
+ if (descr != NULL) {
+ /* Free the extent held by the segment */
+ page = xdes_get_offset(descr);
+
+ fseg_free_extent(inode, space, zip_size, page, mtr);
+
+ return(FALSE);
+ }
+
+ /* Free a frag page */
+
+ n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+ if (n == ULINT_UNDEFINED) {
+ ut_error;
+ }
+
+ page_no = fseg_get_nth_frag_page_no(inode, n, mtr);
+
+ if (page_no == page_get_page_no(page_align(header))) {
+
+ return(TRUE);
+ }
+
+ fseg_free_page_low(inode, space, zip_size, page_no, mtr);
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Returns the first extent descriptor for a segment. We think of the extent
+lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL
+-> FSEG_FREE.
+@return the first extent descriptor, or NULL if none */
+static
+xdes_t*
+fseg_get_first_extent(
+/*==================*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fil_addr_t first;
+ xdes_t* descr;
+
+ ut_ad(inode && mtr);
+
+ ut_ad(space == page_get_space_id(page_align(inode)));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ first = fil_addr_null;
+
+ if (flst_get_len(inode + FSEG_FULL, mtr) > 0) {
+
+ first = flst_get_first(inode + FSEG_FULL, mtr);
+
+ } else if (flst_get_len(inode + FSEG_NOT_FULL, mtr) > 0) {
+
+ first = flst_get_first(inode + FSEG_NOT_FULL, mtr);
+
+ } else if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+
+ first = flst_get_first(inode + FSEG_FREE, mtr);
+ }
+
+ if (first.page == FIL_NULL) {
+
+ return(NULL);
+ }
+ descr = xdes_lst_get_descriptor(space, zip_size, first, mtr);
+
+ return(descr);
+}
+
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+static
+ibool
+fseg_validate_low(
+/*==============*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr2) /*!< in: mtr */
+{
+ ulint space;
+ dulint seg_id;
+ mtr_t mtr;
+ xdes_t* descr;
+ fil_addr_t node_addr;
+ ulint n_used = 0;
+ ulint n_used2 = 0;
+
+ ut_ad(mtr_memo_contains_page(mtr2, inode, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+ space = page_get_space_id(page_align(inode));
+
+ seg_id = mtr_read_dulint(inode + FSEG_ID, mtr2);
+ n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr2);
+ flst_validate(inode + FSEG_FREE, mtr2);
+ flst_validate(inode + FSEG_NOT_FULL, mtr2);
+ flst_validate(inode + FSEG_FULL, mtr2);
+
+ /* Validate FSEG_FREE list */
+ node_addr = flst_get_first(inode + FSEG_FREE, mtr2);
+
+ while (!fil_addr_is_null(node_addr)) {
+ ulint flags;
+ ulint zip_size;
+
+ mtr_start(&mtr);
+ mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == 0);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+ ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr),
+ seg_id));
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSEG_NOT_FULL list */
+
+ node_addr = flst_get_first(inode + FSEG_NOT_FULL, mtr2);
+
+ while (!fil_addr_is_null(node_addr)) {
+ ulint flags;
+ ulint zip_size;
+
+ mtr_start(&mtr);
+ mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) > 0);
+ ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+ ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr),
+ seg_id));
+
+ n_used2 += xdes_get_n_used(descr, &mtr);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSEG_FULL list */
+
+ node_addr = flst_get_first(inode + FSEG_FULL, mtr2);
+
+ while (!fil_addr_is_null(node_addr)) {
+ ulint flags;
+ ulint zip_size;
+
+ mtr_start(&mtr);
+ mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+ ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr),
+ seg_id));
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ ut_a(n_used == n_used2);
+
+ return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fseg_inode_t* inode;
+ ibool ret;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+
+ space = page_get_space_id(page_align(header));
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ ret = fseg_validate_low(inode, mtr);
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG */
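
In debug builds the validator can simply be asserted on; a sketch, with
seg_header assumed as in the earlier sketches:

#ifdef UNIV_DEBUG
	mtr_t	mtr;

	mtr_start(&mtr);
	ut_a(fseg_validate(seg_header, &mtr));
	mtr_commit(&mtr);
#endif /* UNIV_DEBUG */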
+
+/*******************************************************************//**
+Writes info of a segment. */
+static
+void
+fseg_print_low(
+/*===========*/
+ fseg_inode_t* inode, /*!< in: segment inode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint seg_id_low;
+ ulint seg_id_high;
+ ulint n_used;
+ ulint n_frag;
+ ulint n_free;
+ ulint n_not_full;
+ ulint n_full;
+ ulint reserved;
+ ulint used;
+ ulint page_no;
+ dulint d_var;
+
+ ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+ space = page_get_space_id(page_align(inode));
+ page_no = page_get_page_no(page_align(inode));
+
+ reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+
+ d_var = mtr_read_dulint(inode + FSEG_ID, mtr);
+
+ seg_id_low = ut_dulint_get_low(d_var);
+ seg_id_high = ut_dulint_get_high(d_var);
+
+ n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+ MLOG_4BYTES, mtr);
+ n_frag = fseg_get_n_frag_pages(inode, mtr);
+ n_free = flst_get_len(inode + FSEG_FREE, mtr);
+ n_not_full = flst_get_len(inode + FSEG_NOT_FULL, mtr);
+ n_full = flst_get_len(inode + FSEG_FULL, mtr);
+
+ fprintf(stderr,
+ "SEGMENT id %lu %lu space %lu; page %lu;"
+ " res %lu used %lu; full ext %lu\n"
+ "fragm pages %lu; free extents %lu;"
+ " not full extents %lu: pages %lu\n",
+ (ulong) seg_id_high, (ulong) seg_id_low,
+ (ulong) space, (ulong) page_no,
+ (ulong) reserved, (ulong) used, (ulong) n_full,
+ (ulong) n_frag, (ulong) n_free, (ulong) n_not_full,
+ (ulong) n_used);
+ ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+}
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fseg_inode_t* inode;
+ ulint space;
+ ulint flags;
+ ulint zip_size;
+
+ space = page_get_space_id(page_align(header));
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ inode = fseg_inode_get(header, space, zip_size, mtr);
+
+ fseg_print_low(inode, mtr);
+}
+#endif /* UNIV_BTR_PRINT */
+
+/*******************************************************************//**
+Validates the file space system and its segments.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+ ulint space) /*!< in: space id */
+{
+ fsp_header_t* header;
+ fseg_inode_t* seg_inode;
+ page_t* seg_inode_page;
+ rw_lock_t* latch;
+ ulint size;
+ ulint flags;
+ ulint zip_size;
+ ulint free_limit;
+ ulint frag_n_used;
+ mtr_t mtr;
+ mtr_t mtr2;
+ xdes_t* descr;
+ fil_addr_t node_addr;
+ fil_addr_t next_node_addr;
+ ulint descr_count = 0;
+ ulint n_used = 0;
+ ulint n_used2 = 0;
+ ulint n_full_frag_pages;
+ ulint n;
+ ulint seg_inode_len_free;
+ ulint seg_inode_len_full;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+ ut_a(ut_is_2pow(zip_size));
+ ut_a(zip_size <= UNIV_PAGE_SIZE);
+ ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+
+ /* First start a mini-transaction mtr2 to lock out all other threads
+ from the fsp system */
+ mtr_start(&mtr2);
+ mtr_x_lock(latch, &mtr2);
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+ free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT,
+ MLOG_4BYTES, &mtr);
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED,
+ MLOG_4BYTES, &mtr);
+
+ n_full_frag_pages = FSP_EXTENT_SIZE
+ * flst_get_len(header + FSP_FULL_FRAG, &mtr);
+
+ if (UNIV_UNLIKELY(free_limit > size)) {
+
+ ut_a(space != 0);
+ ut_a(size < FSP_EXTENT_SIZE);
+ }
+
+ flst_validate(header + FSP_FREE, &mtr);
+ flst_validate(header + FSP_FREE_FRAG, &mtr);
+ flst_validate(header + FSP_FULL_FRAG, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Validate FSP_FREE list */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+ node_addr = flst_get_first(header + FSP_FREE, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ descr_count++;
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == 0);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FREE);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSP_FREE_FRAG list */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+ node_addr = flst_get_first(header + FSP_FREE_FRAG, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ descr_count++;
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) > 0);
+ ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FREE_FRAG);
+
+ n_used += xdes_get_n_used(descr, &mtr);
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+
+ mtr_commit(&mtr);
+ }
+
+ /* Validate FSP_FULL_FRAG list */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+ node_addr = flst_get_first(header + FSP_FULL_FRAG, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ descr_count++;
+ descr = xdes_lst_get_descriptor(space, zip_size,
+ node_addr, &mtr);
+
+ ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE);
+ ut_a(xdes_get_state(descr, &mtr) == XDES_FULL_FRAG);
+
+ node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+ mtr_commit(&mtr);
+ }
+
+ /* Validate segments */
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr);
+
+ seg_inode_len_full = flst_get_len(header + FSP_SEG_INODES_FULL, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+ do {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ ut_a(!ut_dulint_is_zero(
+ mach_read_from_8(seg_inode + FSEG_ID)));
+ fseg_validate_low(seg_inode, &mtr);
+
+ descr_count += flst_get_len(seg_inode + FSEG_FREE,
+ &mtr);
+ descr_count += flst_get_len(seg_inode + FSEG_FULL,
+ &mtr);
+ descr_count += flst_get_len(seg_inode + FSEG_NOT_FULL,
+ &mtr);
+
+ n_used2 += fseg_get_n_frag_pages(seg_inode, &mtr);
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr);
+
+ seg_inode_len_free = flst_get_len(header + FSP_SEG_INODES_FREE, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+
+ do {
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ if (!ut_dulint_is_zero(
+ mach_read_from_8(seg_inode + FSEG_ID))) {
+ fseg_validate_low(seg_inode, &mtr);
+
+ descr_count += flst_get_len(
+ seg_inode + FSEG_FREE, &mtr);
+ descr_count += flst_get_len(
+ seg_inode + FSEG_FULL, &mtr);
+ descr_count += flst_get_len(
+ seg_inode + FSEG_NOT_FULL, &mtr);
+ n_used2 += fseg_get_n_frag_pages(
+ seg_inode, &mtr);
+ }
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ ut_a(descr_count * FSP_EXTENT_SIZE == free_limit);
+ if (!zip_size) {
+ ut_a(n_used + n_full_frag_pages
+ == n_used2 + 2 * ((free_limit + (UNIV_PAGE_SIZE - 1))
+ / UNIV_PAGE_SIZE)
+ + seg_inode_len_full + seg_inode_len_free);
+ } else {
+ ut_a(n_used + n_full_frag_pages
+ == n_used2 + 2 * ((free_limit + (zip_size - 1))
+ / zip_size)
+ + seg_inode_len_full + seg_inode_len_free);
+ }
+ ut_a(frag_n_used == n_used);
+
+ mtr_commit(&mtr2);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Prints info of a file space. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+ ulint space) /*!< in: space id */
+{
+ fsp_header_t* header;
+ fseg_inode_t* seg_inode;
+ page_t* seg_inode_page;
+ rw_lock_t* latch;
+ ulint flags;
+ ulint zip_size;
+ ulint size;
+ ulint free_limit;
+ ulint frag_n_used;
+ fil_addr_t node_addr;
+ fil_addr_t next_node_addr;
+ ulint n_free;
+ ulint n_free_frag;
+ ulint n_full_frag;
+ ulint seg_id_low;
+ ulint seg_id_high;
+ ulint n;
+ ulint n_segs = 0;
+ dulint d_var;
+ mtr_t mtr;
+ mtr_t mtr2;
+
+ latch = fil_space_get_latch(space, &flags);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ /* First start a mini-transaction mtr2 to lock out all other threads
+ from the fsp system */
+
+ mtr_start(&mtr2);
+
+ mtr_x_lock(latch, &mtr2);
+
+ mtr_start(&mtr);
+
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+ free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES,
+ &mtr);
+ frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+ &mtr);
+ n_free = flst_get_len(header + FSP_FREE, &mtr);
+ n_free_frag = flst_get_len(header + FSP_FREE_FRAG, &mtr);
+ n_full_frag = flst_get_len(header + FSP_FULL_FRAG, &mtr);
+
+ d_var = mtr_read_dulint(header + FSP_SEG_ID, &mtr);
+
+ seg_id_low = ut_dulint_get_low(d_var);
+ seg_id_high = ut_dulint_get_high(d_var);
+
+ fprintf(stderr,
+ "FILE SPACE INFO: id %lu\n"
+ "size %lu, free limit %lu, free extents %lu\n"
+ "not full frag extents %lu: used pages %lu,"
+ " full frag extents %lu\n"
+ "first seg id not used %lu %lu\n",
+ (ulong) space,
+ (ulong) size, (ulong) free_limit, (ulong) n_free,
+ (ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag,
+ (ulong) seg_id_high, (ulong) seg_id_low);
+
+ mtr_commit(&mtr);
+
+ /* Print segments */
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+
+ do {
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ ut_a(!ut_dulint_is_zero(
+ mach_read_from_8(seg_inode + FSEG_ID)));
+ fseg_print_low(seg_inode, &mtr);
+
+ n_segs++;
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ header = fsp_get_space_header(space, zip_size, &mtr);
+
+ node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr);
+
+ mtr_commit(&mtr);
+
+ while (!fil_addr_is_null(node_addr)) {
+
+ n = 0;
+
+ do {
+
+ mtr_start(&mtr);
+ mtr_x_lock(latch, &mtr);
+
+ seg_inode_page = fut_get_ptr(
+ space, zip_size, node_addr, RW_X_LATCH, &mtr)
+ - FSEG_INODE_PAGE_NODE;
+
+ seg_inode = fsp_seg_inode_page_get_nth_inode(
+ seg_inode_page, n, zip_size, &mtr);
+ if (!ut_dulint_is_zero(
+ mach_read_from_8(seg_inode + FSEG_ID))) {
+
+ fseg_print_low(seg_inode, &mtr);
+ n_segs++;
+ }
+
+ next_node_addr = flst_get_next_addr(
+ seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+ mtr_commit(&mtr);
+ } while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+ node_addr = next_node_addr;
+ }
+
+ mtr_commit(&mtr2);
+
+ fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) n_segs);
+}
+#endif /* !UNIV_HOTBACKUP */
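
A sketch of how the two space-level diagnostics above might be driven for the
system tablespace (space id 0, assumed); both functions take and release the
space latch themselves, so no mini-transaction is needed at the call site:

	ut_a(fsp_validate(0));	/* asserts internally on any inconsistency */
	fsp_print(0);		/* dumps header and segment info to stderr */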
diff --git a/storage/xtradb/fut/fut0fut.c b/storage/xtradb/fut/fut0fut.c
new file mode 100644
index 00000000000..20b45a575e6
--- /dev/null
+++ b/storage/xtradb/fut/fut0fut.c
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0fut.c
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+
+#ifdef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
diff --git a/storage/xtradb/fut/fut0lst.c b/storage/xtradb/fut/fut0lst.c
new file mode 100644
index 00000000000..a1e21c22725
--- /dev/null
+++ b/storage/xtradb/fut/fut0lst.c
@@ -0,0 +1,530 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0lst.c
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0lst.h"
+
+#ifdef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0page.h"
+
+/********************************************************************//**
+Adds a node to an empty list. */
+static
+void
+flst_add_to_empty(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of
+ empty list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node_addr;
+ ulint len;
+
+ ut_ad(mtr && base && node);
+ ut_ad(base != node);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+ len = flst_get_len(base, mtr);
+ ut_a(len == 0);
+
+ buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+ /* Update first and last fields of base node */
+ flst_write_addr(base + FLST_FIRST, node_addr, mtr);
+ flst_write_addr(base + FLST_LAST, node_addr, mtr);
+
+ /* Set prev and next fields of node to add */
+ flst_write_addr(node + FLST_PREV, fil_addr_null, mtr);
+ flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr);
+
+ /* Update len of base node */
+ mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Adds a node as the last node in a list. */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node_addr;
+ ulint len;
+ fil_addr_t last_addr;
+ flst_node_t* last_node;
+
+ ut_ad(mtr && base && node);
+ ut_ad(base != node);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+ len = flst_get_len(base, mtr);
+ last_addr = flst_get_last(base, mtr);
+
+ buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+ /* If the list is not empty, call flst_insert_after */
+ if (len != 0) {
+ if (last_addr.page == node_addr.page) {
+ last_node = page_align(node) + last_addr.boffset;
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space);
+
+ last_node = fut_get_ptr(space, zip_size, last_addr,
+ RW_X_LATCH, mtr);
+ }
+
+ flst_insert_after(base, last_node, node, mtr);
+ } else {
+ /* else call flst_add_to_empty */
+ flst_add_to_empty(base, node, mtr);
+ }
+}
+
+/********************************************************************//**
+Adds a node as the first node in a list. */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node_addr;
+ ulint len;
+ fil_addr_t first_addr;
+ flst_node_t* first_node;
+
+ ut_ad(mtr && base && node);
+ ut_ad(base != node);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+ len = flst_get_len(base, mtr);
+ first_addr = flst_get_first(base, mtr);
+
+ buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+ /* If the list is not empty, call flst_insert_before */
+ if (len != 0) {
+ if (first_addr.page == node_addr.page) {
+ first_node = page_align(node) + first_addr.boffset;
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space);
+
+ first_node = fut_get_ptr(space, zip_size, first_addr,
+ RW_X_LATCH, mtr);
+ }
+
+ flst_insert_before(base, node, first_node, mtr);
+ } else {
+ /* else call flst_add_to_empty */
+ flst_add_to_empty(base, node, mtr);
+ }
+}
+
+/********************************************************************//**
+Inserts a node after another in a list. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node1, /*!< in: node to insert after */
+ flst_node_t* node2, /*!< in: node to add */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ flst_node_t* node3;
+ fil_addr_t node3_addr;
+ ulint len;
+
+ ut_ad(mtr && node1 && node2 && base);
+ ut_ad(base != node1);
+ ut_ad(base != node2);
+ ut_ad(node2 != node1);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+
+ buf_ptr_get_fsp_addr(node1, &space, &node1_addr);
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+ node3_addr = flst_get_next_addr(node1, mtr);
+
+ /* Set prev and next fields of node2 */
+ flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
+ flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
+
+ if (!fil_addr_is_null(node3_addr)) {
+ /* Update prev field of node3 */
+ ulint zip_size = fil_space_get_zip_size(space);
+
+ node3 = fut_get_ptr(space, zip_size,
+ node3_addr, RW_X_LATCH, mtr);
+ flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
+ } else {
+ /* node1 was last in list: update last field in base */
+ flst_write_addr(base + FLST_LAST, node2_addr, mtr);
+ }
+
+ /* Set next field of node1 */
+ flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Inserts a node before another in a list. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to insert */
+ flst_node_t* node3, /*!< in: node to insert before */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ flst_node_t* node1;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ fil_addr_t node3_addr;
+ ulint len;
+
+ ut_ad(mtr && node2 && node3 && base);
+ ut_ad(base != node2);
+ ut_ad(base != node3);
+ ut_ad(node2 != node3);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node3, MTR_MEMO_PAGE_X_FIX));
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+ buf_ptr_get_fsp_addr(node3, &space, &node3_addr);
+
+ node1_addr = flst_get_prev_addr(node3, mtr);
+
+ /* Set prev and next fields of node2 */
+ flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
+ flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
+
+ if (!fil_addr_is_null(node1_addr)) {
+ ulint zip_size = fil_space_get_zip_size(space);
+ /* Update next field of node1 */
+ node1 = fut_get_ptr(space, zip_size, node1_addr,
+ RW_X_LATCH, mtr);
+ flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
+ } else {
+ /* node3 was first in list: update first field in base */
+ flst_write_addr(base + FLST_FIRST, node2_addr, mtr);
+ }
+
+ /* Set prev field of node3 */
+ flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Removes a node. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to remove */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ ulint zip_size;
+ flst_node_t* node1;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ flst_node_t* node3;
+ fil_addr_t node3_addr;
+ ulint len;
+
+ ut_ad(mtr && node2 && base);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+ zip_size = fil_space_get_zip_size(space);
+
+ node1_addr = flst_get_prev_addr(node2, mtr);
+ node3_addr = flst_get_next_addr(node2, mtr);
+
+ if (!fil_addr_is_null(node1_addr)) {
+
+ /* Update next field of node1 */
+
+ if (node1_addr.page == node2_addr.page) {
+
+ node1 = page_align(node2) + node1_addr.boffset;
+ } else {
+ node1 = fut_get_ptr(space, zip_size,
+ node1_addr, RW_X_LATCH, mtr);
+ }
+
+ ut_ad(node1 != node2);
+
+ flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr);
+ } else {
+ /* node2 was first in list: update first field in base */
+ flst_write_addr(base + FLST_FIRST, node3_addr, mtr);
+ }
+
+ if (!fil_addr_is_null(node3_addr)) {
+ /* Update prev field of node3 */
+
+ if (node3_addr.page == node2_addr.page) {
+
+ node3 = page_align(node2) + node3_addr.boffset;
+ } else {
+ node3 = fut_get_ptr(space, zip_size,
+ node3_addr, RW_X_LATCH, mtr);
+ }
+
+ ut_ad(node2 != node3);
+
+ flst_write_addr(node3 + FLST_PREV, node1_addr, mtr);
+ } else {
+ /* node2 was last in list: update last field in base */
+ flst_write_addr(base + FLST_LAST, node1_addr, mtr);
+ }
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ ut_ad(len > 0);
+
+ mlog_write_ulint(base + FLST_LEN, len - 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove,
+ must be >= 1 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint space;
+ flst_node_t* node1;
+ fil_addr_t node1_addr;
+ fil_addr_t node2_addr;
+ ulint len;
+
+ ut_ad(mtr && node2 && base);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(n_nodes > 0);
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+ node1_addr = flst_get_prev_addr(node2, mtr);
+
+ if (!fil_addr_is_null(node1_addr)) {
+
+ /* Update next field of node1 */
+
+ if (node1_addr.page == node2_addr.page) {
+
+ node1 = page_align(node2) + node1_addr.boffset;
+ } else {
+ node1 = fut_get_ptr(space,
+ fil_space_get_zip_size(space),
+ node1_addr, RW_X_LATCH, mtr);
+ }
+
+ flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr);
+ } else {
+ /* node2 was first in list: update the field in base */
+ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+ }
+
+ flst_write_addr(base + FLST_LAST, node1_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ ut_ad(len >= n_nodes);
+
+ mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node not to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fil_addr_t node2_addr;
+ ulint len;
+ ulint space;
+
+ ut_ad(mtr && node2 && base);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+ if (n_nodes == 0) {
+
+ ut_ad(fil_addr_is_null(flst_get_next_addr(node2, mtr)));
+
+ return;
+ }
+
+ buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+ /* Update next field of node2 */
+ flst_write_addr(node2 + FLST_NEXT, fil_addr_null, mtr);
+
+ flst_write_addr(base + FLST_LAST, node2_addr, mtr);
+
+ /* Update len of base node */
+ len = flst_get_len(base, mtr);
+ ut_ad(len >= n_nodes);
+
+ mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Validates a file-based list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr1) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ const flst_node_t* node;
+ fil_addr_t node_addr;
+ fil_addr_t base_addr;
+ ulint len;
+ ulint i;
+ mtr_t mtr2;
+
+ ut_ad(base);
+ ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX));
+
+ /* We use two mini-transaction handles: the first is used to
+ lock the base node, and prevent other threads from modifying the
+ list. The second is used to traverse the list. We cannot run the
+ second mtr without committing it at times, because if the list
+	is long, then the x-locked pages could fill the buffer pool,
+	resulting in a deadlock. */
+
+ /* Find out the space id */
+ buf_ptr_get_fsp_addr(base, &space, &base_addr);
+ zip_size = fil_space_get_zip_size(space);
+
+ len = flst_get_len(base, mtr1);
+ node_addr = flst_get_first(base, mtr1);
+
+ for (i = 0; i < len; i++) {
+ mtr_start(&mtr2);
+
+ node = fut_get_ptr(space, zip_size,
+ node_addr, RW_X_LATCH, &mtr2);
+ node_addr = flst_get_next_addr(node, &mtr2);
+
+ mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer
+ becoming full */
+ }
+
+ ut_a(fil_addr_is_null(node_addr));
+
+ node_addr = flst_get_last(base, mtr1);
+
+ for (i = 0; i < len; i++) {
+ mtr_start(&mtr2);
+
+ node = fut_get_ptr(space, zip_size,
+ node_addr, RW_X_LATCH, &mtr2);
+ node_addr = flst_get_prev_addr(node, &mtr2);
+
+ mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer
+ becoming full */
+ }
+
+ ut_a(fil_addr_is_null(node_addr));
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Prints info of a file-based list. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const buf_frame_t* frame;
+ ulint len;
+
+ ut_ad(base && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+ frame = page_align((byte*) base);
+
+ len = flst_get_len(base, mtr);
+
+ fprintf(stderr,
+ "FILE-BASED LIST:\n"
+ "Base node in space %lu page %lu byte offset %lu; len %lu\n",
+ (ulong) page_get_space_id(frame),
+ (ulong) page_get_page_no(frame),
+ (ulong) page_offset(base), (ulong) len);
+}
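+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+/* A minimal usage sketch of the file-based list routines above, added
+for illustration only. It assumes the caller supplies a list base node
+and a node that are both X-latched in the given mini-transaction, as
+required by the assertions in flst_add_last(); "base", "node" and "mtr"
+are placeholders for such caller-provided objects. */
+
+void
+example_flst_append(
+/*================*/
+	flst_base_node_t*	base,	/*!< in: base node, X-latched in mtr */
+	flst_node_t*		node,	/*!< in: node to add, X-latched in mtr */
+	mtr_t*			mtr)	/*!< in: mini-transaction holding the
+					X-latches on both pages */
+{
+	ulint	len;
+
+	len = flst_get_len(base, mtr);
+
+	/* append the node; the base node's FIRST/LAST/LEN fields and the
+	node's PREV/NEXT fields are updated within the same mtr */
+	flst_add_last(base, node, mtr);
+
+	ut_a(flst_get_len(base, mtr) == len + 1);
+	ut_a(flst_validate(base, mtr));
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */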
diff --git a/storage/xtradb/ha/ha0ha.c b/storage/xtradb/ha/ha0ha.c
new file mode 100644
index 00000000000..7f11917de0a
--- /dev/null
+++ b/storage/xtradb/ha/ha0ha.c
@@ -0,0 +1,464 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ha/ha0ha.c
+The hash table with external chains
+
+Created 8/22/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ha0ha.h"
+#ifdef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#ifdef UNIV_DEBUG
+# include "buf0buf.h"
+#endif /* UNIV_DEBUG */
+#include "btr0sea.h"
+#include "page0page.h"
+
+/*************************************************************//**
+Creates a hash table with at least n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+ ulint n, /*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+ ulint mutex_level, /*!< in: level of the mutexes in the latching
+ order: this is used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes) /*!< in: number of mutexes to protect the
+ hash table: must be a power of 2, or 0 */
+{
+ hash_table_t* table;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_ad(ut_is_2pow(n_mutexes));
+ table = hash_create(n);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ table->adaptive = TRUE;
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ /* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
+	but in practice it never should in this case, hence the asserts. */
+
+ if (n_mutexes == 0) {
+ table->heap = mem_heap_create_in_btr_search(
+ ut_min(4096, MEM_MAX_ALLOC_IN_BUF));
+ ut_a(table->heap);
+
+ return(table);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ hash_create_mutexes(table, n_mutexes, mutex_level);
+
+ table->heaps = mem_alloc(n_mutexes * sizeof(void*));
+
+ for (i = 0; i < n_mutexes; i++) {
+ table->heaps[i] = mem_heap_create_in_btr_search(4096);
+ ut_a(table->heaps[i]);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(table);
+}
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+ hash_table_t* table) /*!< in, own: hash table */
+{
+ ulint i;
+ ulint n;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+ /* Free the memory heaps. */
+ n = table->n_mutexes;
+
+ for (i = 0; i < n; i++) {
+ mem_heap_free(table->heaps[i]);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Clear the hash table. */
+ n = hash_get_n_cells(table);
+
+ for (i = 0; i < n; i++) {
+ hash_get_nth_cell(table, i)->node = NULL;
+ }
+}
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted. If btr_search_enabled is set to FALSE, only updates of
+existing nodes are allowed; no new nodes may be added.
+@return TRUE if successful, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data) /*!< in: data, must not be NULL */
+{
+ hash_cell_t* cell;
+ ha_node_t* node;
+ ha_node_t* prev_node;
+ ulint hash;
+
+ ut_ad(data);
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(block->frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ hash = hash_calc_hash(fold, table);
+
+ cell = hash_get_nth_cell(table, hash);
+
+ prev_node = cell->node;
+
+ while (prev_node != NULL) {
+ if (prev_node->fold == fold) {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ if (table->adaptive) {
+ buf_block_t* prev_block = prev_node->block;
+ ut_a(prev_block->frame
+ == page_align(prev_node->data));
+ ut_a(prev_block->n_pointers > 0);
+ prev_block->n_pointers--;
+ block->n_pointers++;
+ }
+ ut_ad(!btr_search_fully_disabled);
+# endif /* !UNIV_HOTBACKUP */
+
+ prev_node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ prev_node->data = data;
+
+ return(TRUE);
+ }
+
+ prev_node = prev_node->next;
+ }
+
+	/* We are in the process of disabling the hash index; do not
+	add a new chain node. */
+ if (!btr_search_enabled) {
+ ut_ad(!btr_search_fully_disabled);
+ return(TRUE);
+ }
+
+ /* We have to allocate a new chain node */
+
+ node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t));
+
+ if (node == NULL) {
+ /* It was a btr search type memory heap and at the moment
+ no more memory could be allocated: return */
+
+ ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH);
+
+ return(FALSE);
+ }
+
+ ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ if (table->adaptive) {
+ block->n_pointers++;
+ }
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ node->fold = fold;
+
+ node->next = NULL;
+
+ prev_node = cell->node;
+
+ if (prev_node == NULL) {
+
+ cell->node = node;
+
+ return(TRUE);
+ }
+
+ while (prev_node->next != NULL) {
+
+ prev_node = prev_node->next;
+ }
+
+ prev_node->next = node;
+
+ return(TRUE);
+}
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ha_node_t* del_node) /*!< in: node to be deleted */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ if (table->adaptive) {
+		ut_a(del_node->block->frame == page_align(del_node->data));
+ ut_a(del_node->block->n_pointers > 0);
+ del_node->block->n_pointers--;
+ }
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data, and updates
+the pointer to data, if found. */
+UNIV_INTERN
+void
+ha_search_and_update_if_found_func(
+/*===============================*/
+ hash_table_t* table, /*!< in/out: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data, /*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* new_data)/*!< in: new pointer to the data */
+{
+ ha_node_t* node;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ut_a(new_block->frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ if (table->adaptive) {
+ ut_a(node->block->n_pointers > 0);
+ node->block->n_pointers--;
+ new_block->n_pointers++;
+ }
+# endif /* !UNIV_HOTBACKUP */
+
+ node->block = new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = new_data;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: fold value */
+ const page_t* page) /*!< in: buffer page */
+{
+ ha_node_t* node;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (page_align(ha_node_get_data(node)) == page) {
+
+ /* Remove the hash node */
+
+ ha_delete_hash_node(table, node);
+
+ /* Start again from the first node in the chain
+ because the deletion may compact the heap of
+ nodes and move other nodes! */
+
+ node = ha_chain_get_first(table, fold);
+ } else {
+ node = ha_chain_get_next(node);
+ }
+ }
+#ifdef UNIV_DEBUG
+ /* Check that all nodes really got deleted */
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ ut_a(page_align(ha_node_get_data(node)) != page);
+
+ node = ha_chain_get_next(node);
+ }
+#endif
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/*************************************************************//**
+Validates a given range of the cells in hash table.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint start_index, /*!< in: start index */
+ ulint end_index) /*!< in: end index */
+{
+ hash_cell_t* cell;
+ ha_node_t* node;
+ ibool ok = TRUE;
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_a(start_index <= end_index);
+ ut_a(start_index < hash_get_n_cells(table));
+ ut_a(end_index < hash_get_n_cells(table));
+
+ for (i = start_index; i <= end_index; i++) {
+
+ cell = hash_get_nth_cell(table, i);
+
+ node = cell->node;
+
+ while (node) {
+ if (hash_calc_hash(node->fold, table) != i) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ "InnoDB: Error: hash table node"
+ " fold value %lu does not\n"
+ "InnoDB: match the cell number %lu.\n",
+ (ulong) node->fold, (ulong) i);
+
+ ok = FALSE;
+ }
+
+ node = node->next;
+ }
+ }
+
+ return(ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+/*************************************************************//**
+Prints info of a hash table. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ hash_table_t* table) /*!< in: hash table */
+{
+#ifdef UNIV_DEBUG
+/* Some of the code here is disabled for performance reasons in production
+builds, see http://bugs.mysql.com/36941 */
+#define PRINT_USED_CELLS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_USED_CELLS
+ hash_cell_t* cell;
+ ulint cells = 0;
+ ulint i;
+#endif /* PRINT_USED_CELLS */
+ ulint n_bufs;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef PRINT_USED_CELLS
+ for (i = 0; i < hash_get_n_cells(table); i++) {
+
+ cell = hash_get_nth_cell(table, i);
+
+ if (cell->node) {
+
+ cells++;
+ }
+ }
+#endif /* PRINT_USED_CELLS */
+
+ fprintf(file, "Hash table size %lu",
+ (ulong) hash_get_n_cells(table));
+
+#ifdef PRINT_USED_CELLS
+ fprintf(file, ", used cells %lu", (ulong) cells);
+#endif /* PRINT_USED_CELLS */
+
+ if (table->heaps == NULL && table->heap != NULL) {
+
+ /* This calculation is intended for the adaptive hash
+	index: it tells how many buffer frames we have reserved. */
+
+ n_bufs = UT_LIST_GET_LEN(table->heap->base) - 1;
+
+ if (table->heap->free_block) {
+ n_bufs++;
+ }
+
+ fprintf(file, ", node heap has %lu buffer(s)\n",
+ (ulong) n_bufs);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ha/ha0storage.c b/storage/xtradb/ha/ha0storage.c
new file mode 100644
index 00000000000..698e34f1166
--- /dev/null
+++ b/storage/xtradb/ha/ha0storage.c
@@ -0,0 +1,184 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/ha0storage.c
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+/*******************************************************************//**
+Retrieves data from the storage. If it is present, a pointer to the
+stored copy of the data is returned; otherwise NULL is returned. */
+static
+const void*
+ha_storage_get(
+/*===========*/
+ ha_storage_t* storage, /*!< in: hash storage */
+ const void* data, /*!< in: data to check for */
+ ulint data_len) /*!< in: data length */
+{
+ ha_storage_node_t* node;
+ ulint fold;
+
+ /* avoid repetitive calls to ut_fold_binary() in the HASH_SEARCH
+ macro */
+ fold = ut_fold_binary(data, data_len);
+
+#define IS_FOUND \
+ node->data_len == data_len && memcmp(node->data, data, data_len) == 0
+
+ HASH_SEARCH(
+ next, /* node->"next" */
+ storage->hash, /* the hash table */
+ fold, /* key */
+ ha_storage_node_t*, /* type of node->next */
+ node, /* auxiliary variable */
+ , /* assertion */
+ IS_FOUND); /* search criteria */
+
+ if (node == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(node->data);
+}
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then a pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of the storage would
+grow beyond "memlim", then "data" is not added and NULL is returned.
+To disable this limit, "memlim" can be set to 0, which stands for
+"no limit". */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim) /*!< in: memory limit to obey */
+{
+ void* raw;
+ ha_storage_node_t* node;
+ const void* data_copy;
+ ulint fold;
+
+ /* check if data chunk is already present */
+ data_copy = ha_storage_get(storage, data, data_len);
+ if (data_copy != NULL) {
+
+ return(data_copy);
+ }
+
+ /* not present */
+
+ /* check if we are allowed to allocate data_len bytes */
+ if (memlim > 0
+ && ha_storage_get_size(storage) + data_len > memlim) {
+
+ return(NULL);
+ }
+
+ /* we put the auxiliary node struct and the data itself in one
+	contiguous block */
+ raw = mem_heap_alloc(storage->heap,
+ sizeof(ha_storage_node_t) + data_len);
+
+ node = (ha_storage_node_t*) raw;
+ data_copy = (byte*) raw + sizeof(*node);
+
+ memcpy((byte*) raw + sizeof(*node), data, data_len);
+
+ node->data_len = data_len;
+ node->data = data_copy;
+
+ /* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
+ macro */
+ fold = ut_fold_binary(data, data_len);
+
+ HASH_INSERT(
+ ha_storage_node_t, /* type used in the hash chain */
+ next, /* node->"next" */
+ storage->hash, /* the hash table */
+ fold, /* key */
+ node); /* add this data to the hash */
+
+	/* the returned copy must not be modified by the caller, because
+	that would corrupt the hash table */
+ return(data_copy);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ha_storage()
+{
+ ha_storage_t* storage;
+ char buf[1024];
+ int i;
+ const void* stored[256];
+ const void* p;
+
+ storage = ha_storage_create(0, 0);
+
+ for (i = 0; i < 256; i++) {
+
+ memset(buf, i, sizeof(buf));
+ stored[i] = ha_storage_put(storage, buf, sizeof(buf));
+ }
+
+ //ha_storage_empty(&storage);
+
+ for (i = 255; i >= 0; i--) {
+
+ memset(buf, i, sizeof(buf));
+ p = ha_storage_put(storage, buf, sizeof(buf));
+
+ if (p != stored[i]) {
+
+ fprintf(stderr, "ha_storage_put() returned %p "
+ "instead of %p, i=%d\n", p, stored[i], i);
+ return;
+ }
+ }
+
+ fprintf(stderr, "all ok\n");
+
+ ha_storage_free(storage);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
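+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+/* A minimal sketch, for illustration only, of the "memlim" behaviour of
+ha_storage_put_memlim() above: when copying the data would push the
+storage past the limit, nothing is stored and NULL is returned, while a
+limit of 0 disables the check. The buffer contents and the 64-byte limit
+are arbitrary example values. */
+
+void
+test_ha_storage_memlim()
+{
+	ha_storage_t*	storage;
+	char		buf[128];
+	const void*	p;
+
+	storage = ha_storage_create(0, 0);
+
+	memset(buf, 0xAA, sizeof(buf));
+
+	/* adding 128 bytes would exceed a 64-byte limit: rejected */
+	p = ha_storage_put_memlim(storage, buf, sizeof(buf), 64);
+	ut_a(p == NULL);
+
+	/* with the limit disabled (0), the data is copied and stored */
+	p = ha_storage_put_memlim(storage, buf, sizeof(buf), 0);
+	ut_a(p != NULL);
+
+	/* an equal chunk is not duplicated: the same copy is returned */
+	ut_a(ha_storage_put_memlim(storage, buf, sizeof(buf), 0) == p);
+
+	ha_storage_free(storage);
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */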
diff --git a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c
new file mode 100644
index 00000000000..0f4fc55d895
--- /dev/null
+++ b/storage/xtradb/ha/hash0hash.c
@@ -0,0 +1,242 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/hash0hash.c
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "hash0hash.h"
+#ifdef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#include "mem0mem.h"
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ mutex_enter(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ mutex_exit(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Reserves all the mutexes of a hash table, in ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+ hash_table_t* table) /*!< in: hash table */
+{
+ ulint i;
+
+ for (i = 0; i < table->n_mutexes; i++) {
+
+ mutex_enter(table->mutexes + i);
+ }
+}
+
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+ hash_table_t* table) /*!< in: hash table */
+{
+ ulint i;
+
+ for (i = 0; i < table->n_mutexes; i++) {
+
+ mutex_exit(table->mutexes + i);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number of cells is
+chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+ ulint n) /*!< in: number of array cells */
+{
+ hash_cell_t* array;
+ ulint prime;
+ hash_table_t* table;
+
+ prime = ut_find_prime(n);
+
+ table = mem_alloc(sizeof(hash_table_t));
+
+ array = ut_malloc(sizeof(hash_cell_t) * prime);
+
+ table->array = array;
+ table->n_cells = prime;
+#ifndef UNIV_HOTBACKUP
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ table->n_mutexes = 0;
+ table->mutexes = NULL;
+ table->heaps = NULL;
+#endif /* !UNIV_HOTBACKUP */
+ table->heap = NULL;
+ ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
+
+ /* Initialize the cell array */
+ hash_table_clear(table);
+
+ return(table);
+}
+
+/*************************************************************//**
+Calculates the size in bytes needed to store a hash table with at least
+n array cells in one contiguous memory block: the table struct, padded
+to an 8-byte boundary, followed by the cell array.
+@return number of bytes needed */
+UNIV_INTERN
+ulint
+hash_create_needed(
+/*===============*/
+ ulint n)
+{
+ ulint prime;
+ ulint offset;
+
+ prime = ut_find_prime(n);
+
+ offset = (sizeof(hash_table_t) + 7) / 8;
+ offset *= 8;
+
+ return(offset + sizeof(hash_cell_t) * prime);
+}
+
+
+/*************************************************************//**
+Initializes a hash table with at least n array cells inside a memory
+block of at least hash_create_needed(n) bytes provided by the caller;
+the cell array is placed right after the table struct, at the next
+8-byte boundary. */
+UNIV_INTERN
+void
+hash_create_init(
+/*=============*/
+ hash_table_t* table,
+ ulint n)
+{
+ ulint prime;
+ ulint offset;
+
+ prime = ut_find_prime(n);
+
+ offset = (sizeof(hash_table_t) + 7) / 8;
+ offset *= 8;
+
+ table->array = (hash_cell_t*)(((byte*)table) + offset);
+ table->n_cells = prime;
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ table->n_mutexes = 0;
+ table->mutexes = NULL;
+ table->heaps = NULL;
+ table->heap = NULL;
+ ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
+
+ /* Initialize the cell array */
+ hash_table_clear(table);
+}
+
+/*************************************************************//**
+Re-attaches to a hash table that was earlier set up in a memory block
+with hash_create_init(): only the pointer to the cell array is
+recomputed from the block address; the cell contents are reused. */
+UNIV_INTERN
+void
+hash_create_reuse(
+/*==============*/
+ hash_table_t* table)
+{
+ ulint offset;
+
+ offset = (sizeof(hash_table_t) + 7) / 8;
+ offset *= 8;
+
+ table->array = (hash_cell_t*)(((byte*)table) + offset);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+}
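+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+/* A minimal sketch, for illustration only, of how hash_create_needed(),
+hash_create_init() and hash_create_reuse() above fit together when the
+table lives in one caller-provided memory block (for example a shared
+memory segment); the plain ut_malloc() below merely stands in for
+whatever allocator supplies that block. */
+
+void
+test_hash_create_in_block()
+{
+	ulint		n_cells	= 1000;
+	ulint		size;
+	void*		block;
+	hash_table_t*	table;
+
+	/* bytes needed for the table struct plus the cell array */
+	size = hash_create_needed(n_cells);
+
+	block = ut_malloc(size);
+
+	/* lay out and initialize the table inside the block */
+	table = (hash_table_t*) block;
+	hash_create_init(table, n_cells);
+
+	ut_a(hash_get_n_cells(table) >= n_cells);
+
+	/* a later attach to the same block only recomputes the pointer
+	to the cell array; the cells themselves are kept */
+	hash_create_reuse(table);
+
+	/* the block was allocated as one piece, so it is freed as one
+	piece instead of calling hash_table_free() */
+	ut_free(block);
+}
+#endif /* UNIV_COMPILE_TEST_FUNCS */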
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+ hash_table_t* table) /*!< in, own: hash table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifndef UNIV_HOTBACKUP
+ ut_a(table->mutexes == NULL);
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_free(table->array);
+ mem_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a mutex array to protect a hash table. */
+UNIV_INTERN
+void
+hash_create_mutexes_func(
+/*=====================*/
+ hash_table_t* table, /*!< in: hash table */
+#ifdef UNIV_SYNC_DEBUG
+ ulint sync_level, /*!< in: latching order level of the
+ mutexes: used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes) /*!< in: number of mutexes, must be a
+ power of 2 */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_a(n_mutexes > 0);
+ ut_a(ut_is_2pow(n_mutexes));
+
+ table->mutexes = mem_alloc(n_mutexes * sizeof(mutex_t));
+
+ for (i = 0; i < n_mutexes; i++) {
+ mutex_create(table->mutexes + i, sync_level);
+ }
+
+ table->n_mutexes = n_mutexes;
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ha_innodb.def b/storage/xtradb/ha_innodb.def
new file mode 100644
index 00000000000..e0faa62deb1
--- /dev/null
+++ b/storage/xtradb/ha_innodb.def
@@ -0,0 +1,4 @@
+EXPORTS
+ _mysql_plugin_interface_version_
+ _mysql_sizeof_struct_st_plugin_
+ _mysql_plugin_declarations_
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
new file mode 100644
index 00000000000..f511918e845
--- /dev/null
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -0,0 +1,12109 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* TODO list for the InnoDB handler in 5.0:
+ - Remove the flag trx->active_trans and look at trx->conc_state
+ - fix savepoint functions to use savepoint storage area
+ - Find out what kind of problems the OS X case-insensitivity causes to
+ table and database names; should we 'normalize' the names like we do
+ in Windows?
+*/
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#ifndef MYSQL_SERVER
+#define MYSQL_SERVER
+#endif
+
+#include <mysql_priv.h>
+#ifdef MYSQL_SERVER
+#include <log_event.h>
+#endif /* MYSQL_SERVER */
+
+#include <m_ctype.h>
+#include <mysys_err.h>
+#include <mysql/plugin.h>
+
+/** @file ha_innodb.cc */
+
+/* Include necessary InnoDB headers */
+extern "C" {
+#include "univ.i"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "trx0roll.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "mtr0mtr.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "log0log.h"
+#include "lock0lock.h"
+#include "dict0crea.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "fsp0fsp.h"
+#include "sync0sync.h"
+#include "fil0fil.h"
+#include "trx0xa.h"
+#include "row0merge.h"
+#include "thr0loc.h"
+#include "dict0boot.h"
+#include "ha_prototypes.h"
+#include "ut0mem.h"
+#include "ibuf0ibuf.h"
+}
+
+#include "ha_innodb.h"
+#include "i_s.h"
+
+#ifdef MYSQL_SERVER
+// Defined in trx0sys.c
+extern char trx_sys_mysql_master_log_name[];
+extern ib_int64_t trx_sys_mysql_master_log_pos;
+extern char trx_sys_mysql_relay_log_name[];
+extern ib_int64_t trx_sys_mysql_relay_log_pos;
+#endif /* MYSQL_SERVER */
+
+#ifndef MYSQL_SERVER
+# ifndef MYSQL_PLUGIN_IMPORT
+# define MYSQL_PLUGIN_IMPORT /* nothing */
+# endif /* MYSQL_PLUGIN_IMPORT */
+
+#if MYSQL_VERSION_ID < 50124
+/* this is defined in mysql_priv.h inside #ifdef MYSQL_SERVER
+but we need it here */
+bool check_global_access(THD *thd, ulong want_access);
+#endif /* MYSQL_VERSION_ID < 50124 */
+#endif /* MYSQL_SERVER */
+
+/** to protect innobase_open_files */
+static pthread_mutex_t innobase_share_mutex;
+/** to force correct commit order in binlog */
+static pthread_mutex_t prepare_commit_mutex;
+static ulong commit_threads = 0;
+static pthread_mutex_t commit_threads_m;
+static pthread_cond_t commit_cond;
+static pthread_mutex_t commit_cond_m;
+static bool innodb_inited = 0;
+
+C_MODE_START
+static int index_cond_func_innodb(void *arg);
+C_MODE_END
+
+
+
+#define INSIDE_HA_INNOBASE_CC
+
+/* In the Windows plugin, the return value of current_thd is
+undefined. Map it to NULL. */
+
+#define EQ_CURRENT_THD(thd) ((thd) == current_thd)
+
+
+static struct handlerton* innodb_hton_ptr;
+
+static const long AUTOINC_OLD_STYLE_LOCKING = 0;
+static const long AUTOINC_NEW_STYLE_LOCKING = 1;
+static const long AUTOINC_NO_LOCKING = 2;
+
+static long innobase_mirrored_log_groups, innobase_log_files_in_group,
+ innobase_log_buffer_size,
+ innobase_additional_mem_pool_size, innobase_file_io_threads,
+ innobase_force_recovery, innobase_open_files,
+ innobase_autoinc_lock_mode;
+static ulong innobase_commit_concurrency = 0;
+static ulong innobase_read_io_threads;
+static ulong innobase_write_io_threads;
+
+static ulong innobase_page_size;
+
+static my_bool innobase_thread_concurrency_timer_based;
+static long long innobase_buffer_pool_size, innobase_log_file_size;
+
+/** Percentage of the buffer pool to reserve for 'old' blocks.
+Connected to buf_LRU_old_ratio. */
+static uint innobase_old_blocks_pct;
+
+/* The default values for the following char* start-up parameters
+are determined in innobase_init below: */
+
+static char* innobase_data_home_dir = NULL;
+static char* innobase_data_file_path = NULL;
+static char* innobase_log_group_home_dir = NULL;
+static char* innobase_file_format_name = NULL;
+static char* innobase_change_buffering = NULL;
+static char* innobase_doublewrite_file = NULL;
+
+/* Note: In the configuration file this variable can be set to on/off
+or to any of the supported file formats, but at runtime it can only be
+set to one of the supported file formats. */
+static char* innobase_file_format_check = NULL;
+
+static char* innobase_file_flush_method = NULL;
+
+/* Below we have boolean-valued start-up parameters, and their default
+values */
+
+static ulong innobase_fast_shutdown = 1;
+#ifdef UNIV_LOG_ARCHIVE
+static my_bool innobase_log_archive = FALSE;
+static char* innobase_log_arch_dir = NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+static my_bool innobase_use_doublewrite = TRUE;
+static my_bool innobase_use_checksums = TRUE;
+static my_bool innobase_fast_checksum = FALSE;
+static my_bool innobase_extra_undoslots = FALSE;
+static my_bool innobase_fast_recovery = FALSE;
+static my_bool innobase_recovery_stats = TRUE;
+static my_bool innobase_locks_unsafe_for_binlog = FALSE;
+static my_bool innobase_overwrite_relay_log_info = FALSE;
+static my_bool innobase_rollback_on_timeout = FALSE;
+static my_bool innobase_create_status_file = FALSE;
+static my_bool innobase_stats_on_metadata = TRUE;
+static my_bool innobase_use_sys_stats_table = FALSE;
+static my_bool innobase_buffer_pool_shm_checksum = TRUE;
+
+static char* internal_innobase_data_file_path = NULL;
+
+static char* innodb_version_str = (char*) INNODB_VERSION_STR;
+
+/* The following counter is used to convey information to InnoDB
+about server activity: in selects it is not sensible to call
+srv_active_wake_master_thread after each fetch or search; we only do
+it every INNOBASE_WAKE_INTERVAL'th step. */
+
+#define INNOBASE_WAKE_INTERVAL 32
+static ulong innobase_active_counter = 0;
+
+static hash_table_t* innobase_open_tables;
+
+#ifdef __NETWARE__ /* some special cleanup for NetWare */
+bool nw_panic = FALSE;
+#endif
+
+/** Allowed values of innodb_change_buffering */
+static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
+ "none", /* IBUF_USE_NONE */
+ "inserts" /* IBUF_USE_INSERT */
+};
+
+static INNOBASE_SHARE *get_share(const char *table_name);
+static void free_share(INNOBASE_SHARE *share);
+static int innobase_close_connection(handlerton *hton, THD* thd);
+static int innobase_commit(handlerton *hton, THD* thd, bool all);
+static int innobase_rollback(handlerton *hton, THD* thd, bool all);
+static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
+ void *savepoint);
+static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint);
+static int innobase_release_savepoint(handlerton *hton, THD* thd,
+ void *savepoint);
+static handler *innobase_create_handler(handlerton *hton,
+ TABLE_SHARE *table,
+ MEM_ROOT *mem_root);
+/* "GEN_CLUST_INDEX" is the name reserved for Innodb default
+system primary index. */
+static const char innobase_index_reserve_name[]= "GEN_CLUST_INDEX";
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default(void);
+/*==========================================*/
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+ const char* format_name); /*!< in: pointer to file format
+ name */
+/************************************************************//**
+Validate the file format check config parameters; as a side effect it
+sets the srv_check_file_format_at_startup variable.
+@return true if one of "on" or "off" */
+static
+bool
+innobase_file_format_check_on_off(
+/*==============================*/
+ const char* format_check); /*!< in: parameter value */
+/************************************************************//**
+Validate the file format check config parameters; as a side effect it
+sets the srv_check_file_format_at_startup variable.
+@return the format_id if valid config value, otherwise, return -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*================================*/
+ const char* format_check); /*!< in: parameter value */
+/****************************************************************//**
+Return alter table flags supported in an InnoDB database. */
+static
+uint
+innobase_alter_table_flags(
+/*=======================*/
+ uint flags);
+
+static const char innobase_hton_name[]= "InnoDB";
+
+/*************************************************************//**
+Check for a valid value of innobase_commit_concurrency.
+@return 0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ long long intbuf;
+ ulong commit_concurrency;
+
+ DBUG_ENTER("innobase_commit_concurrency_validate");
+
+ if (value->val_int(value, &intbuf)) {
+ /* The value is NULL. That is invalid. */
+ DBUG_RETURN(1);
+ }
+
+ *reinterpret_cast<ulong*>(save) = commit_concurrency
+ = static_cast<ulong>(intbuf);
+
+ /* Allow the value to be updated, as long as it remains zero
+ or nonzero. */
+ DBUG_RETURN(!(!commit_concurrency == !innobase_commit_concurrency));
+}
+
+static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB support for the XA two-phase commit",
+ /* check_func */ NULL, /* update_func */ NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB locking in LOCK TABLES",
+ /* check_func */ NULL, /* update_func */ NULL,
+ /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
+ "Use strict mode when evaluating create options.",
+ NULL, NULL, FALSE);
+
+static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
+ "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
+ NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
+
+static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit_session, PLUGIN_VAR_RQCMDARG,
+  "Control innodb_flush_log_at_trx_commit on a per-session basis. "
+  "The values 0-2 have the same meaning as innodb_flush_log_at_trx_commit. "
+  "The value 3 (the default) means that the global innodb_flush_log_at_trx_commit setting is used.",
+ NULL, NULL, 3, 0, 3, 0);
+
+
+static handler *innobase_create_handler(handlerton *hton,
+ TABLE_SHARE *table,
+ MEM_ROOT *mem_root)
+{
+ return new (mem_root) ha_innobase(hton, table);
+}
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool all); /*!< in: TRUE - commit transaction
+ FALSE - the current SQL statement
+ ended */
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list,/*!< in/out: prepared transactions */
+ uint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+ handlerton* hton,
+ XID* xid); /*!< in: X/Open XA transaction identification */
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state
+@return 0 or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid); /*!< in: X/Open XA transaction
+ identification */
+/*******************************************************************//**
+Create a consistent view for a cursor based on current transaction
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+ handlerton* hton, /*!< in: innobase hton */
+ THD* thd); /*!< in: user thread handle */
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL global read view of a transaction is
+restored to a transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+ handlerton* hton,
+ THD* thd, /*!< in: user thread handle */
+ void* curview);/*!< in: Consistent cursor view to be set */
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore
+global read view to a transaction read view. Transaction is created if the
+corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+ handlerton* hton,
+ THD* thd, /*!< in: user thread handle */
+ void* curview);/*!< in: Consistent read view to be closed */
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+ handlerton* hton, /*!< in: handlerton of Innodb */
+ char* path); /*!< in: database path; inside InnoDB the name
+ of the last directory in the path is used as
+ the database name: for example, in 'mysql/data/test'
+ the database name is 'test' */
+/*******************************************************************//**
+Closes an InnoDB database. */
+static
+int
+innobase_end(handlerton *hton, ha_panic_function type);
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ /* out: 0 */
+ handlerton* hton, /* in: Innodb handlerton */
+ THD* thd); /* in: MySQL thread handle of the user for whom
+ the transaction should be committed */
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton); /*!< in: InnoDB handlerton */
+
+/************************************************************************//**
+Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
+Monitor to the client. */
+static
+bool
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of the caller */
+ stat_print_fn *stat_print);
+static
+bool innobase_show_status(handlerton *hton, THD* thd,
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx); /*!< in: transaction handle */
+
+static SHOW_VAR innodb_status_variables[]= {
+ {"buffer_pool_pages_data",
+ (char*) &export_vars.innodb_buffer_pool_pages_data, SHOW_LONG},
+ {"buffer_pool_pages_dirty",
+ (char*) &export_vars.innodb_buffer_pool_pages_dirty, SHOW_LONG},
+ {"buffer_pool_pages_flushed",
+ (char*) &export_vars.innodb_buffer_pool_pages_flushed, SHOW_LONG},
+ {"buffer_pool_pages_free",
+ (char*) &export_vars.innodb_buffer_pool_pages_free, SHOW_LONG},
+#ifdef UNIV_DEBUG
+ {"buffer_pool_pages_latched",
+ (char*) &export_vars.innodb_buffer_pool_pages_latched, SHOW_LONG},
+#endif /* UNIV_DEBUG */
+ {"buffer_pool_pages_misc",
+ (char*) &export_vars.innodb_buffer_pool_pages_misc, SHOW_LONG},
+ {"buffer_pool_pages_total",
+ (char*) &export_vars.innodb_buffer_pool_pages_total, SHOW_LONG},
+ {"buffer_pool_read_ahead",
+ (char*) &export_vars.innodb_buffer_pool_read_ahead, SHOW_LONG},
+ {"buffer_pool_read_ahead_evicted",
+ (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG},
+ {"buffer_pool_read_requests",
+ (char*) &export_vars.innodb_buffer_pool_read_requests, SHOW_LONG},
+ {"buffer_pool_reads",
+ (char*) &export_vars.innodb_buffer_pool_reads, SHOW_LONG},
+ {"buffer_pool_wait_free",
+ (char*) &export_vars.innodb_buffer_pool_wait_free, SHOW_LONG},
+ {"buffer_pool_write_requests",
+ (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG},
+ {"data_fsyncs",
+ (char*) &export_vars.innodb_data_fsyncs, SHOW_LONG},
+ {"data_pending_fsyncs",
+ (char*) &export_vars.innodb_data_pending_fsyncs, SHOW_LONG},
+ {"data_pending_reads",
+ (char*) &export_vars.innodb_data_pending_reads, SHOW_LONG},
+ {"data_pending_writes",
+ (char*) &export_vars.innodb_data_pending_writes, SHOW_LONG},
+ {"data_read",
+ (char*) &export_vars.innodb_data_read, SHOW_LONG},
+ {"data_reads",
+ (char*) &export_vars.innodb_data_reads, SHOW_LONG},
+ {"data_writes",
+ (char*) &export_vars.innodb_data_writes, SHOW_LONG},
+ {"data_written",
+ (char*) &export_vars.innodb_data_written, SHOW_LONG},
+ {"dblwr_pages_written",
+ (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG},
+ {"deadlocks",
+ (char*) &export_vars.innodb_deadlocks, SHOW_LONG},
+ {"dblwr_writes",
+ (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG},
+ {"dict_tables",
+ (char*) &export_vars.innodb_dict_tables, SHOW_LONG},
+ {"have_atomic_builtins",
+ (char*) &export_vars.innodb_have_atomic_builtins, SHOW_BOOL},
+ {"log_waits",
+ (char*) &export_vars.innodb_log_waits, SHOW_LONG},
+ {"log_write_requests",
+ (char*) &export_vars.innodb_log_write_requests, SHOW_LONG},
+ {"log_writes",
+ (char*) &export_vars.innodb_log_writes, SHOW_LONG},
+ {"os_log_fsyncs",
+ (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG},
+ {"os_log_pending_fsyncs",
+ (char*) &export_vars.innodb_os_log_pending_fsyncs, SHOW_LONG},
+ {"os_log_pending_writes",
+ (char*) &export_vars.innodb_os_log_pending_writes, SHOW_LONG},
+ {"os_log_written",
+ (char*) &export_vars.innodb_os_log_written, SHOW_LONG},
+ {"page_size",
+ (char*) &export_vars.innodb_page_size, SHOW_LONG},
+ {"pages_created",
+ (char*) &export_vars.innodb_pages_created, SHOW_LONG},
+ {"pages_read",
+ (char*) &export_vars.innodb_pages_read, SHOW_LONG},
+ {"pages_written",
+ (char*) &export_vars.innodb_pages_written, SHOW_LONG},
+ {"row_lock_current_waits",
+ (char*) &export_vars.innodb_row_lock_current_waits, SHOW_LONG},
+ {"row_lock_time",
+ (char*) &export_vars.innodb_row_lock_time, SHOW_LONGLONG},
+ {"row_lock_time_avg",
+ (char*) &export_vars.innodb_row_lock_time_avg, SHOW_LONG},
+ {"row_lock_time_max",
+ (char*) &export_vars.innodb_row_lock_time_max, SHOW_LONG},
+ {"row_lock_waits",
+ (char*) &export_vars.innodb_row_lock_waits, SHOW_LONG},
+ {"rows_deleted",
+ (char*) &export_vars.innodb_rows_deleted, SHOW_LONG},
+ {"rows_inserted",
+ (char*) &export_vars.innodb_rows_inserted, SHOW_LONG},
+ {"rows_read",
+ (char*) &export_vars.innodb_rows_read, SHOW_LONG},
+ {"rows_updated",
+ (char*) &export_vars.innodb_rows_updated, SHOW_LONG},
+ {NullS, NullS, SHOW_LONG}
+};
+
+/* General functions */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+extern "C" UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ void* thd) /*!< in: thread handle (THD*) */
+{
+ return((ibool) thd_slave_thread((THD*) thd));
+}
+
+/******************************************************************//**
+Save some CPU by testing the value of srv_thread_concurrency in inline
+functions. */
+static inline
+void
+innodb_srv_conc_enter_innodb(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (UNIV_LIKELY(!srv_thread_concurrency)) {
+
+ return;
+ }
+
+ srv_conc_enter_innodb(trx);
+}
+
+/******************************************************************//**
+Save some CPU by testing the value of srv_thread_concurrency in inline
+functions. */
+static inline
+void
+innodb_srv_conc_exit_innodb(
+/*========================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) {
+
+ return;
+ }
+
+ srv_conc_exit_innodb(trx);
+}
+
+/******************************************************************//**
+Releases possible search latch and InnoDB thread FIFO ticket. These should
+be released at the end of each SQL statement, and also when mysqld passes
+control to the client. It does no harm to release them in the middle
+of an SQL statement. */
+static inline
+void
+innobase_release_stat_resources(
+/*============================*/
+ trx_t* trx) /*!< in: transaction object */
+{
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ if (trx->declared_to_be_inside_innodb) {
+ /* Release our possible ticket in the FIFO */
+
+ srv_conc_force_exit_innodb(trx);
+ }
+}
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to roll back in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+extern "C" UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ void* thd) /*!< in: thread handle (THD*) */
+{
+ return((ibool) thd_non_transactional_update((THD*) thd));
+}
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+extern "C" UNIV_INTERN
+ibool
+thd_is_select(
+/*==========*/
+ const void* thd) /*!< in: thread handle (THD*) */
+{
+ return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT);
+}
+
+/******************************************************************//**
+Returns true if the thread supports XA, or the
+global value of innodb_supports_xa if thd is NULL.
+@return true if thd has XA support */
+extern "C" UNIV_INTERN
+ibool
+thd_supports_xa(
+/*============*/
+ void* thd) /*!< in: thread handle (THD*), or NULL to query
+ the global innodb_supports_xa */
+{
+ return(THDVAR((THD*) thd, support_xa));
+}
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+extern "C" UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ void* thd) /*!< in: thread handle (THD*), or NULL to query
+ the global innodb_lock_wait_timeout */
+{
+ /* According to <mysql/plugin.h>, passing thd == NULL
+ returns the global value of the session variable. */
+ return(THDVAR((THD*) thd, lock_wait_timeout));
+}
+
+/******************************************************************//**
+Returns the session value of the innodb_flush_log_at_trx_commit_session
+variable for the given connection.
+@return the session value of flush_log_at_trx_commit_session */
+extern "C" UNIV_INTERN
+ulong
+thd_flush_log_at_trx_commit_session(
+/*================================*/
+	void*	thd)	/*!< in: thread handle (THD*) */
+{
+ return(THDVAR((THD*) thd, flush_log_at_trx_commit_session));
+}
+
+/********************************************************************//**
+Obtain the InnoDB transaction of a MySQL thread.
+@return reference to transaction pointer */
+static inline
+trx_t*&
+thd_to_trx(
+/*=======*/
+ THD* thd) /*!< in: MySQL thread */
+{
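+	/* The InnoDB slot in the THD's ha_data array stores the trx pointer
+	for this connection. Returning a reference to the slot lets callers
+	such as check_trx_exists() assign a newly created trx to it. */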
+ return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
+}
+
+/********************************************************************//**
+Call this function when mysqld passes control to the client. That is to
+avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
+documentation, see handler.cc.
+@return 0 */
+static
+int
+innobase_release_temporary_latches(
+/*===============================*/
+ handlerton* hton, /*!< in: handlerton */
+ THD* thd) /*!< in: MySQL thread */
+{
+ trx_t* trx;
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (!innodb_inited) {
+
+ return(0);
+ }
+
+ trx = thd_to_trx(thd);
+
+ if (trx) {
+ innobase_release_stat_resources(trx);
+ }
+ return(0);
+}
+
+/********************************************************************//**
+Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
+time calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+static inline
+void
+innobase_active_small(void)
+/*=======================*/
+{
+ innobase_active_counter++;
+
+ if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+ srv_active_wake_master_thread();
+ }
+}
+
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return MySQL error code */
+extern "C" UNIV_INTERN
+int
+convert_error_code_to_mysql(
+/*========================*/
+ int error, /*!< in: InnoDB error code */
+ ulint flags, /*!< in: InnoDB table flags, or 0 */
+ THD* thd) /*!< in: user thread handle or NULL */
+{
+ switch (error) {
+ case DB_SUCCESS:
+ return(0);
+
+ case DB_INTERRUPTED:
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ /* fall through */
+
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ HA_ERR_ROW_IS_REFERENCED,
+ "InnoDB: Cannot delete/update "
+ "rows with cascading foreign key "
+ "constraints that exceed max "
+ "depth of %d. Please "
+ "drop extra constraints and try "
+ "again", DICT_FK_MAX_RECURSIVE_LOAD);
+
+ /* fall through */
+
+ case DB_ERROR:
+ default:
+ return(-1); /* unspecified error */
+
+ case DB_DUPLICATE_KEY:
+ /* Be cautious with returning this error, since
+ mysql could re-enter the storage layer to get
+ duplicated key info, the operation requires a
+ valid table handle and/or transaction information,
+ which might not always be available in the error
+ handling stage. */
+ return(HA_ERR_FOUND_DUPP_KEY);
+
+ case DB_FOREIGN_DUPLICATE_KEY:
+ return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+
+ case DB_MISSING_HISTORY:
+ return(HA_ERR_TABLE_DEF_CHANGED);
+
+ case DB_RECORD_NOT_FOUND:
+ return(HA_ERR_NO_ACTIVE_RECORD);
+
+ case DB_DEADLOCK:
+		/* Since we rolled back the whole transaction, we must
+		also tell MySQL, so that it knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_DEADLOCK);
+
+ case DB_LOCK_WAIT_TIMEOUT:
+ /* Starting from 5.0.13, we let MySQL just roll back the
+ latest SQL statement in a lock wait timeout. Previously, we
+ rolled back the whole transaction. */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(
+ thd, (bool)row_rollback_on_timeout);
+ }
+
+ return(HA_ERR_LOCK_WAIT_TIMEOUT);
+
+ case DB_NO_REFERENCED_ROW:
+ return(HA_ERR_NO_REFERENCED_ROW);
+
+ case DB_ROW_IS_REFERENCED:
+ return(HA_ERR_ROW_IS_REFERENCED);
+
+ case DB_CANNOT_ADD_CONSTRAINT:
+ return(HA_ERR_CANNOT_ADD_FOREIGN);
+
+ case DB_CANNOT_DROP_CONSTRAINT:
+
+ return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
+ misleading, a new MySQL error
+ code should be introduced */
+
+ case DB_COL_APPEARS_TWICE_IN_INDEX:
+ case DB_CORRUPTION:
+ return(HA_ERR_CRASHED);
+
+ case DB_OUT_OF_FILE_SPACE:
+ return(HA_ERR_RECORD_FILE_FULL);
+
+ case DB_TABLE_IS_BEING_USED:
+ return(HA_ERR_WRONG_COMMAND);
+
+ case DB_TABLE_NOT_FOUND:
+ return(HA_ERR_NO_SUCH_TABLE);
+
+ case DB_TOO_BIG_RECORD:
+ my_error(ER_TOO_BIG_ROWSIZE, MYF(0),
+ page_get_free_space_of_empty(flags
+ & DICT_TF_COMPACT) / 2);
+ return(HA_ERR_TO_BIG_ROW);
+
+ case DB_NO_SAVEPOINT:
+ return(HA_ERR_NO_SAVEPOINT);
+
+ case DB_LOCK_TABLE_FULL:
+		/* Since we rolled back the whole transaction, we must
+		also tell MySQL, so that it knows to empty the
+ cached binlog for this transaction */
+
+ if (thd) {
+ thd_mark_transaction_to_rollback(thd, TRUE);
+ }
+
+ return(HA_ERR_LOCK_TABLE_FULL);
+
+ case DB_PRIMARY_KEY_IS_NULL:
+ return(ER_PRIMARY_CANT_HAVE_NULL);
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ /* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only
+ available in 5.1.38 and later, but the plugin should still
+ work with previous versions of MySQL. */
+#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS
+ return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
+#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
+ return(HA_ERR_RECORD_FILE_FULL);
+#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
+ case DB_UNSUPPORTED:
+ return(HA_ERR_UNSUPPORTED);
+ }
+}
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+extern "C" UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ void* thd, /*!< in: pointer to a MySQL THD object */
+ uint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ char buffer[1024];
+
+ fputs(thd_security_context((THD*) thd, buffer, sizeof buffer,
+ max_query_len), f);
+ putc('\n', f);
+}
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+extern "C" UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen) /*!< out: maximum length of a char (in bytes) */
+{
+ CHARSET_INFO* cs;
+ ut_ad(cset < 256);
+ ut_ad(mbminlen);
+ ut_ad(mbmaxlen);
+
+ cs = all_charsets[cset];
+ if (cs) {
+ *mbminlen = cs->mbminlen;
+ *mbmaxlen = cs->mbmaxlen;
+ } else {
+ THD* thd = current_thd;
+
+ if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+
+ /* Fix bug#46256: allow tables to be dropped if the
+ collation is not found, but issue a warning. */
+ if ((global_system_variables.log_warnings)
+ && (cset != 0)){
+
+ sql_print_warning(
+ "Unknown collation #%lu.", cset);
+ }
+ } else {
+
+ ut_a(cset == 0);
+ }
+
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+extern "C" UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+extern "C" UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len) /*!< in: length of 'to', in bytes */
+{
+ uint errors;
+
+ strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+extern "C" UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b) /*!< in: second string to compare */
+{
+ return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+extern "C" UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a) /*!< in/out: string to put in lower case */
+{
+ my_casedn_str(system_charset_info, a);
+}
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+extern "C" UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+ void* mysql_thd) /*!< in: MySQL thread handle */
+{
+ return(thd_charset((THD*) mysql_thd));
+}
+
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string */
+extern "C" UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+ void* mysql_thd, /*!< in: MySQL thread handle */
+ size_t* length) /*!< out: length of the SQL statement */
+{
+#if MYSQL_VERSION_ID >= 50142
+ LEX_STRING* stmt;
+
+ stmt = thd_query_string((THD*) mysql_thd);
+ *length = stmt->length;
+ return(stmt->str);
+#else
+ const char* stmt_str = thd_query((THD*) mysql_thd);
+ *length = strlen(stmt_str);
+ return(stmt_str);
+#endif
+}
+
+#if defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN)
+extern MYSQL_PLUGIN_IMPORT MY_TMPDIR mysql_tmpdir_list;
+/*******************************************************************//**
+Map an OS error to an errno value. The OS error number is stored in
+_doserrno and the mapped value is stored in errno. */
+extern "C"
+void __cdecl
+_dosmaperr(
+ unsigned long); /*!< in: OS error value */
+
+/*********************************************************************//**
+Creates a temporary file.
+@return temporary file descriptor, or < 0 on error */
+extern "C" UNIV_INTERN
+int
+innobase_mysql_tmpfile(void)
+/*========================*/
+{
+ int fd; /* handle of opened file */
+ HANDLE osfh; /* OS handle of opened file */
+	char*	tmpdir;			/* points to the directory in
+					which to create the file */
+ TCHAR path_buf[MAX_PATH - 14]; /* buffer for tmp file path.
+ The length cannot be longer
+ than MAX_PATH - 14, or
+ GetTempFileName will fail. */
+ char filename[MAX_PATH]; /* name of the tmpfile */
+ DWORD fileaccess = GENERIC_READ /* OS file access */
+ | GENERIC_WRITE
+ | DELETE;
+ DWORD fileshare = FILE_SHARE_READ /* OS file sharing mode */
+ | FILE_SHARE_WRITE
+ | FILE_SHARE_DELETE;
+ DWORD filecreate = CREATE_ALWAYS; /* OS method of open/create */
+ DWORD fileattrib = /* OS file attribute flags */
+ FILE_ATTRIBUTE_NORMAL
+ | FILE_FLAG_DELETE_ON_CLOSE
+ | FILE_ATTRIBUTE_TEMPORARY
+ | FILE_FLAG_SEQUENTIAL_SCAN;
+
+ DBUG_ENTER("innobase_mysql_tmpfile");
+
+ tmpdir = my_tmpdir(&mysql_tmpdir_list);
+
+ /* The tmpdir parameter can not be NULL for GetTempFileName. */
+ if (!tmpdir) {
+ uint ret;
+
+ /* Use GetTempPath to determine path for temporary files. */
+ ret = GetTempPath(sizeof(path_buf), path_buf);
+ if (ret > sizeof(path_buf) || (ret == 0)) {
+
+ _dosmaperr(GetLastError()); /* map error */
+ DBUG_RETURN(-1);
+ }
+
+ tmpdir = path_buf;
+ }
+
+ /* Use GetTempFileName to generate a unique filename. */
+ if (!GetTempFileName(tmpdir, "ib", 0, filename)) {
+
+ _dosmaperr(GetLastError()); /* map error */
+ DBUG_RETURN(-1);
+ }
+
+ DBUG_PRINT("info", ("filename: %s", filename));
+
+ /* Open/Create the file. */
+ osfh = CreateFile(filename, fileaccess, fileshare, NULL,
+ filecreate, fileattrib, NULL);
+ if (osfh == INVALID_HANDLE_VALUE) {
+
+ /* open/create file failed! */
+ _dosmaperr(GetLastError()); /* map error */
+ DBUG_RETURN(-1);
+ }
+
+ do {
+ /* Associates a CRT file descriptor with the OS file handle. */
+ fd = _open_osfhandle((intptr_t) osfh, 0);
+ } while (fd == -1 && errno == EINTR);
+
+ if (fd == -1) {
+ /* Open failed, close the file handle. */
+
+ _dosmaperr(GetLastError()); /* map error */
+ CloseHandle(osfh); /* no need to check if
+ CloseHandle fails */
+ }
+
+ DBUG_RETURN(fd);
+}
+#else
+/*********************************************************************//**
+Creates a temporary file.
+@return temporary file descriptor, or < 0 on error */
+extern "C" UNIV_INTERN
+int
+innobase_mysql_tmpfile(void)
+/*========================*/
+{
+ int fd2 = -1;
+ File fd = mysql_tmpfile("ib");
+ if (fd >= 0) {
+ /* Copy the file descriptor, so that the additional resources
+ allocated by create_temp_file() can be freed by invoking
+ my_close().
+
+ Because the file descriptor returned by this function
+ will be passed to fdopen(), it will be closed by invoking
+ fclose(), which in turn will invoke close() instead of
+ my_close(). */
+ fd2 = dup(fd);
+ if (fd2 < 0) {
+ DBUG_PRINT("error",("Got error %d on dup",fd2));
+ my_errno=errno;
+ my_error(EE_OUT_OF_FILERESOURCES,
+ MYF(ME_BELL+ME_WAITTANG),
+ "ib*", my_errno);
+ }
+ my_close(fd, MYF(MY_WME));
+ }
+ return(fd2);
+}
+#endif /* defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) */
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return number of bytes copied to 'to' */
+extern "C" UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert from */
+ uint* errors) /*!< out: number of errors encountered
+ during the conversion */
+{
+ return(copy_and_convert((char*)to, (uint32) to_length, to_cs,
+ (const char*)from, (uint32) from_length, from_cs,
+ errors));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+No more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+extern "C" UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ /* XXX we use a hard limit instead of allocating
+	buf_size bytes from the heap */
+ CHARSET_INFO* data_cs;
+ char buf_tmp[8192];
+ ulint buf_tmp_used;
+ uint num_errors;
+
+ data_cs = all_charsets[charset_coll];
+
+ buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
+ system_charset_info,
+ data, data_len, data_cs,
+ &num_errors);
+
+ return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+}
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+ INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to
+n * 3 where autoinc_lock_mode != TRADITIONAL because we want
+to reserve 3 values for the multi-value INSERT above.
+@return the next value */
+static
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+ ulonglong current, /*!< in: Current value */
+ ulonglong increment, /*!< in: increment current by */
+ ulonglong offset, /*!< in: AUTOINC offset */
+ ulonglong max_value) /*!< in: max value for type */
+{
+ ulonglong next_value;
+
+ /* Should never be 0. */
+ ut_a(increment > 0);
+
+ /* According to MySQL documentation, if the offset is greater than
+ the increment then the offset is ignored. */
+ if (offset > increment) {
+ offset = 0;
+ }
+
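+	/* For example, with current = 13, increment = 10 and offset = 5,
+	the offset > 1 branch below computes ((13 - 5) / 10 + 1) * 10 + 5
+	= 15, the next value in this node's sequence above the current
+	value. */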
+ if (max_value <= current) {
+ next_value = max_value;
+ } else if (offset <= 1) {
+		/* Offsets 0 and 1 are the same, because there must be at
+ least one node in the system. */
+ if (max_value - current <= increment) {
+ next_value = max_value;
+ } else {
+ next_value = current + increment;
+ }
+ } else if (max_value > current) {
+ if (current > offset) {
+ next_value = ((current - offset) / increment) + 1;
+ } else {
+ next_value = ((offset - current) / increment) + 1;
+ }
+
+ ut_a(increment > 0);
+ ut_a(next_value > 0);
+
+ /* Check for multiplication overflow. */
+ if (increment > (max_value / next_value)) {
+
+ next_value = max_value;
+ } else {
+ next_value *= increment;
+
+ ut_a(max_value >= next_value);
+
+ /* Check for overflow. */
+ if (max_value - next_value <= offset) {
+ next_value = max_value;
+ } else {
+ next_value += offset;
+ }
+ }
+ } else {
+ next_value = max_value;
+ }
+
+ ut_a(next_value <= max_value);
+
+ return(next_value);
+}
+
+/*********************************************************************//**
+Initializes some fields in an InnoDB transaction object. */
+static
+void
+innobase_trx_init(
+/*==============*/
+ THD* thd, /*!< in: user thread handle */
+ trx_t* trx) /*!< in/out: InnoDB transaction handle */
+{
+ DBUG_ENTER("innobase_trx_init");
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+ DBUG_ASSERT(thd == trx->mysql_thd);
+
+ trx->check_foreigns = !thd_test_options(
+ thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+
+ trx->check_unique_secondary = !thd_test_options(
+ thd, OPTION_RELAXED_UNIQUE_CHECKS);
+
+#ifdef EXTENDED_SLOWLOG
+ if (thd_log_slow_verbosity(thd) & SLOG_V_INNODB) {
+ trx->take_stats = TRUE;
+ } else {
+ trx->take_stats = FALSE;
+ }
+#else
+ trx->take_stats = FALSE;
+#endif
+
+ DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+extern "C" UNIV_INTERN
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_trx_allocate");
+ DBUG_ASSERT(thd != NULL);
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+ trx = trx_allocate_for_mysql();
+
+ trx->mysql_thd = thd;
+
+ innobase_trx_init(thd, trx);
+
+ DBUG_RETURN(trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object, creates
+an InnoDB transaction struct if the corresponding MySQL thread struct still
+lacks one.
+@return InnoDB transaction handle */
+static
+trx_t*
+check_trx_exists(
+/*=============*/
+ THD* thd) /*!< in: user thread handle */
+{
+ trx_t*& trx = thd_to_trx(thd);
+
+ ut_ad(EQ_CURRENT_THD(thd));
+
+ if (trx == NULL) {
+ trx = innobase_trx_allocate(thd);
+ } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
+ mem_analyze_corruption(trx);
+ ut_error;
+ }
+
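+	/* Refresh the per-statement settings (foreign key and unique
+	checks, slow-log statistics) from the current session options,
+	also when the trx already existed. */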
+ innobase_trx_init(thd, trx);
+
+ return(trx);
+}
+
+
+/*************************************************************************
+Gets current trx. */
+extern "C"
+trx_t*
+innobase_get_trx()
+{
+ THD *thd=current_thd;
+ if (likely(thd != 0)) {
+ trx_t*& trx = thd_to_trx(thd);
+ return(trx);
+ } else {
+ return(NULL);
+ }
+}
+
+extern "C"
+ibool
+innobase_get_slow_log()
+{
+#ifdef EXTENDED_SLOWLOG
+ return((ibool) thd_opt_slow_log());
+#else
+ return(FALSE);
+#endif
+}
+
+/*********************************************************************//**
+Construct ha_innobase handler. */
+UNIV_INTERN
+ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg)
+ :handler(hton, table_arg),
+ int_table_flags(HA_REC_NOT_IN_SEQ |
+ HA_NULL_IN_KEY |
+ HA_CAN_INDEX_BLOBS |
+ HA_CAN_SQL_HANDLER |
+ HA_PRIMARY_KEY_REQUIRED_FOR_POSITION |
+ HA_PRIMARY_KEY_IN_READ_INDEX |
+ HA_BINLOG_ROW_CAPABLE |
+ HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
+ HA_TABLE_SCAN_ON_INDEX),
+ start_of_scan(0),
+ num_write_row(0)
+{}
+
+/*********************************************************************//**
+Destruct ha_innobase handler. */
+UNIV_INTERN
+ha_innobase::~ha_innobase()
+{
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle, allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. */
+UNIV_INTERN inline
+void
+ha_innobase::update_thd(
+/*====================*/
+ THD* thd) /*!< in: thd to use the handle */
+{
+ trx_t* trx;
+
+ trx = check_trx_exists(thd);
+
+ if (prebuilt->trx != trx) {
+
+ row_update_prebuilt_trx(prebuilt, trx);
+ }
+
+ user_thd = thd;
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle, allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. */
+UNIV_INTERN
+void
+ha_innobase::update_thd()
+/*=====================*/
+{
+ THD* thd = ha_thd();
+ ut_ad(EQ_CURRENT_THD(thd));
+ update_thd(thd);
+}
+
+/*********************************************************************//**
+Registers that InnoDB takes part in an SQL statement, so that MySQL knows to
+roll back the statement if the statement results in an error. This MUST be
+called for every SQL statement that may be rolled back by MySQL. Calling this
+several times to register the same statement is allowed, too. */
+static inline
+void
+innobase_register_stmt(
+/*===================*/
+ handlerton* hton, /*!< in: Innobase hton */
+ THD* thd) /*!< in: MySQL thd (connection) object */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ /* Register the statement */
+ trans_register_ha(thd, FALSE, hton);
+}
+
+/*********************************************************************//**
+Registers an InnoDB transaction in MySQL, so that the MySQL XA code knows
+to call the InnoDB prepare and commit, or rollback for the transaction. This
+MUST be called for every transaction for which the user may call commit or
+rollback. Calling this several times to register the same transaction is
+allowed, too.
+This function also registers the current SQL statement. */
+static inline
+void
+innobase_register_trx_and_stmt(
+/*===========================*/
+ handlerton *hton, /*!< in: Innobase handlerton */
+ THD* thd) /*!< in: MySQL thd (connection) object */
+{
+	/* NOTE that innobase_register_stmt() also registers the
+	transaction in AUTOCOMMIT=1 mode. */
+
+ innobase_register_stmt(hton, thd);
+
+ if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ /* No autocommit mode, register for a transaction */
+ trans_register_ha(thd, TRUE, hton);
+ }
+}
+
+/* BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+ ------------------------------------------------------------
+
+1) The use of the query cache for TBL is disabled when there is an
+uncommitted change to TBL.
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and only allows transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
+modification because of an ON DELETE CASCADE, we invalidate the MySQL query cache
+of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID to the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. There is no need to
+delay the invalidation until the transaction commits.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/******************************************************************//**
+The MySQL query cache uses this to check from InnoDB if the query cache at
+the moment is allowed to operate on an InnoDB table. The SQL query must
+be a non-locking SELECT.
+
+The query cache is allowed to operate on a query only if this function
+returns TRUE for all tables in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB kernel mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB kernel mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+ THD* thd, /*!< in: thd of the user who is trying to
+ store a result to the query cache or
+ retrieve it */
+ char* full_name, /*!< in: concatenation of database name,
+ the null character NUL, and the table
+ name */
+ uint full_name_len, /*!< in: length of the full name, i.e.
+ len(dbname) + len(tablename) + 1 */
+ ulonglong *unused) /*!< unused for this engine */
+{
+ ibool is_autocommit;
+ trx_t* trx;
+ char norm_name[1000];
+
+ ut_a(full_name_len < 999);
+
+ trx = check_trx_exists(thd);
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+ /* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+ plain SELECT if AUTOCOMMIT is not on. */
+
+ return((my_bool)FALSE);
+ }
+
+ if (trx->has_search_latch) {
+ sql_print_error("The calling thread is holding the adaptive "
+				"search latch though calling "
+ "innobase_query_caching_of_table_permitted.");
+
+ mutex_enter(&kernel_mutex);
+ trx_print(stderr, trx, 1024);
+ mutex_exit(&kernel_mutex);
+ }
+
+ innobase_release_stat_resources(trx);
+
+ if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ is_autocommit = TRUE;
+ } else {
+ is_autocommit = FALSE;
+
+ }
+
+ if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
+ /* We are going to retrieve the query result from the query
+ cache. This cannot be a store operation to the query cache
+ because then MySQL would have locks on tables already.
+
+ TODO: if the user has used LOCK TABLES to lock the table,
+ then we open a transaction in the call of row_.. below.
+ That trx can stay open until UNLOCK TABLES. The same problem
+ exists even if we do not use the query cache. MySQL should be
+ modified so that it ALWAYS calls some cleanup function when
+ the processing of a query ends!
+
+ We can imagine we instantaneously serialize this consistent
+ read trx to the current trx id counter. If trx2 would have
+ changed the tables of a query result stored in the cache, and
+ trx2 would have already committed, making the result obsolete,
+ then trx2 would have already invalidated the cache. Thus we
+ can trust the result in the cache is ok for this query. */
+
+ return((my_bool)TRUE);
+ }
+
+ /* Normalize the table name to InnoDB format */
+
+ memcpy(norm_name, full_name, full_name_len);
+
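+	/* full_name is "dbname\0tablename", so strlen() returns the length
+	of the database name; the embedded NUL is replaced below by the '/'
+	separator and the copy is NUL-terminated at full_name_len. */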
+ norm_name[strlen(norm_name)] = '/'; /* InnoDB uses '/' as the
+ separator between db and table */
+ norm_name[full_name_len] = '\0';
+#ifdef __WIN__
+ innobase_casedn_str(norm_name);
+#endif
+ /* The call of row_search_.. will start a new transaction if it is
+ not yet started */
+
+ if (trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(innodb_hton_ptr, thd);
+ trx->active_trans = 1;
+ }
+
+ if (row_search_check_if_query_cache_permitted(trx, norm_name)) {
+
+ /* printf("Query cache for %s permitted\n", norm_name); */
+
+ return((my_bool)TRUE);
+ }
+
+ /* printf("Query cache for %s NOT permitted\n", norm_name); */
+
+ return((my_bool)FALSE);
+}
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+extern "C" UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name, /*!< in: concatenation of
+ database name, null char NUL,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+ ulint full_name_len) /*!< in: full name length where
+ also the null chars count */
+{
+ /* Note that the sync0sync.h rank of the query cache mutex is just
+ above the InnoDB kernel mutex. The caller of this function must not
+ have latches of a lower rank. */
+
+ /* Argument TRUE below means we are using transactions */
+#ifdef HAVE_QUERY_CACHE
+ mysql_query_cache_invalidate4((THD*) trx->mysql_thd,
+ full_name,
+ (uint32) full_name_len,
+ TRUE);
+#endif
+}
+
+/*****************************************************************//**
+Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+static
+char*
+innobase_convert_identifier(
+/*========================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ void* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool file_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an UTF-8 string */
+{
+ char nz[NAME_LEN + 1];
+#if MYSQL_VERSION_ID >= 50141
+ char nz2[NAME_LEN + 1 + EXPLAIN_FILENAME_MAX_EXTRA_LENGTH];
+#else /* MYSQL_VERSION_ID >= 50141 */
+ char nz2[NAME_LEN + 1 + sizeof srv_mysql50_table_name_prefix];
+#endif /* MYSQL_VERSION_ID >= 50141 */
+
+ const char* s = id;
+ int q;
+
+ if (file_id) {
+ /* Decode the table name. The MySQL function expects
+		a NUL-terminated string. The input and output string
+		buffers must not be shared. */
+
+ if (UNIV_UNLIKELY(idlen > (sizeof nz) - 1)) {
+ idlen = (sizeof nz) - 1;
+ }
+
+ memcpy(nz, id, idlen);
+ nz[idlen] = 0;
+
+ s = nz2;
+#if MYSQL_VERSION_ID >= 50141
+ idlen = explain_filename((THD*) thd, nz, nz2, sizeof nz2,
+ EXPLAIN_PARTITIONS_AS_COMMENT);
+ goto no_quote;
+#else /* MYSQL_VERSION_ID >= 50141 */
+ idlen = filename_to_tablename(nz, nz2, sizeof nz2);
+#endif /* MYSQL_VERSION_ID >= 50141 */
+ }
+
+ /* See if the identifier needs to be quoted. */
+ if (UNIV_UNLIKELY(!thd)) {
+ q = '"';
+ } else {
+ q = get_quote_char_for_identifier((THD*) thd, s, (int) idlen);
+ }
+
+ if (q == EOF) {
+#if MYSQL_VERSION_ID >= 50141
+no_quote:
+#endif /* MYSQL_VERSION_ID >= 50141 */
+ if (UNIV_UNLIKELY(idlen > buflen)) {
+ idlen = buflen;
+ }
+ memcpy(buf, s, idlen);
+ return(buf + idlen);
+ }
+
+ /* Quote the identifier. */
+ if (buflen < 2) {
+ return(buf);
+ }
+
+ *buf++ = q;
+ buflen--;
+
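+	/* Copy the identifier, doubling any embedded quote characters and
+	stopping early if the output buffer cannot hold the next character
+	plus the closing quote. */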
+ for (; idlen; idlen--) {
+ int c = *s++;
+ if (UNIV_UNLIKELY(c == q)) {
+ if (UNIV_UNLIKELY(buflen < 3)) {
+ break;
+ }
+
+ *buf++ = c;
+ *buf++ = c;
+ buflen -= 2;
+ } else {
+ if (UNIV_UNLIKELY(buflen < 2)) {
+ break;
+ }
+
+ *buf++ = c;
+ buflen--;
+ }
+ }
+
+ *buf++ = q;
+ return(buf);
+}
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+extern "C" UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ void* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool table_id)/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an index name */
+{
+ char* s = buf;
+ const char* bufend = buf + buflen;
+
+ if (table_id) {
+ const char* slash = (const char*) memchr(id, '/', idlen);
+ if (!slash) {
+
+ goto no_db_name;
+ }
+
+ /* Print the database name and table name separately. */
+ s = innobase_convert_identifier(s, bufend - s, id, slash - id,
+ thd, TRUE);
+ if (UNIV_LIKELY(s < bufend)) {
+ *s++ = '.';
+ s = innobase_convert_identifier(s, bufend - s,
+ slash + 1, idlen
+ - (slash - id) - 1,
+ thd, TRUE);
+ }
+ } else if (UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
+ /* Temporary index name (smart ALTER TABLE) */
+ const char temp_index_suffix[]= "--temporary--";
+
+ s = innobase_convert_identifier(buf, buflen, id + 1, idlen - 1,
+ thd, FALSE);
+ if (s - buf + (sizeof temp_index_suffix - 1) < buflen) {
+ memcpy(s, temp_index_suffix,
+ sizeof temp_index_suffix - 1);
+ s += sizeof temp_index_suffix - 1;
+ }
+ } else {
+no_db_name:
+ s = innobase_convert_identifier(buf, buflen, id, idlen,
+ thd, table_id);
+ }
+
+ return(s);
+
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return TRUE if interrupted */
+extern "C" UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ return(trx && trx->mysql_thd && thd_killed((THD*) trx->mysql_thd));
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return TRUE if strict */
+extern "C" UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+ trx_t* trx) /*!< in: transaction */
+{
+ return(trx && trx->mysql_thd
+ && THDVAR((THD*) trx->mysql_thd, strict_mode));
+}
+
+/**************************************************************//**
+Resets some fields of a prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing. */
+static
+void
+reset_template(
+/*===========*/
+ row_prebuilt_t* prebuilt) /*!< in/out: prebuilt struct */
+{
+ prebuilt->keep_other_fields_on_keyread = 0;
+ prebuilt->read_just_key = 0;
+}
+
+/*****************************************************************//**
+Call this when you have opened a new table handle in HANDLER, before you
+call index_read_idx() etc. Actually, we can let the cursor stay open even
+over a transaction commit! Then you should call this before every operation,
+fetch next etc. This function inits the necessary things even after a
+transaction commit. */
+UNIV_INTERN
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+ /* If current thd does not yet have a trx struct, create one.
+ If the current handle does not yet have a prebuilt struct, create
+ one. Update the trx pointers in the prebuilt struct. Normally
+ this operation is done in external_lock. */
+
+ update_thd(ha_thd());
+
+ /* Initialize the prebuilt struct much like it would be inited in
+ external_lock */
+
+ innobase_release_stat_resources(prebuilt->trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started(prebuilt->trx);
+
+ /* Assign a read view if the transaction does not have it yet */
+
+ trx_assign_read_view(prebuilt->trx);
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+
+ if (prebuilt->trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(ht, user_thd);
+
+ prebuilt->trx->active_trans = 1;
+ }
+
+ /* We did the necessary inits in this function, no need to repeat them
+ in row_search_for_mysql */
+
+ prebuilt->sql_stat_start = FALSE;
+
+	/* We always let HANDLER do the reads as consistent reads, even
+	if the trx isolation level has been specified as SERIALIZABLE */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+
+ /* Always fetch all columns in the index record */
+
+ prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+	/* Do we always want to fetch all columns in the whole row? Or do
+ we???? */
+
+ prebuilt->used_in_HANDLER = TRUE;
+ reset_template(prebuilt);
+}
+
+/*********************************************************************//**
+Opens an InnoDB database.
+@return 0 on success, error code on failure */
+static
+int
+innobase_init(
+/*==========*/
+ void *p) /*!< in: InnoDB handlerton */
+{
+ static char current_dir[3]; /*!< Set if using current lib */
+ int err;
+ bool ret;
+ char *default_path;
+ uint format_id;
+
+ DBUG_ENTER("innobase_init");
+ handlerton *innobase_hton= (handlerton *)p;
+ innodb_hton_ptr = innobase_hton;
+
+ innobase_hton->state = SHOW_OPTION_YES;
+ innobase_hton->db_type= DB_TYPE_INNODB;
+ innobase_hton->savepoint_offset=sizeof(trx_named_savept_t);
+ innobase_hton->close_connection=innobase_close_connection;
+ innobase_hton->savepoint_set=innobase_savepoint;
+ innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
+ innobase_hton->savepoint_release=innobase_release_savepoint;
+ innobase_hton->commit=innobase_commit;
+ innobase_hton->rollback=innobase_rollback;
+ innobase_hton->prepare=innobase_xa_prepare;
+ innobase_hton->recover=innobase_xa_recover;
+ innobase_hton->commit_by_xid=innobase_commit_by_xid;
+ innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
+ innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
+ innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
+ innobase_hton->close_cursor_read_view=innobase_close_cursor_view;
+ innobase_hton->create=innobase_create_handler;
+ innobase_hton->drop_database=innobase_drop_database;
+ innobase_hton->panic=innobase_end;
+ innobase_hton->start_consistent_snapshot=innobase_start_trx_and_assign_read_view;
+ innobase_hton->flush_logs=innobase_flush_logs;
+ innobase_hton->show_status=innobase_show_status;
+ innobase_hton->flags=HTON_NO_FLAGS;
+ innobase_hton->release_temporary_latches=innobase_release_temporary_latches;
+ innobase_hton->alter_table_flags = innobase_alter_table_flags;
+
+ ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
+
+#ifdef UNIV_DEBUG
+ static const char test_filename[] = "-@";
+ char test_tablename[sizeof test_filename
+ + sizeof srv_mysql50_table_name_prefix];
+ if ((sizeof test_tablename) - 1
+ != filename_to_tablename(test_filename, test_tablename,
+ sizeof test_tablename)
+ || strncmp(test_tablename,
+ srv_mysql50_table_name_prefix,
+ sizeof srv_mysql50_table_name_prefix)
+ || strcmp(test_tablename
+ + sizeof srv_mysql50_table_name_prefix,
+ test_filename)) {
+ sql_print_error("tablename encoding has been changed");
+ goto error;
+ }
+#endif /* UNIV_DEBUG */
+
+ srv_page_size = 0;
+ srv_page_size_shift = 0;
+
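+	/* innodb_page_size must be a power of two between 4096 (shift 12)
+	and 1 << UNIV_PAGE_SIZE_SHIFT_MAX; srv_page_size_shift is left at 0
+	when no match is found, and that is rejected below. */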
+ if (innobase_page_size != (1 << 14)) {
+ uint n_shift;
+
+ fprintf(stderr,
+ "InnoDB: Warning: innodb_page_size has been changed from default value 16384. (###EXPERIMENTAL### operation)\n");
+ for (n_shift = 12; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX; n_shift++) {
+ if (innobase_page_size == ((ulong)1 << n_shift)) {
+ srv_page_size_shift = n_shift;
+ srv_page_size = (1 << srv_page_size_shift);
+ fprintf(stderr,
+ "InnoDB: The universal page size of the database is set to %lu.\n",
+ srv_page_size);
+ break;
+ }
+ }
+ } else {
+ srv_page_size_shift = 14;
+ srv_page_size = (1 << srv_page_size_shift);
+ }
+
+ if (!srv_page_size_shift) {
+ fprintf(stderr,
+			"InnoDB: Error: %lu is not a valid value for innodb_page_size.\n",
+ innobase_page_size);
+ goto error;
+ }
+
+#ifndef MYSQL_SERVER
+ innodb_overwrite_relay_log_info = FALSE;
+#endif
+
+#ifdef HAVE_REPLICATION
+#ifdef MYSQL_SERVER
+	/* read master log position from relay-log.info if it exists */
+ char fname[FN_REFLEN+128];
+ int pos;
+ int info_fd;
+ IO_CACHE info_file;
+
+ fname[0] = '\0';
+
+ if(innobase_overwrite_relay_log_info) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: innodb_overwrite_relay_log_info is enabled."
+			" Updates in other storage engines may have consistency problems.\n");
+
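+		/* relay-log.info stores, one value per line, the relay log
+		file name and position followed by the master log file name
+		and position. Both pairs are read below into the
+		trx_sys_mysql_* globals; after startup they are compared
+		with the positions InnoDB recovered and the file is
+		overwritten if they differ. */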
+ bzero((char*) &info_file, sizeof(info_file));
+ fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32);
+
+ int error=0;
+
+ if (!access(fname,F_OK)) {
+			/* the file exists */
+ if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) {
+ error=1;
+ } else if (init_io_cache(&info_file, info_fd, IO_SIZE*2,
+ READ_CACHE, 0L, 0, MYF(MY_WME))) {
+ error=1;
+ }
+
+ if (error) {
+relay_info_error:
+ if (info_fd >= 0)
+ my_close(info_fd, MYF(0));
+ fname[0] = '\0';
+ goto skip_relay;
+ }
+ } else {
+ fname[0] = '\0';
+ goto skip_relay;
+ }
+
+ if (init_strvar_from_file(fname, sizeof(fname), &info_file, "") || /* dummy (it is relay-log) */
+ init_intvar_from_file(&pos, &info_file, BIN_LOG_HEADER_SIZE)) {
+ end_io_cache(&info_file);
+ error=1;
+ goto relay_info_error;
+ }
+
+ fprintf(stderr,
+ "InnoDB: relay-log.info is detected.\n"
+ "InnoDB: relay log: position %u, file name %s\n",
+ pos, fname);
+
+ strncpy(trx_sys_mysql_relay_log_name, fname, TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+ trx_sys_mysql_relay_log_pos = (ib_int64_t) pos;
+
+ if (init_strvar_from_file(fname, sizeof(fname), &info_file, "") ||
+ init_intvar_from_file(&pos, &info_file, 0)) {
+ end_io_cache(&info_file);
+ error=1;
+ goto relay_info_error;
+ }
+
+ fprintf(stderr,
+ "InnoDB: master log: position %u, file name %s\n",
+ pos, fname);
+
+ strncpy(trx_sys_mysql_master_log_name, fname, TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+ trx_sys_mysql_master_log_pos = (ib_int64_t) pos;
+
+ end_io_cache(&info_file);
+ if (info_fd >= 0)
+ my_close(info_fd, MYF(0));
+ }
+skip_relay:
+#endif /* MYSQL_SERVER */
+#endif /* HAVE_REPLICATION */
+
+ /* Check that values don't overflow on 32-bit systems. */
+ if (sizeof(ulint) == 4) {
+ if (innobase_buffer_pool_size > UINT_MAX32) {
+ sql_print_error(
+ "innobase_buffer_pool_size can't be over 4GB"
+ " on 32-bit systems");
+
+ goto error;
+ }
+
+ if (innobase_log_file_size > UINT_MAX32) {
+ sql_print_error(
+ "innobase_log_file_size can't be over 4GB"
+ " on 32-bit systems");
+
+ goto error;
+ }
+ }
+
+ os_innodb_umask = (ulint)my_umask;
+
+ /* First calculate the default path for innodb_data_home_dir etc.,
+ in case the user has not given any value.
+
+ Note that when using the embedded server, the datadirectory is not
+ necessarily the current directory of this program. */
+
+ if (mysqld_embedded) {
+ default_path = mysql_real_data_home;
+ fil_path_to_mysql_datadir = mysql_real_data_home;
+ } else {
+ /* It's better to use current lib, to keep paths short */
+ current_dir[0] = FN_CURLIB;
+ current_dir[1] = FN_LIBCHAR;
+ current_dir[2] = 0;
+ default_path = current_dir;
+ }
+
+ ut_a(default_path);
+
+ if (specialflag & SPECIAL_NO_PRIOR) {
+ srv_set_thread_priorities = FALSE;
+ } else {
+ srv_set_thread_priorities = TRUE;
+ srv_query_thread_priority = QUERY_PRIOR;
+ }
+
+ /* Set InnoDB initialization parameters according to the values
+ read from MySQL .cnf file */
+
+ /*--------------- Data files -------------------------*/
+
+ /* The default dir for data files is the datadir of MySQL */
+
+ srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
+ default_path);
+
+ /* Set default InnoDB data file size to 10 MB and let it be
+ auto-extending. Thus users can use InnoDB in >= 4.0 without having
+ to specify any startup options. */
+
+ if (!innobase_data_file_path) {
+ innobase_data_file_path = (char*) "ibdata1:10M:autoextend";
+ }
+
+ /* Since InnoDB edits the argument in the next call, we make another
+ copy of it: */
+
+ internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
+ MYF(MY_FAE));
+
+ ret = (bool) srv_parse_data_file_paths_and_sizes(
+ internal_innobase_data_file_path);
+ if (ret == FALSE) {
+ sql_print_error(
+ "InnoDB: syntax error in innodb_data_file_path");
+mem_free_and_error:
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path,
+ MYF(MY_ALLOW_ZERO_PTR));
+ goto error;
+ }
+
+ srv_doublewrite_file = innobase_doublewrite_file;
+
+ srv_extra_undoslots = (ibool) innobase_extra_undoslots;
+
+ srv_use_sys_stats_table = (ibool) innobase_use_sys_stats_table;
+
+ /* -------------- Log files ---------------------------*/
+
+ /* The default dir for log files is the datadir of MySQL */
+
+ if (!innobase_log_group_home_dir) {
+ innobase_log_group_home_dir = default_path;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Since innodb_log_arch_dir has no relevance under MySQL,
+ starting from 4.0.6 we always set it the same as
+ innodb_log_group_home_dir: */
+
+ innobase_log_arch_dir = innobase_log_group_home_dir;
+
+ srv_arch_dir = innobase_log_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ ret = (bool)
+ srv_parse_log_group_home_dirs(innobase_log_group_home_dir);
+
+ if (ret == FALSE || innobase_mirrored_log_groups != 1) {
+ sql_print_error("syntax error in innodb_log_group_home_dir, or a "
+ "wrong number of mirrored log groups");
+
+ goto mem_free_and_error;
+ }
+
+ /* Validate the file format by animal name */
+ if (innobase_file_format_name != NULL) {
+
+ format_id = innobase_file_format_name_lookup(
+ innobase_file_format_name);
+
+ if (format_id > DICT_TF_FORMAT_MAX) {
+
+ sql_print_error("InnoDB: wrong innodb_file_format.");
+
+ goto mem_free_and_error;
+ }
+ } else {
+		/* Set it to the default file format id, though this
+		should never happen. */
+ format_id = 0;
+ }
+
+ srv_file_format = format_id;
+
+ /* Given the type of innobase_file_format_name we have little
+ choice but to cast away the constness from the returned name.
+ innobase_file_format_name is used in the MySQL set variable
+ interface and so can't be const. */
+
+ innobase_file_format_name =
+ (char*) trx_sys_file_format_id_to_name(format_id);
+
+ /* Process innobase_file_format_check variable */
+ ut_a(innobase_file_format_check != NULL);
+
+ /* As a side effect it will set srv_check_file_format_at_startup
+ on valid input. First we check for "on"/"off". */
+ if (!innobase_file_format_check_on_off(innobase_file_format_check)) {
+
+		/* Did the user specify a format name that we support?
+ As a side effect it will update the variable
+ srv_check_file_format_at_startup */
+ if (innobase_file_format_validate_and_set(
+ innobase_file_format_check) < 0) {
+
+ sql_print_error("InnoDB: invalid "
+ "innodb_file_format_check value: "
+ "should be either 'on' or 'off' or "
+ "any value up to %s or its "
+ "equivalent numeric id",
+ trx_sys_file_format_id_to_name(
+ DICT_TF_FORMAT_MAX));
+
+ goto mem_free_and_error;
+ }
+ }
+
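+	/* Map the innodb_change_buffering option string to the
+	corresponding ibuf_use_t value; an unrecognized setting is a
+	fatal initialization error. */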
+ if (innobase_change_buffering) {
+ ulint use;
+
+ for (use = 0;
+ use < UT_ARR_SIZE(innobase_change_buffering_values);
+ use++) {
+ if (!innobase_strcasecmp(
+ innobase_change_buffering,
+ innobase_change_buffering_values[use])) {
+ ibuf_use = (ibuf_use_t) use;
+ goto innobase_change_buffering_inited_ok;
+ }
+ }
+
+ sql_print_error("InnoDB: invalid value "
+ "innodb_change_buffering=%s",
+ innobase_change_buffering);
+ goto mem_free_and_error;
+ }
+
+innobase_change_buffering_inited_ok:
+ ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
+ innobase_change_buffering = (char*)
+ innobase_change_buffering_values[ibuf_use];
+
+ /* --------------------------------------------------*/
+
+ srv_file_flush_method_str = innobase_file_flush_method;
+
+ srv_n_log_groups = (ulint) innobase_mirrored_log_groups;
+ srv_n_log_files = (ulint) innobase_log_files_in_group;
+ srv_log_file_size = (ulint) innobase_log_file_size;
+
+ srv_thread_concurrency_timer_based =
+ (ibool) innobase_thread_concurrency_timer_based;
+
+#ifdef UNIV_LOG_ARCHIVE
+ srv_log_archive_on = (ulint) innobase_log_archive;
+#endif /* UNIV_LOG_ARCHIVE */
+ srv_log_buffer_size = (ulint) innobase_log_buffer_size;
+
+ srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
+
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_read_ahead &= 3;
+ srv_adaptive_checkpoint %= 3;
+
+ srv_force_recovery = (ulint) innobase_force_recovery;
+
+ srv_recovery_stats = (ibool) innobase_recovery_stats;
+
+ srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
+ srv_use_checksums = (ibool) innobase_use_checksums;
+ srv_fast_checksum = (ibool) innobase_fast_checksum;
+ srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum;
+
+#ifdef HAVE_LARGE_PAGES
+ if ((os_use_large_pages = (ibool) my_use_large_pages))
+ os_large_page_size = (ulint) opt_large_page_size;
+#endif
+
+ row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
+
+ srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+
+ srv_max_n_open_files = (ulint) innobase_open_files;
+ srv_innodb_status = (ibool) innobase_create_status_file;
+
+ srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+ /* Store the default charset-collation number of this MySQL
+ installation */
+
+ data_mysql_default_charset_coll = (ulint)default_charset_info->number;
+
+ ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
+ my_charset_latin1.number);
+ ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
+
+ /* Store the latin1_swedish_ci character ordering table to InnoDB. For
+ non-latin1_swedish_ci charsets we use the MySQL comparison functions,
+ and consequently we do not need to know the ordering internally in
+ InnoDB. */
+
+ ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci"));
+ srv_latin1_ordering = my_charset_latin1.sort_order;
+
+ innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+ innobase_old_blocks_pct, FALSE);
+
+ innobase_commit_concurrency_init_default();
+
+ /* Since we in this module access directly the fields of a trx
+ struct, and due to different headers and flags it might happen that
+ mutex_t has a different size in this module and in InnoDB
+ modules, we check at run time that the size is the same in
+ these compilation modules. */
+
+ err = innobase_start_or_create_for_mysql();
+
+ if (err != DB_SUCCESS) {
+ goto mem_free_and_error;
+ }
+
+#ifdef HAVE_REPLICATION
+#ifdef MYSQL_SERVER
+	if (innobase_overwrite_relay_log_info) {
+ /* If InnoDB progressed from relay-log.info, overwrite it */
+ if (fname[0] == '\0') {
+ fprintf(stderr,
+				"InnoDB: something is wrong with relay-log.info. InnoDB will not overwrite it.\n");
+ } else if (0 != strcmp(fname, trx_sys_mysql_master_log_name)
+ || pos != trx_sys_mysql_master_log_pos) {
+ /* Overwrite relay-log.info */
+ bzero((char*) &info_file, sizeof(info_file));
+ fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32);
+
+ int error = 0;
+
+ if (!access(fname,F_OK)) {
+ /* exist */
+ if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) {
+ error = 1;
+ } else if (init_io_cache(&info_file, info_fd, IO_SIZE*2,
+ WRITE_CACHE, 0L, 0, MYF(MY_WME))) {
+ error = 1;
+ }
+
+ if (error) {
+ if (info_fd >= 0)
+ my_close(info_fd, MYF(0));
+ goto skip_overwrite;
+ }
+ } else {
+ error = 1;
+ goto skip_overwrite;
+ }
+
+ char buff[FN_REFLEN*2+22*2+4], *pos;
+
+ my_b_seek(&info_file, 0L);
+ pos=strmov(buff, trx_sys_mysql_relay_log_name);
+ *pos++='\n';
+ pos=longlong10_to_str(trx_sys_mysql_relay_log_pos, pos, 10);
+ *pos++='\n';
+ pos=strmov(pos, trx_sys_mysql_master_log_name);
+ *pos++='\n';
+ pos=longlong10_to_str(trx_sys_mysql_master_log_pos, pos, 10);
+ *pos='\n';
+
+ if (my_b_write(&info_file, (uchar*) buff, (size_t) (pos-buff)+1))
+ error = 1;
+ if (flush_io_cache(&info_file))
+ error = 1;
+
+ end_io_cache(&info_file);
+ if (info_fd >= 0)
+ my_close(info_fd, MYF(0));
+skip_overwrite:
+ if (error) {
+ fprintf(stderr,
+				"InnoDB: ERROR: an error occurred while overwriting relay-log.info.\n");
+ } else {
+ fprintf(stderr,
+ "InnoDB: relay-log.info was overwritten.\n");
+ }
+ } else {
+ fprintf(stderr,
+ "InnoDB: InnoDB and relay-log.info are synchronized. InnoDB will not overwrite it.\n");
+ }
+ }
+#endif /* MYSQL_SERVER */
+#endif /* HAVE_REPLICATION */
+
+ innobase_open_tables = hash_create(200);
+ pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST);
+ pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST);
+ pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST);
+ pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST);
+ pthread_cond_init(&commit_cond, NULL);
+ innodb_inited= 1;
+#ifdef MYSQL_DYNAMIC_PLUGIN
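+	/* When InnoDB is built as a dynamic plugin, the server passes in
+	its own handlerton ('p'); copy the statically initialized handlerton
+	contents into it so both refer to the same method table. */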
+ if (innobase_hton != p) {
+ innobase_hton = reinterpret_cast<handlerton*>(p);
+ *innobase_hton = *innodb_hton_ptr;
+ }
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+ /* Get the current high water mark format. */
+ innobase_file_format_check = (char*) trx_sys_file_format_max_get();
+
+ btr_search_fully_disabled = (!btr_search_enabled);
+ DBUG_RETURN(FALSE);
+error:
+ DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+Closes an InnoDB database.
+@return TRUE if error */
+static
+int
+innobase_end(
+/*=========*/
+ handlerton* hton, /*!< in/out: InnoDB handlerton */
+ ha_panic_function type __attribute__((unused)))
+ /*!< in: ha_panic() parameter */
+{
+ int err= 0;
+
+ DBUG_ENTER("innobase_end");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+#ifdef __NETWARE__ /* some special cleanup for NetWare */
+ if (nw_panic) {
+ set_panic_flag_for_netware();
+ }
+#endif
+ if (innodb_inited) {
+
+ srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+ innodb_inited = 0;
+ hash_table_free(innobase_open_tables);
+ innobase_open_tables = NULL;
+ if (innobase_shutdown_for_mysql() != DB_SUCCESS) {
+ err = 1;
+ }
+ srv_free_paths_and_sizes();
+ my_free(internal_innobase_data_file_path,
+ MYF(MY_ALLOW_ZERO_PTR));
+ pthread_mutex_destroy(&innobase_share_mutex);
+ pthread_mutex_destroy(&prepare_commit_mutex);
+ pthread_mutex_destroy(&commit_threads_m);
+ pthread_mutex_destroy(&commit_cond_m);
+ pthread_cond_destroy(&commit_cond);
+ }
+
+ DBUG_RETURN(err);
+}
+
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+ handlerton* hton) /*!< in/out: InnoDB handlerton */
+{
+ bool result = 0;
+
+ DBUG_ENTER("innobase_flush_logs");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ log_buffer_flush_to_disk();
+
+ DBUG_RETURN(result);
+}
+
+/****************************************************************//**
+Return alter table flags supported in an InnoDB database. */
+static
+uint
+innobase_alter_table_flags(
+/*=======================*/
+ uint flags)
+{
+ return(HA_ONLINE_ADD_INDEX_NO_WRITES
+ | HA_ONLINE_DROP_INDEX_NO_WRITES
+ | HA_ONLINE_ADD_UNIQUE_INDEX_NO_WRITES
+ | HA_ONLINE_DROP_UNIQUE_INDEX_NO_WRITES
+ | HA_ONLINE_ADD_PK_INDEX_NO_WRITES);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ return;
+ }
+
+#ifdef HAVE_REPLICATION
+#ifdef MYSQL_SERVER
+ THD *thd=current_thd;
+
+ if (thd && thd->slave_thread) {
+ /* Update the replication position info inside InnoDB.
+ In embedded server, does nothing. */
+ const char *log_file_name, *group_relay_log_name;
+ ulonglong log_pos, relay_log_pos;
+ bool res = rpl_get_position_info(&log_file_name, &log_pos,
+ &group_relay_log_name,
+ &relay_log_pos);
+ if (res) {
+ trx->mysql_master_log_file_name = log_file_name;
+ trx->mysql_master_log_pos = (ib_int64_t)log_pos;
+ trx->mysql_relay_log_file_name = group_relay_log_name;
+ trx->mysql_relay_log_pos = (ib_int64_t)relay_log_pos;
+ }
+ }
+#endif /* MYSQL_SERVER */
+#endif /* HAVE_REPLICATION */
+
+ trx_commit_for_mysql(trx);
+}
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return 0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+ handlerton *hton, /*!< in: Innodb handlerton */
+ THD* thd) /*!< in: MySQL thread handle of the user for whom
+ the transaction should be committed */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* Create a new trx struct for thd, if it does not yet have one */
+
+ trx = check_trx_exists(thd);
+
+ /* This is just to play safe: release a possible FIFO ticket and
+ search latch. Since we will reserve the kernel mutex, we have to
+ release the search system latch first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ /* If the transaction is not started yet, start it */
+
+ trx_start_if_not_started(trx);
+
+ /* Assign a read view if the transaction does not have it yet */
+
+ trx_assign_read_view(trx);
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+
+ if (trx->active_trans == 0) {
+ innobase_register_trx_and_stmt(hton, thd);
+ trx->active_trans = 1;
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended.
+@return 0 */
+static
+int
+innobase_commit(
+/*============*/
+ handlerton *hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: MySQL thread handle of the user for whom
+ the transaction should be committed */
+ bool all) /*!< in: TRUE - commit transaction
+ FALSE - the current SQL statement ended */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_commit");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("ending transaction"));
+
+ trx = check_trx_exists(thd);
+
+ /* Since we will reserve the kernel mutex, we have to release
+ the search system latch first to obey the latching order. */
+
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ /* The flag trx->active_trans is set to 1 in
+
+ 1. ::external_lock(),
+ 2. ::start_stmt(),
+ 3. innobase_query_caching_of_table_permitted(),
+ 4. innobase_savepoint(),
+ 5. ::init_table_handle_for_HANDLER(),
+ 6. innobase_start_trx_and_assign_read_view(),
+ 7. ::transactional_table_lock()
+
+ and it is only set to 0 in a commit or a rollback. If it is 0 we know
+ there cannot be resources to be freed and we could return immediately.
+ For the time being, we play safe and do the cleanup though there should
+ be nothing to clean up. */
+
+ if (trx->active_trans == 0
+ && trx->conc_state != TRX_NOT_STARTED) {
+
+ sql_print_error("trx->active_trans == 0, but"
+ " trx->conc_state != TRX_NOT_STARTED");
+ }
+ if (all
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* We were instructed to commit the whole transaction, or
+ this is an SQL statement end and autocommit is on */
+
+ /* We need current binlog position for ibbackup to work.
+ Note, the position is current because of
+ prepare_commit_mutex */
+retry:
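+		/* If innodb_commit_concurrency is set, throttle the number
+		of threads inside this commit section: a thread that would
+		exceed the limit waits on commit_cond and retries. */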
+ if (innobase_commit_concurrency > 0) {
+ pthread_mutex_lock(&commit_cond_m);
+ commit_threads++;
+
+ if (commit_threads > innobase_commit_concurrency) {
+ commit_threads--;
+ pthread_cond_wait(&commit_cond,
+ &commit_cond_m);
+ pthread_mutex_unlock(&commit_cond_m);
+ goto retry;
+ }
+ else {
+ pthread_mutex_unlock(&commit_cond_m);
+ }
+ }
+
+ /* The following calls to read the MySQL binary log
+ file name and the position return consistent results:
+ 1) Other InnoDB transactions cannot intervene between
+ these calls as we are holding prepare_commit_mutex.
+ 2) Binary logging of other engines is not relevant
+ to InnoDB as all InnoDB requires is that committing
+ InnoDB transactions appear in the same order in the
+ MySQL binary log as they appear in InnoDB logs.
+ 3) A MySQL log file rotation cannot happen because
+ MySQL protects against this by having a counter of
+ transactions in prepared state and it only allows
+ a rotation when the counter drops to zero. See
+ LOCK_prep_xids and COND_prep_xids in log.cc. */
+ trx->mysql_log_file_name = mysql_bin_log_file_name();
+ trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
+
+ /* Don't do write + flush right now. For group commit
+ to work we want to do the flush after releasing the
+ prepare_commit_mutex. */
+ trx->flush_log_later = TRUE;
+ innobase_commit_low(trx);
+ trx->flush_log_later = FALSE;
+
+ if (innobase_commit_concurrency > 0) {
+ pthread_mutex_lock(&commit_cond_m);
+ commit_threads--;
+ pthread_cond_signal(&commit_cond);
+ pthread_mutex_unlock(&commit_cond_m);
+ }
+
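+		/* active_trans == 2 means the transaction was prepared
+		while holding prepare_commit_mutex (set in
+		innobase_xa_prepare); release the mutex now that the
+		commit order has been fixed. */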
+ if (trx->active_trans == 2) {
+
+ pthread_mutex_unlock(&prepare_commit_mutex);
+ }
+
+ /* Now do a write + flush of logs. */
+ trx_commit_complete_for_mysql(trx);
+ trx->active_trans = 0;
+
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction commit */
+
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
+
+ row_unlock_table_autoinc_for_mysql(trx);
+
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+
+ trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
+
+ if (trx->declared_to_be_inside_innodb) {
+ /* Release our possible ticket in the FIFO */
+
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ /* Tell the InnoDB server that there might be work for utility
+ threads: */
+ srv_active_wake_master_thread();
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement.
+@return 0 or error number */
+static
+int
+innobase_rollback(
+/*==============*/
+ handlerton *hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of the user
+ whose transaction should be rolled back */
+ bool all) /*!< in: TRUE - commit transaction
+ FALSE - the current SQL statement ended */
+{
+ int error = 0;
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_rollback");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ DBUG_PRINT("trans", ("aborting transaction"));
+
+ trx = check_trx_exists(thd);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the kernel mutex, we have to release the search system latch
+ first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
+
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
+
+ row_unlock_table_autoinc_for_mysql(trx);
+
+ if (all
+ || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ error = trx_rollback_for_mysql(trx);
+ trx->active_trans = 0;
+ } else {
+ error = trx_rollback_last_sql_stat_for_mysql(trx);
+ }
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Rolls back a transaction
+@return 0 or error number */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ int error = 0;
+
+ DBUG_ENTER("innobase_rollback_trx");
+ DBUG_PRINT("trans", ("aborting transaction"));
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the kernel mutex, we have to release the search system latch
+ first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ /* If we had reserved the auto-inc lock for some table (if
+ we come here to roll back the latest SQL statement) we
+ release it now before a possibly lengthy rollback */
+
+ row_unlock_table_autoinc_for_mysql(trx);
+
+ error = trx_rollback_for_mysql(trx);
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+ handlerton *hton, /*!< in: Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of the user
+ whose transaction should be rolled back */
+ void* savepoint) /*!< in: savepoint data */
+{
+ ib_int64_t mysql_binlog_cache_pos;
+ int error = 0;
+ trx_t* trx;
+ char name[64];
+
+ DBUG_ENTER("innobase_rollback_to_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the kernel mutex, we have to release the search system latch
+ first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
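+	/* The savepoint is identified by its address: render the pointer
+	as a base-36 string and use that as the InnoDB savepoint name. */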
+ longlong2str((ulint)savepoint, name, 36, 1);
+
+ error = (int) trx_rollback_to_savepoint_for_mysql(trx, name,
+ &mysql_binlog_cache_pos);
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Release transaction savepoint name.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+ handlerton* hton, /*!< in: handlerton for Innodb */
+ THD* thd, /*!< in: handle to the MySQL thread of the user
+ whose transaction should be rolled back */
+ void* savepoint) /*!< in: savepoint data */
+{
+ int error = 0;
+ trx_t* trx;
+ char name[64];
+
+ DBUG_ENTER("innobase_release_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+
+ longlong2str((ulint)savepoint, name, 36, 1);
+
+ error = (int) trx_release_savepoint_for_mysql(trx, name);
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Sets a transaction savepoint.
+@return always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+ handlerton* hton, /*!< in: handle to the Innodb handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread */
+ void* savepoint) /*!< in: savepoint data */
+{
+ int error = 0;
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_savepoint");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/*
+	  In autocommit mode there is no point in setting a savepoint
+	  (unless we are in a sub-statement), so the SQL layer ensures that
+	  this method is never called in such a situation.
+	*/
+#ifdef MYSQL_SERVER /* plugins cannot access thd->in_sub_stmt */
+ DBUG_ASSERT(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) ||
+ thd->in_sub_stmt);
+#endif /* MYSQL_SERVER */
+
+ trx = check_trx_exists(thd);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the kernel mutex, we have to release the search system latch
+ first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ /* cannot happen outside of transaction */
+ DBUG_ASSERT(trx->active_trans);
+
+ /* TODO: use provided savepoint data area to store savepoint data */
+ char name[64];
+	longlong2str((ulint)savepoint, name, 36, 1);
+
+ error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
+
+ DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD.
+@return 0 or error number */
+static
+int
+innobase_close_connection(
+/*======================*/
+ handlerton* hton, /*!< in: innobase handlerton */
+ THD* thd) /*!< in: handle to the MySQL thread of the user
+ whose resources should be free'd */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("innobase_close_connection");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+ trx = thd_to_trx(thd);
+
+ ut_a(trx);
+
+ if (trx->active_trans == 0
+ && trx->conc_state != TRX_NOT_STARTED) {
+
+ sql_print_error("trx->active_trans == 0, but"
+ " trx->conc_state != TRX_NOT_STARTED");
+ }
+
+
+ if (trx->conc_state != TRX_NOT_STARTED &&
+ global_system_variables.log_warnings) {
+ sql_print_warning(
+ "MySQL is closing a connection that has an active "
+ "InnoDB transaction. %lu row modifications will "
+ "roll back.",
+ (ulong) trx->undo_no.low);
+ }
+
+ innobase_rollback_trx(trx);
+
+ thr_local_free(trx->mysql_thread_id);
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(0);
+}
+
+
+/*************************************************************************//**
+** InnoDB database tables
+*****************************************************************************/
+
+/****************************************************************//**
+Get the record format from the data dictionary.
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT,
+ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */
+UNIV_INTERN
+enum row_type
+ha_innobase::get_row_type() const
+/*=============================*/
+{
+ if (prebuilt && prebuilt->table) {
+ const ulint flags = prebuilt->table->flags;
+
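+		/* The row format is encoded in the table flags: zero flags
+		mean the original REDUNDANT format; otherwise the
+		DICT_TF_FORMAT_MASK bits and the compressed page size
+		(ZSSIZE) bits select COMPACT, DYNAMIC or COMPRESSED. */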
+ if (UNIV_UNLIKELY(!flags)) {
+ return(ROW_TYPE_REDUNDANT);
+ }
+
+ ut_ad(flags & DICT_TF_COMPACT);
+
+ switch (flags & DICT_TF_FORMAT_MASK) {
+ case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
+ return(ROW_TYPE_COMPACT);
+ case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT:
+ if (flags & DICT_TF_ZSSIZE_MASK) {
+ return(ROW_TYPE_COMPRESSED);
+ } else {
+ return(ROW_TYPE_DYNAMIC);
+ }
+#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
+# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#endif
+ }
+ }
+ ut_ad(0);
+ return(ROW_TYPE_NOT_USED);
+}
+
+
+
+/****************************************************************//**
+Get the table flags to use for the statement.
+@return table flags */
+UNIV_INTERN
+handler::Table_flags
+ha_innobase::table_flags() const
+/*============================*/
+{
+ /* Need to use tx_isolation here since table flags is (also)
+ called before prebuilt is inited. */
+ ulong const tx_isolation = thd_tx_isolation(ha_thd());
+ if (tx_isolation <= ISO_READ_COMMITTED)
+ return int_table_flags;
+ return int_table_flags | HA_BINLOG_STMT_CAPABLE;
+}
+
+/****************************************************************//**
+Gives the file extension of an InnoDB single-table tablespace. */
+static const char* ha_innobase_exts[] = {
+ ".ibd",
+ NullS
+};
+
+/****************************************************************//**
+Returns the table type (storage engine name).
+@return table type */
+UNIV_INTERN
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+ return(innobase_hton_name);
+}
+
+/****************************************************************//**
+Returns the index type. */
+UNIV_INTERN
+const char*
+ha_innobase::index_type(
+/*====================*/
+ uint)
+ /*!< out: index type */
+{
+ return("BTREE");
+}
+
+/****************************************************************//**
+Returns the table file name extension.
+@return file extension string */
+UNIV_INTERN
+const char**
+ha_innobase::bas_ext() const
+/*========================*/
+{
+ return(ha_innobase_exts);
+}
+
+/****************************************************************//**
+Returns the operations supported for indexes.
+@return flags of supported operations */
+UNIV_INTERN
+ulong
+ha_innobase::index_flags(
+/*=====================*/
+ uint,
+ uint,
+ bool)
+const
+{
+ return(HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER
+ | HA_READ_RANGE | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN);
+}
+
+/****************************************************************//**
+Returns the maximum number of keys.
+@return MAX_KEY */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_keys() const
+/*===================================*/
+{
+ return(MAX_KEY);
+}
+
+/****************************************************************//**
+Returns the maximum key length.
+@return maximum supported key length, in bytes */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_key_length() const
+/*=========================================*/
+{
+ /* An InnoDB page must store >= 2 keys; a secondary key record
+ must also contain the primary key value: max key length is
+ therefore set to slightly less than 1 / 4 of page size which
+ is 16 kB; but currently MySQL does not work with keys whose
+ size is > MAX_KEY_LENGTH */
+ return(3500);
+}
+
+/****************************************************************//**
+Returns the key map of keys that are usable for scanning.
+@return key_map_full */
+UNIV_INTERN
+const key_map*
+ha_innobase::keys_to_use_for_scanning()
+{
+ return(&key_map_full);
+}
+
+/****************************************************************//**
+Determines if table caching is supported.
+@return HA_CACHE_TBL_ASKTRANSACT */
+UNIV_INTERN
+uint8
+ha_innobase::table_cache_type()
+{
+ return(HA_CACHE_TBL_ASKTRANSACT);
+}
+
+/****************************************************************//**
+Determines if the primary key is clustered index.
+@return true */
+UNIV_INTERN
+bool
+ha_innobase::primary_key_is_clustered()
+{
+ return(true);
+}
+
+/*****************************************************************//**
+Normalizes a table name string. A normalized name consists of the
+database name catenated to '/' and table name. An example:
+test/mytable. On Windows normalization puts both the database name and the
+table name always to lower case. */
+static
+void
+normalize_table_name(
+/*=================*/
+ char* norm_name, /*!< out: normalized name as a
+ null-terminated string */
+ const char* name) /*!< in: table name string */
+{
+ char* name_ptr;
+ char* db_ptr;
+ char* ptr;
+
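+	/* For example, a name like "./test/mytable" (the form typically
+	passed in by MySQL) normalizes to "test/mytable". */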
+ /* Scan name from the end */
+
+ ptr = strend(name)-1;
+
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ }
+
+ name_ptr = ptr + 1;
+
+ DBUG_ASSERT(ptr > name);
+
+ ptr--;
+
+ while (ptr >= name && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ }
+
+ db_ptr = ptr + 1;
+
+ memcpy(norm_name, db_ptr, strlen(name) + 1 - (db_ptr - name));
+
+ norm_name[name_ptr - db_ptr - 1] = '/';
+
+#ifdef __WIN__
+ innobase_casedn_str(norm_name);
+#endif
+}
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type.
+@return maximum allowed value for the field */
+static
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+ const Field* field) /*!< in: MySQL field */
+{
+ ulonglong max_value = 0;
+
+	switch (field->key_type()) {
+ /* TINY */
+ case HA_KEYTYPE_BINARY:
+ max_value = 0xFFULL;
+ break;
+ case HA_KEYTYPE_INT8:
+ max_value = 0x7FULL;
+ break;
+ /* SHORT */
+ case HA_KEYTYPE_USHORT_INT:
+ max_value = 0xFFFFULL;
+ break;
+ case HA_KEYTYPE_SHORT_INT:
+ max_value = 0x7FFFULL;
+ break;
+ /* MEDIUM */
+ case HA_KEYTYPE_UINT24:
+ max_value = 0xFFFFFFULL;
+ break;
+ case HA_KEYTYPE_INT24:
+ max_value = 0x7FFFFFULL;
+ break;
+ /* LONG */
+ case HA_KEYTYPE_ULONG_INT:
+ max_value = 0xFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONG_INT:
+ max_value = 0x7FFFFFFFULL;
+ break;
+ /* BIG */
+ case HA_KEYTYPE_ULONGLONG:
+ max_value = 0xFFFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_LONGLONG:
+ max_value = 0x7FFFFFFFFFFFFFFFULL;
+ break;
+ case HA_KEYTYPE_FLOAT:
+ /* We use the maximum as per IEEE754-2008 standard, 2^24 */
+ max_value = 0x1000000ULL;
+ break;
+ case HA_KEYTYPE_DOUBLE:
+ /* We use the maximum as per IEEE754-2008 standard, 2^53 */
+ max_value = 0x20000000000000ULL;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(max_value);
+}
+
+/*******************************************************************//**
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+@return TRUE if all column types match. */
+static
+ibool
+innobase_match_index_columns(
+/*=========================*/
+ const KEY* key_info, /*!< in: Index info
+ from mysql */
+ const dict_index_t* index_info) /*!< in: Index info
+ from Innodb */
+{
+ const KEY_PART_INFO* key_part;
+ const KEY_PART_INFO* key_end;
+ const dict_field_t* innodb_idx_fld;
+ const dict_field_t* innodb_idx_fld_end;
+
+ DBUG_ENTER("innobase_match_index_columns");
+
+ /* Check whether user defined index column count matches */
+ if (key_info->key_parts != index_info->n_user_defined_cols) {
+ DBUG_RETURN(FALSE);
+ }
+
+ key_part = key_info->key_part;
+ key_end = key_part + key_info->key_parts;
+ innodb_idx_fld = index_info->fields;
+ innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+	/* Check each index column's datatype. We do not check
+	the column name because there are cases where an index
+	column name is modified in MySQL but the change does not
+	propagate to InnoDB.
+	One hidden assumption here is that the index column sequences
+	are matched up between MySQL and InnoDB. */
+ for (; key_part != key_end; ++key_part) {
+ ulint col_type;
+ ibool is_unsigned;
+ ulint mtype = innodb_idx_fld->col->mtype;
+
+ /* Need to translate to InnoDB column type before
+ comparison. */
+ col_type = get_innobase_type_from_mysql_type(&is_unsigned,
+ key_part->field);
+
+		/* Ignore InnoDB-specific system columns. */
+		while (mtype == DATA_SYS) {
+			innodb_idx_fld++;
+
+			if (innodb_idx_fld >= innodb_idx_fld_end) {
+				DBUG_RETURN(FALSE);
+			}
+
+			/* Re-read the type of the column we advanced to,
+			so that the loop condition is re-evaluated against
+			the new field. */
+			mtype = innodb_idx_fld->col->mtype;
+		}
+
+ if (col_type != mtype) {
+ /* Column Type mismatches */
+ DBUG_RETURN(FALSE);
+ }
+
+ innodb_idx_fld++;
+ }
+
+ DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+This function builds a translation table in the INNOBASE_SHARE
+structure for fast index location, using the MySQL array number from its
+table->key_info structure. This also provides the necessary translation
+between the key order in the MySQL key_info and InnoDB ib_table->indexes
+when they do not fully match each other.
+Note we do not have any mutex protecting the translation table
+building, based on the assumption that there is no concurrent
+index creation/drop and no DMLs that require index lookup. All table
+handles will be closed before the index creation/drop.
+@return TRUE if index translation table built successfully */
+static
+ibool
+innobase_build_index_translation(
+/*=============================*/
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table, /*!< in: table in Innodb data
+ dictionary */
+ INNOBASE_SHARE* share) /*!< in/out: share structure
+ where index translation table
+ will be constructed in. */
+{
+ ulint mysql_num_index;
+ ulint ib_num_index;
+ dict_index_t** index_mapping;
+ ibool ret = TRUE;
+
+ DBUG_ENTER("innobase_build_index_translation");
+
+ mysql_num_index = table->s->keys;
+ ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+
+ index_mapping = share->idx_trans_tbl.index_mapping;
+
+	/* If there is an inconsistency between the MySQL and InnoDB
+	dictionary (metadata) information, the number of indexes defined
+	in MySQL could exceed that in InnoDB; do not build the index
+	translation table in that case */
+ if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+	/* If the index entry count is non-zero, nothing has
+	changed since the last update; directly return TRUE */
+ if (share->idx_trans_tbl.index_count) {
+ /* Index entry count should still match mysql_num_index */
+ ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
+ goto func_exit;
+ }
+
+	/* The number of indexes has increased; rebuild the mapping table */
+ if (mysql_num_index > share->idx_trans_tbl.array_size) {
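+		/* MY_ALLOW_ZERO_PTR lets my_realloc() accept a NULL pointer
+		on the first allocation, behaving like malloc(). */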
+ index_mapping = (dict_index_t**) my_realloc(index_mapping,
+ mysql_num_index *
+ sizeof(*index_mapping),
+ MYF(MY_ALLOW_ZERO_PTR));
+
+ if (!index_mapping) {
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ share->idx_trans_tbl.array_size = mysql_num_index;
+ }
+
+
+ /* For each index in the mysql key_info array, fetch its
+ corresponding InnoDB index pointer into index_mapping
+ array. */
+ for (ulint count = 0; count < mysql_num_index; count++) {
+
+ /* Fetch index pointers into index_mapping according to mysql
+ index sequence */
+ index_mapping[count] = dict_table_get_index_on_name(
+ ib_table, table->key_info[count].name);
+
+ if (!index_mapping[count]) {
+ sql_print_error("Cannot find index %s in InnoDB "
+ "index dictionary.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+
+ /* Double check fetched index has the same
+ column info as those in mysql key_info. */
+ if (!innobase_match_index_columns(&table->key_info[count],
+ index_mapping[count])) {
+ sql_print_error("Found index %s whose column info "
+ "does not match that of MySQL.",
+ table->key_info[count].name);
+ ret = FALSE;
+ goto func_exit;
+ }
+ }
+
+ /* Successfully built the translation table */
+ share->idx_trans_tbl.index_count = mysql_num_index;
+
+func_exit:
+ if (!ret) {
+ /* Build translation table failed. */
+ my_free(index_mapping, MYF(MY_ALLOW_ZERO_PTR));
+
+ share->idx_trans_tbl.array_size = 0;
+ share->idx_trans_tbl.index_count = 0;
+ index_mapping = NULL;
+ }
+
+ share->idx_trans_tbl.index_mapping = index_mapping;
+
+ DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+This function uses the index translation table to quickly locate the
+requested index structure.
+Note we do not have mutex protection for the index translation table
+access; it is based on the assumption that there is no concurrent
+translation table rebuild (after create/drop index) and no DMLs that
+require index lookup.
+@return dict_index_t structure for the requested index, or NULL if
+the index structure cannot be located. */
+static
+dict_index_t*
+innobase_index_lookup(
+/*==================*/
+ INNOBASE_SHARE* share, /*!< in: share structure for index
+ translation table. */
+ uint keynr) /*!< in: index number for the requested
+ index */
+{
+ if (!share->idx_trans_tbl.index_mapping
+ || keynr >= share->idx_trans_tbl.index_count) {
+ return(NULL);
+ }
+
+ return(share->idx_trans_tbl.index_mapping[keynr]);
+}
+
+/************************************************************************
+Set the autoinc column max value. This should only be called once from
+ha_innobase::open(). Therefore there's no need for a covering lock. */
+UNIV_INTERN
+void
+ha_innobase::innobase_initialize_autoinc()
+/*======================================*/
+{
+ ulonglong auto_inc;
+ const Field* field = table->found_next_number_field;
+
+ if (field != NULL) {
+ auto_inc = innobase_get_int_col_max_value(field);
+ } else {
+		/* We have no idea what's been passed in to us as the
+		autoinc column. We set it to 0, effectively disabling
+		updates to the table. */
+ auto_inc = 0;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Unable to determine the AUTOINC "
+ "column name\n");
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+ /* If the recovery level is set so high that writes
+ are disabled we force the AUTOINC counter to 0
+ value effectively disabling writes to the table.
+ Secondly, we avoid reading the table in case the read
+ results in failure due to a corrupted table/index.
+
+ We will not return an error to the client, so that the
+ tables can be dumped with minimal hassle. If an error
+ were returned in this case, the first attempt to read
+ the table would fail and subsequent SELECTs would succeed. */
+ auto_inc = 0;
+ } else if (field == NULL) {
+ /* This is a far more serious error, best to avoid
+ opening the table and return failure. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ } else {
+ dict_index_t* index;
+ const char* col_name;
+ ulonglong read_auto_inc;
+ ulint err;
+
+ update_thd(ha_thd());
+
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ col_name = field->field_name;
+ index = innobase_get_index(table->s->next_number_index);
+
+ /* Execute SELECT MAX(col_name) FROM TABLE; */
+ err = row_search_max_autoinc(index, col_name, &read_auto_inc);
+
+ switch (err) {
+ case DB_SUCCESS: {
+ ulonglong col_max_value;
+
+ col_max_value = innobase_get_int_col_max_value(field);
+
+			/* At this stage we do not know the increment
+			nor the offset, so use a default increment of 1. */
+
+ auto_inc = innobase_next_autoinc(
+ read_auto_inc, 1, 1, col_max_value);
+
+ break;
+ }
+ case DB_RECORD_NOT_FOUND:
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: MySQL and InnoDB data "
+ "dictionaries are out of sync.\n"
+ "InnoDB: Unable to find the AUTOINC column "
+ "%s in the InnoDB table %s.\n"
+ "InnoDB: We set the next AUTOINC column "
+ "value to 0,\n"
+ "InnoDB: in effect disabling the AUTOINC "
+ "next value generation.\n"
+ "InnoDB: You can either set the next "
+ "AUTOINC value explicitly using ALTER TABLE\n"
+ "InnoDB: or fix the data dictionary by "
+ "recreating the table.\n",
+ col_name, index->table->name);
+
+ /* This will disable the AUTOINC generation. */
+ auto_inc = 0;
+
+ /* We want the open to succeed, so that the user can
+ take corrective action. ie. reads should succeed but
+ updates should fail. */
+ err = DB_SUCCESS;
+ break;
+ default:
+ /* row_search_max_autoinc() should only return
+ one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
+ ut_error;
+ }
+ }
+
+ dict_table_autoinc_initialize(prebuilt->table, auto_inc);
+}
+
+/*****************************************************************//**
+Creates and opens a handle to a table which already exists in an InnoDB
+database.
+@return 1 if error, 0 if success */
+UNIV_INTERN
+int
+ha_innobase::open(
+/*==============*/
+ const char* name, /*!< in: table name */
+ int mode, /*!< in: not used */
+ uint test_if_locked) /*!< in: not used */
+{
+ dict_table_t* ib_table;
+ char norm_name[1000];
+ THD* thd;
+ ulint retries = 0;
+ char* is_part = NULL;
+
+ DBUG_ENTER("ha_innobase::open");
+
+ UT_NOT_USED(mode);
+ UT_NOT_USED(test_if_locked);
+
+ thd = ha_thd();
+
+ /* Under some cases MySQL seems to call this function while
+ holding btr_search_latch. This breaks the latching order as
+ we acquire dict_sys->mutex below and leads to a deadlock. */
+ if (thd != NULL) {
+ innobase_release_temporary_latches(ht, thd);
+ }
+
+ normalize_table_name(norm_name, name);
+
+ user_thd = NULL;
+
+ if (!(share=get_share(name))) {
+
+ DBUG_RETURN(1);
+ }
+
+ if (share->ib_table && share->ib_table->is_corrupt) {
+ free_share(share);
+
+ DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+ }
+
+	/* Create buffers for packing the fields of a record. Why did
+	table->stored_rec_length not work here? Because CHAR fields, when
+	packed, actually become one byte longer when we also store the
+	string length as the first byte. */
+
+ upd_and_key_val_buff_len =
+ table->s->stored_rec_length + table->s->max_key_length
+ + MAX_REF_PARTS * 3;
+ if (!(uchar*) my_multi_malloc(MYF(MY_WME),
+ &upd_buff, upd_and_key_val_buff_len,
+ &key_val_buff, upd_and_key_val_buff_len,
+ NullS)) {
+ free_share(share);
+
+ DBUG_RETURN(1);
+ }
+
+ /* We look for pattern #P# to see if the table is partitioned
+ MySQL table. The retry logic for partitioned tables is a
+ workaround for http://bugs.mysql.com/bug.php?id=33349. Look
+ at support issue https://support.mysql.com/view.php?id=21080
+ for more details. */
+ is_part = strstr(norm_name, "#P#");
+retry:
+ /* Get pointer to a table object in InnoDB dictionary cache */
+ ib_table = dict_table_get(norm_name, TRUE);
+
+ if (ib_table && ib_table->is_corrupt) {
+ free_share(share);
+ my_free(upd_buff, MYF(0));
+
+ DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+ }
+
+ if (share->ib_table) {
+ ut_a(share->ib_table == ib_table);
+ } else {
+ share->ib_table = ib_table;
+ }
+
+ if (NULL == ib_table) {
+ if (is_part && retries < 10) {
+ ++retries;
+ os_thread_sleep(100000);
+ goto retry;
+ }
+
+ if (is_part) {
+ sql_print_error("Failed to open table %s after "
+ "%lu attempts.\n", norm_name,
+ retries);
+ }
+
+ sql_print_error("Cannot find or open table %s from\n"
+ "the internal data dictionary of InnoDB "
+ "though the .frm file for the\n"
+ "table exists. Maybe you have deleted and "
+ "recreated InnoDB data\n"
+ "files but have forgotten to delete the "
+ "corresponding .frm files\n"
+ "of InnoDB tables, or you have moved .frm "
+ "files to another database?\n"
+ "or, the table contains indexes that this "
+ "version of the engine\n"
+ "doesn't support.\n"
+ "See " REFMAN "innodb-troubleshooting.html\n"
+ "how you can resolve the problem.\n",
+ norm_name);
+ free_share(share);
+ my_free(upd_buff, MYF(0));
+ my_errno = ENOENT;
+
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
+
+ if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
+ sql_print_error("MySQL is trying to open a table handle but "
+ "the .ibd file for\ntable %s does not exist.\n"
+ "Have you deleted the .ibd file from the "
+ "database directory under\nthe MySQL datadir, "
+ "or have you used DISCARD TABLESPACE?\n"
+ "See " REFMAN "innodb-troubleshooting.html\n"
+ "how you can resolve the problem.\n",
+ norm_name);
+ free_share(share);
+ my_free(upd_buff, MYF(0));
+ my_errno = ENOENT;
+
+ dict_table_decrement_handle_count(ib_table, FALSE);
+ DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+ }
+
+ prebuilt = row_create_prebuilt(ib_table);
+
+	prebuilt->mysql_row_len = table->s->stored_rec_length;
+ prebuilt->default_rec = table->s->default_values;
+ ut_ad(prebuilt->default_rec);
+
+ /* Looks like MySQL-3.23 sometimes has primary key number != 0 */
+
+ primary_key = table->s->primary_key;
+ key_used_on_scan = primary_key;
+
+ if (!innobase_build_index_translation(table, ib_table, share)) {
+		sql_print_error("Build InnoDB index translation table for"
+				" table %s failed", name);
+ }
+
+ /* Allocate a buffer for a 'row reference'. A row reference is
+ a string of bytes of length ref_length which uniquely specifies
+ a row in our table. Note that MySQL may also compare two row
+ references for equality by doing a simple memcmp on the strings
+ of length ref_length! */
+
+ if (!row_table_got_default_clust_index(ib_table)) {
+ prebuilt->clust_index_was_generated = FALSE;
+
+ if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
+ sql_print_error("Table %s has a primary key in "
+ "InnoDB data dictionary, but not "
+ "in MySQL!", name);
+
+			/* This mismatch could cause further problems
+			if not attended to; bring it to the user's attention
+			by printing a warning in addition to logging a
+			message in the error log */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has a "
+ "primary key in InnoDB data "
+ "dictionary, but not in "
+ "MySQL!", name);
+
+			/* If primary_key >= MAX_KEY, its (primary_key)
+			value could be out of bounds if we continue to
+			index into the key_info[] array. Find the InnoDB
+			primary index, and assign its key_length to
+			ref_length.
+			In addition, since MySQL indexes are sorted starting
+			with the primary index, unique indexes etc.,
+			initialize ref_length to the first index key length
+			in case we fail to find the InnoDB clustered index.
+
+			Please note, this will not resolve the primary
+			index mismatch problem; other side effects are
+			possible if users continue to use the table.
+			However, we allow this table to be opened so
+			that the user can take corrective measures for the
+			mismatch while the table data remains accessible. */
+ ref_length = table->key_info[0].key_length;
+
+			/* Find the corresponding clustered index
+			key length in MySQL's key_info[] array */
+ for (ulint i = 0; i < table->s->keys; i++) {
+ dict_index_t* index;
+ index = innobase_get_index(i);
+ if (dict_index_is_clust(index)) {
+ ref_length =
+ table->key_info[i].key_length;
+ }
+ }
+ } else {
+ /* MySQL allocates the buffer for ref.
+ key_info->key_length includes space for all key
+ columns + one byte for each column that may be
+ NULL. ref_length must be as exact as possible to
+ save space, because all row reference buffers are
+ allocated based on ref_length. */
+
+ ref_length = table->key_info[primary_key].key_length;
+ }
+ } else {
+ if (primary_key != MAX_KEY) {
+ sql_print_error(
+ "Table %s has no primary key in InnoDB data "
+ "dictionary, but has one in MySQL! If you "
+ "created the table with a MySQL version < "
+ "3.23.54 and did not define a primary key, "
+ "but defined a unique key with all non-NULL "
+ "columns, then MySQL internally treats that "
+ "key as the primary key. You can fix this "
+ "error by dump + DROP + CREATE + reimport "
+ "of the table.", name);
+
+			/* This mismatch could cause further problems
+			if not attended to; bring it to the user's attention
+			by printing a warning in addition to logging a
+			message in the error log */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NO_SUCH_INDEX,
+ "InnoDB: Table %s has no "
+ "primary key in InnoDB data "
+ "dictionary, but has one in "
+ "MySQL!", name);
+ }
+
+ prebuilt->clust_index_was_generated = TRUE;
+
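+		/* The row reference is InnoDB's internally generated
+		DB_ROW_ID, which is DATA_ROW_ID_LEN (6) bytes long. */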
+ ref_length = DATA_ROW_ID_LEN;
+
+ /* If we automatically created the clustered index, then
+ MySQL does not know about it, and MySQL must NOT be aware
+ of the index used on scan, to make it avoid checking if we
+ update the column of the index. That is why we assert below
+ that key_used_on_scan is the undefined value MAX_KEY.
+		The column is the row id in the automatic generation case,
+		and it will never be updated anyway. */
+
+ if (key_used_on_scan != MAX_KEY) {
+ sql_print_warning(
+ "Table %s key_used_on_scan is %lu even "
+ "though there is no primary key inside "
+ "InnoDB.", name, (ulong) key_used_on_scan);
+ }
+ }
+
+ /* Index block size in InnoDB: used by MySQL in query optimization */
+ stats.block_size = 16 * 1024;
+
+ /* Init table lock structure */
+	thr_lock_data_init(&share->lock, &lock, (void*) 0);
+
+ if (prebuilt->table) {
+ /* We update the highest file format in the system table
+ space, if this table has higher file format setting. */
+
+ trx_sys_file_format_max_upgrade(
+ (const char**) &innobase_file_format_check,
+ dict_table_get_format(prebuilt->table));
+ }
+
+ /* Only if the table has an AUTOINC column. */
+ if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
+ dict_table_autoinc_lock(prebuilt->table);
+
+ /* Since a table can already be "open" in InnoDB's internal
+ data dictionary, we only init the autoinc counter once, the
+ first time the table is loaded. We can safely reuse the
+ autoinc value from a previous MySQL open. */
+ if (dict_table_autoinc_read(prebuilt->table) == 0) {
+
+ innobase_initialize_autoinc();
+ }
+
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+
+ info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN
+uint
+ha_innobase::max_supported_key_part_length() const
+{
+ return(DICT_MAX_INDEX_COL_LEN - 1);
+}
+
+/******************************************************************//**
+Closes a handle to an InnoDB table.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::close(void)
+/*====================*/
+{
+ THD* thd;
+
+ DBUG_ENTER("ha_innobase::close");
+
+ thd = ha_thd();
+ if (thd != NULL) {
+ innobase_release_temporary_latches(ht, thd);
+ }
+
+ row_prebuilt_free(prebuilt, FALSE);
+
+ my_free(upd_buff, MYF(0));
+ free_share(share);
+
+ /* Tell InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ DBUG_RETURN(0);
+}
+
+/* The following accessor functions should really be inside MySQL code! */
+
+/**************************************************************//**
+Gets field offset for a field in a table.
+@return offset */
+static inline
+uint
+get_field_offset(
+/*=============*/
+ TABLE* table, /*!< in: MySQL table object */
+ Field* field) /*!< in: MySQL field object */
+{
+ return((uint) (field->ptr - table->record[0]));
+}
+
+/**************************************************************//**
+Checks if a field in a record is SQL NULL. Uses the record format
+information in table to track the null bit in record.
+@return 1 if NULL, 0 otherwise */
+static inline
+uint
+field_in_record_is_null(
+/*====================*/
+ TABLE* table, /*!< in: MySQL table object */
+ Field* field, /*!< in: MySQL field object */
+ char* record) /*!< in: a row in MySQL format */
+{
+ int null_offset;
+
+ if (!field->null_ptr) {
+
+ return(0);
+ }
+
+ null_offset = (uint) ((char*) field->null_ptr
+ - (char*) table->record[0]);
+
+ if (record[null_offset] & field->null_bit) {
+
+ return(1);
+ }
+
+ return(0);
+}
+
+/**************************************************************//**
+Sets a field in a record to SQL NULL. Uses the record format
+information in table to track the null bit in record. */
+static inline
+void
+set_field_in_record_to_null(
+/*========================*/
+ TABLE* table, /*!< in: MySQL table object */
+ Field* field, /*!< in: MySQL field object */
+	char*	record)	/*!< in/out: a row in MySQL format */
+{
+ int null_offset;
+
+ null_offset = (uint) ((char*) field->null_ptr
+ - (char*) table->record[0]);
+
+ record[null_offset] = record[null_offset] | field->null_bit;
+}
+
+/*************************************************************//**
+InnoDB uses this function to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. NOTE that the prototype
+of this function is in rem0cmp.c in InnoDB source code! If you change this
+function, remember to update the prototype there!
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+extern "C" UNIV_INTERN
+int
+innobase_mysql_cmp(
+/*===============*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length) /*!< in: data field length,
+ not UNIV_SQL_NULL */
+{
+ CHARSET_INFO* charset;
+ enum_field_types mysql_tp;
+ int ret;
+
+ DBUG_ASSERT(a_length != UNIV_SQL_NULL);
+ DBUG_ASSERT(b_length != UNIV_SQL_NULL);
+
+ mysql_tp = (enum_field_types) mysql_type;
+
+ switch (mysql_tp) {
+
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_VARCHAR:
+ /* Use the charset number to pick the right charset struct for
+ the comparison. Since the MySQL function get_charset may be
+ slow before Bar removes the mutex operation there, we first
+ look at 2 common charsets directly. */
+
+ if (charset_number == default_charset_info->number) {
+ charset = default_charset_info;
+ } else if (charset_number == my_charset_latin1.number) {
+ charset = &my_charset_latin1;
+ } else {
+ charset = get_charset(charset_number, MYF(MY_WME));
+
+ if (charset == NULL) {
+ sql_print_error("InnoDB needs charset %lu for doing "
+ "a comparison, but MySQL cannot "
+ "find that charset.",
+ (ulong) charset_number);
+ ut_a(0);
+ }
+ }
+
+ /* Starting from 4.1.3, we use strnncollsp() in comparisons of
+ non-latin1_swedish_ci strings. NOTE that the collation order
+ changes then: 'b\0\0...' is ordered BEFORE 'b ...'. Users
+ having indexes on such data need to rebuild their tables! */
+
+ ret = charset->coll->strnncollsp(charset,
+ a, a_length,
+ b, b_length, 0);
+ if (ret < 0) {
+ return(-1);
+ } else if (ret > 0) {
+ return(1);
+ } else {
+ return(0);
+ }
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/**************************************************************//**
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@return DATA_BINARY, DATA_VARCHAR, ... */
+extern "C" UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+ ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an
+ 'unsigned type';
+ at least ENUM and SET,
+ and unsigned integer
+ types are 'unsigned types' */
+ const void* f) /*!< in: MySQL Field */
+{
+ const class Field* field = reinterpret_cast<const class Field*>(f);
+
+ /* The following asserts try to check that the MySQL type code fits in
+ 8 bits: this is used in ibuf and also when DATA_NOT_NULL is ORed to
+ the type */
+
+ DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256);
+ DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
+
+ if (field->flags & UNSIGNED_FLAG) {
+
+ *unsigned_flag = DATA_UNSIGNED;
+ } else {
+ *unsigned_flag = 0;
+ }
+
+ if (field->real_type() == MYSQL_TYPE_ENUM
+ || field->real_type() == MYSQL_TYPE_SET) {
+
+		/* MySQL reports field->type() as a string type for these,
+		but the data is actually internally stored as an unsigned
+		integer code! */
+
+ *unsigned_flag = DATA_UNSIGNED; /* MySQL has its own unsigned
+ flag set to zero, even though
+ internally this is an unsigned
+ integer type */
+ return(DATA_INT);
+ }
+
+ switch (field->type()) {
+ /* NOTE that we only allow string types in DATA_MYSQL and
+ DATA_VARMYSQL */
+ case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
+ case MYSQL_TYPE_VARCHAR: /* new >= 5.0.3 true VARCHAR */
+ if (field->binary()) {
+ return(DATA_BINARY);
+ } else if (strcmp(
+ field->charset()->name,
+ "latin1_swedish_ci") == 0) {
+ return(DATA_VARCHAR);
+ } else {
+ return(DATA_VARMYSQL);
+ }
+ case MYSQL_TYPE_BIT:
+ case MYSQL_TYPE_STRING: if (field->binary()) {
+
+ return(DATA_FIXBINARY);
+ } else if (strcmp(
+ field->charset()->name,
+ "latin1_swedish_ci") == 0) {
+ return(DATA_CHAR);
+ } else {
+ return(DATA_MYSQL);
+ }
+ case MYSQL_TYPE_NEWDECIMAL:
+ return(DATA_FIXBINARY);
+ case MYSQL_TYPE_LONG:
+ case MYSQL_TYPE_LONGLONG:
+ case MYSQL_TYPE_TINY:
+ case MYSQL_TYPE_SHORT:
+ case MYSQL_TYPE_INT24:
+ case MYSQL_TYPE_DATE:
+ case MYSQL_TYPE_DATETIME:
+ case MYSQL_TYPE_YEAR:
+ case MYSQL_TYPE_NEWDATE:
+ case MYSQL_TYPE_TIME:
+ case MYSQL_TYPE_TIMESTAMP:
+ return(DATA_INT);
+ case MYSQL_TYPE_FLOAT:
+ return(DATA_FLOAT);
+ case MYSQL_TYPE_DOUBLE:
+ return(DATA_DOUBLE);
+ case MYSQL_TYPE_DECIMAL:
+ return(DATA_DECIMAL);
+ case MYSQL_TYPE_GEOMETRY:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ return(DATA_BLOB);
+ case MYSQL_TYPE_NULL:
+ /* MySQL currently accepts "NULL" datatype, but will
+ reject such datatype in the next release. We will cope
+ with it and not trigger assertion failure in 5.1 */
+ break;
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Writes an unsigned integer value < 64k to 2 bytes, in the little-endian
+storage format. */
+static inline
+void
+innobase_write_to_2_little_endian(
+/*==============================*/
+ byte* buf, /*!< in: where to store */
+ ulint val) /*!< in: value to write, must be < 64k */
+{
+ ut_a(val < 256 * 256);
+
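+	/* For example, val = 1000 (0x03E8) is stored as buf[0] = 0xE8 and
+	buf[1] = 0x03. */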
+ buf[0] = (byte)(val & 0xFF);
+ buf[1] = (byte)(val / 256);
+}
+
+/*******************************************************************//**
+Reads an unsigned integer value < 64k from 2 bytes, in the little-endian
+storage format.
+@return value */
+static inline
+uint
+innobase_read_from_2_little_endian(
+/*===============================*/
+ const uchar* buf) /*!< in: from where to read */
+{
+ return (uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])));
+}
+
+/*******************************************************************//**
+Stores a key value for a row to a buffer.
+@return key value length as stored in buff */
+UNIV_INTERN
+uint
+ha_innobase::store_key_val_for_row(
+/*===============================*/
+ uint keynr, /*!< in: key number */
+ char* buff, /*!< in/out: buffer for the key value (in MySQL
+ format) */
+ uint buff_len,/*!< in: buffer length */
+ const uchar* record)/*!< in: row in MySQL format */
+{
+ KEY* key_info = table->key_info + keynr;
+ KEY_PART_INFO* key_part = key_info->key_part;
+ KEY_PART_INFO* end = key_part + key_info->key_parts;
+ char* buff_start = buff;
+ enum_field_types mysql_type;
+ Field* field;
+ ibool is_null;
+
+ DBUG_ENTER("store_key_val_for_row");
+
+ /* The format for storing a key field in MySQL is the following:
+
+ 1. If the column can be NULL, then in the first byte we put 1 if the
+ field value is NULL, 0 otherwise.
+
+ 2. If the column is of a BLOB type (it must be a column prefix field
+ in this case), then we put the length of the data in the field to the
+ next 2 bytes, in the little-endian format. If the field is SQL NULL,
+ then these 2 bytes are set to 0. Note that the length of data in the
+ field is <= column prefix length.
+
+ 3. In a column prefix field, prefix_len next bytes are reserved for
+ data. In a normal field the max field length next bytes are reserved
+ for data. For a VARCHAR(n) the max field length is n. If the stored
+ value is the SQL NULL then these data bytes are set to 0.
+
+ 4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that
+ in the MySQL row format, the length is stored in 1 or 2 bytes,
+ depending on the maximum allowed length. But in the MySQL key value
+ format, the length always takes 2 bytes.
+
+ We have to zero-fill the buffer so that MySQL is able to use a
+ simple memcmp to compare two key values to determine if they are
+ equal. MySQL does this to compare contents of two 'ref' values. */
+
+ bzero(buff, buff_len);
+
+ for (; key_part != end; key_part++) {
+ is_null = FALSE;
+
+ if (key_part->null_bit) {
+ if (record[key_part->null_offset]
+ & key_part->null_bit) {
+ *buff = 1;
+ is_null = TRUE;
+ } else {
+ *buff = 0;
+ }
+ buff++;
+ }
+
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* >= 5.0.3 true VARCHAR */
+ ulint lenlen;
+ ulint len;
+ const byte* data;
+ ulint key_len;
+ ulint true_len;
+ CHARSET_INFO* cs;
+ int error=0;
+
+ key_len = key_part->length;
+
+ if (is_null) {
+ buff += key_len + 2;
+
+ continue;
+ }
+ cs = field->charset();
+
+ lenlen = (ulint)
+ (((Field_varstring*)field)->length_bytes);
+
+ data = row_mysql_read_true_varchar(&len,
+ (byte*) (record
+ + (ulint)get_field_offset(table, field)),
+ lenlen);
+
+ true_len = len;
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) cs->cset->well_formed_len(cs,
+ (const char *) data,
+ (const char *) data + len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ /* In a column prefix index, we may need to truncate
+ the stored value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ /* The length in a key value is always stored in 2
+ bytes */
+
+ row_mysql_store_true_var_len((byte*)buff, true_len, 2);
+ buff += 2;
+
+ memcpy(buff, data, true_len);
+
+ /* Note that we always reserve the maximum possible
+ length of the true VARCHAR in the key value, though
+ only the first len bytes after the 2 length bytes contain
+ actual data. The rest of the space was reset to zero
+ in the bzero() call above. */
+
+ buff += key_len;
+
+ } else if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB
+ /* MYSQL_TYPE_GEOMETRY data is treated
+ as BLOB data in innodb. */
+ || mysql_type == MYSQL_TYPE_GEOMETRY) {
+
+ CHARSET_INFO* cs;
+ ulint key_len;
+ ulint true_len;
+ int error=0;
+ ulint blob_len;
+ const byte* blob_data;
+
+ ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+
+ key_len = key_part->length;
+
+ if (is_null) {
+ buff += key_len + 2;
+
+ continue;
+ }
+
+ cs = field->charset();
+
+ blob_data = row_mysql_read_blob_ref(&blob_len,
+ (byte*) (record
+ + (ulint)get_field_offset(table, field)),
+ (ulint) field->pack_length());
+
+ true_len = blob_len;
+
+ ut_a(get_field_offset(table, field)
+ == key_part->offset);
+
+ /* For multi byte character sets we need to calculate
+ the true length of the key */
+
+ if (blob_len > 0 && cs->mbmaxlen > 1) {
+ true_len = (ulint) cs->cset->well_formed_len(cs,
+ (const char *) blob_data,
+ (const char *) blob_data
+ + blob_len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+
+ /* All indexes on BLOB and TEXT are column prefix
+ indexes, and we may need to truncate the data to be
+ stored in the key value: */
+
+ if (true_len > key_len) {
+ true_len = key_len;
+ }
+
+ /* MySQL reserves 2 bytes for the length and the
+ storage of the number is little-endian */
+
+ innobase_write_to_2_little_endian(
+ (byte*)buff, true_len);
+ buff += 2;
+
+ memcpy(buff, blob_data, true_len);
+
+ /* Note that we always reserve the maximum possible
+ length of the BLOB prefix in the key value. */
+
+ buff += key_len;
+ } else {
+ /* Here we handle all other data types except the
+ true VARCHAR, BLOB and TEXT. Note that the column
+ value we store may be also in a column prefix
+ index. */
+
+ CHARSET_INFO* cs;
+ ulint true_len;
+ ulint key_len;
+ const uchar* src_start;
+ int error=0;
+ enum_field_types real_type;
+
+ key_len = key_part->length;
+
+ if (is_null) {
+ buff += key_len;
+
+ continue;
+ }
+
+ src_start = record + key_part->offset;
+ real_type = field->real_type();
+ true_len = key_len;
+
+ /* A character set is defined only for fields whose type
+ is a string type and whose real field type is not enum
+ or set. For these fields, check whether the character
+ set is multi-byte. */
+
+ if (real_type != MYSQL_TYPE_ENUM
+ && real_type != MYSQL_TYPE_SET
+ && ( mysql_type == MYSQL_TYPE_VAR_STRING
+ || mysql_type == MYSQL_TYPE_STRING)) {
+
+ cs = field->charset();
+
+ /* For multi byte character sets we need to
+ calculate the true length of the key */
+
+ if (key_len > 0 && cs->mbmaxlen > 1) {
+
+ true_len = (ulint)
+ cs->cset->well_formed_len(cs,
+ (const char *)src_start,
+ (const char *)src_start
+ + key_len,
+ (uint) (key_len /
+ cs->mbmaxlen),
+ &error);
+ }
+ }
+
+ memcpy(buff, src_start, true_len);
+ buff += true_len;
+
+ /* Pad the unused space with spaces. Note that no
+ padding is ever needed for UCS-2 because in MySQL,
+ all UCS2 characters are 2 bytes, as MySQL does not
+ support surrogate pairs, which are needed to represent
+ characters in the range U+10000 to U+10FFFF. */
+
+ if (true_len < key_len) {
+ ulint pad_len = key_len - true_len;
+ memset(buff, ' ', pad_len);
+ buff += pad_len;
+ }
+ }
+ }
+
+ ut_a(buff <= buff_start + buff_len);
+
+ DBUG_RETURN((uint)(buff - buff_start));
+}
+
+/**************************************************************//**
+Builds a 'template' in the prebuilt struct. The template is used for fast
+retrieval of just those column values MySQL needs in its processing. */
+static
+void
+build_template(
+/*===========*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct */
+ THD* thd, /*!< in: current user thread, used
+ only if templ_type is
+ ROW_MYSQL_REC_FIELDS */
+ TABLE* table, /* in: MySQL table */
+ ha_innobase* file, /* in: ha_innobase handler */
+ uint templ_type) /* in: ROW_MYSQL_WHOLE_ROW or
+ ROW_MYSQL_REC_FIELDS */
+{
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ mysql_row_templ_t* templ;
+ Field* field;
+ ulint n_fields, n_stored_fields;
+ ulint n_requested_fields = 0;
+ ibool fetch_all_in_key = FALSE;
+ ibool fetch_primary_key_cols = FALSE;
+ ulint sql_idx, innodb_idx=0;
+ /* byte offset of the end of last requested column */
+ ulint mysql_prefix_len = 0;
+ ibool do_idx_cond_push= FALSE;
+ ibool need_second_pass= FALSE;
+
+ if (prebuilt->select_lock_type == LOCK_X) {
+ /* We always retrieve the whole clustered index record if we
+ use exclusive row level locks, for example, if the read is
+ done in an UPDATE statement. */
+
+ templ_type = ROW_MYSQL_WHOLE_ROW;
+ }
+
+ if (templ_type == ROW_MYSQL_REC_FIELDS) {
+ if (prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_ALL_COLS) {
+
+ /* We know we must at least fetch all columns in the
+ key, or all columns in the table */
+
+ if (prebuilt->read_just_key) {
+ /* MySQL has instructed us that it is enough
+ to fetch the columns in the key; looks like
+ MySQL can set this flag also when there is
+ only a prefix of the column in the key: in
+ that case we retrieve the whole column from
+ the clustered index */
+
+ fetch_all_in_key = TRUE;
+ } else {
+ templ_type = ROW_MYSQL_WHOLE_ROW;
+ }
+ } else if (prebuilt->hint_need_to_fetch_extra_cols
+ == ROW_RETRIEVE_PRIMARY_KEY) {
+ /* We must at least fetch all primary key cols. Note
+ that if the clustered index was internally generated
+ by InnoDB on the row id (no primary key was
+ defined), then row_search_for_mysql() will always
+ retrieve the row id to a special buffer in the
+ prebuilt struct. */
+
+ fetch_primary_key_cols = TRUE;
+ }
+ }
+
+ clust_index = dict_table_get_first_index(prebuilt->table);
+
+ if (templ_type == ROW_MYSQL_REC_FIELDS) {
+ index = prebuilt->index;
+ } else {
+ index = clust_index;
+ }
+
+ if (index == clust_index) {
+ prebuilt->need_to_access_clustered = TRUE;
+ } else {
+ prebuilt->need_to_access_clustered = FALSE;
+ /* Below we check column by column if we need to access
+ the clustered index */
+ }
+
+ n_fields = (ulint)table->s->fields; /* number of columns */
+ n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
+
+ if (!prebuilt->mysql_template) {
+ prebuilt->mysql_template = (mysql_row_templ_t*)
+ mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
+ }
+
+ prebuilt->template_type = templ_type;
+ prebuilt->null_bitmap_len = table->s->null_bytes;
+
+ prebuilt->templ_contains_blob = FALSE;
+
+
+ /*
+ Setup index condition pushdown (note: we don't need to check if
+ this is a scan on primary key as that is checked in idx_cond_push)
+ */
+ if (file->active_index == file->pushed_idx_cond_keyno &&
+ file->active_index != MAX_KEY &&
+ templ_type == ROW_MYSQL_REC_FIELDS)
+ do_idx_cond_push= need_second_pass= TRUE;
+
+ /* Note that in InnoDB, innodb_idx is the column number. MySQL calls
+ columns 'fields'. */
+ for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
+ templ = prebuilt->mysql_template + n_requested_fields;
+ field = table->field[sql_idx];
+ if (!field->stored_in_db)
+ goto skip_field;
+
+ if (UNIV_LIKELY(templ_type == ROW_MYSQL_REC_FIELDS)) {
+ /* Decide which columns we should fetch
+ and which we can skip. */
+ register const ibool index_contains_field =
+ dict_index_contains_col_or_prefix(index, innodb_idx);
+ register const ibool index_covers_field =
+ field->part_of_key.is_set(file->active_index);
+
+ if (!index_contains_field && prebuilt->read_just_key) {
+ /* If this is a 'key read', we do not need
+ columns that are not in the key */
+
+ goto skip_field;
+ }
+
+ if (index_contains_field && fetch_all_in_key) {
+ /* This field is needed in the query */
+
+ goto include_field;
+ }
+
+ if (bitmap_is_set(table->read_set, sql_idx) ||
+ bitmap_is_set(table->write_set, sql_idx)) {
+ /* This field is needed in the query */
+
+ goto include_field;
+ }
+
+ if (fetch_primary_key_cols
+ && dict_table_col_in_clustered_key(
+ index->table, innodb_idx)) {
+ /* This field is needed in the query */
+
+ goto include_field;
+ }
+
+ /* This field is not needed in the query, skip it */
+
+ goto skip_field;
+include_field:
+ if (do_idx_cond_push &&
+ ((need_second_pass && !index_covers_field) ||
+ (!need_second_pass && index_covers_field)))
+ goto skip_field;
+ }
+ n_requested_fields++;
+
+ templ->col_no = innodb_idx;
+
+ if (index == clust_index) {
+ templ->rec_field_no = dict_col_get_clust_pos(
+ &index->table->cols[innodb_idx], index);
+ } else {
+ templ->rec_field_no = dict_index_get_nth_col_pos(
+ index, innodb_idx);
+ }
+
+ if (templ->rec_field_no == ULINT_UNDEFINED) {
+ prebuilt->need_to_access_clustered = TRUE;
+ }
+
+ if (field->null_ptr) {
+ templ->mysql_null_byte_offset =
+ (ulint) ((char*) field->null_ptr
+ - (char*) table->record[0]);
+
+ templ->mysql_null_bit_mask = (ulint) field->null_bit;
+ } else {
+ templ->mysql_null_bit_mask = 0;
+ }
+
+ templ->mysql_col_offset = (ulint)
+ get_field_offset(table, field);
+
+ templ->mysql_col_len = (ulint) field->pack_length();
+ if (mysql_prefix_len < templ->mysql_col_offset
+ + templ->mysql_col_len) {
+ mysql_prefix_len = templ->mysql_col_offset
+ + templ->mysql_col_len;
+ }
+ templ->type = index->table->cols[innodb_idx].mtype;
+ templ->mysql_type = (ulint)field->type();
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ templ->mysql_length_bytes = (ulint)
+ (((Field_varstring*)field)->length_bytes);
+ }
+
+ templ->charset = dtype_get_charset_coll(
+ index->table->cols[innodb_idx].prtype);
+ templ->mbminlen = index->table->cols[innodb_idx].mbminlen;
+ templ->mbmaxlen = index->table->cols[innodb_idx].mbmaxlen;
+ templ->is_unsigned = index->table->cols[innodb_idx].prtype
+ & DATA_UNSIGNED;
+ if (templ->type == DATA_BLOB) {
+ prebuilt->templ_contains_blob = TRUE;
+ }
+skip_field:
+ if (need_second_pass && (sql_idx+1 == n_fields))
+ {
+ prebuilt->n_index_fields= n_requested_fields;
+ need_second_pass= FALSE;
+ sql_idx= (~(ulint)0); /* to start from 0 */
+ innodb_idx= (~(ulint)0); /* to start from 0 */
+ }
+ if (field->stored_in_db) {
+ innodb_idx++;
+ }
+ }
+
+ prebuilt->n_template = n_requested_fields;
+ prebuilt->mysql_prefix_len = mysql_prefix_len;
+
+ if (do_idx_cond_push)
+ {
+ prebuilt->idx_cond_func= index_cond_func_innodb;
+ prebuilt->idx_cond_func_arg= file;
+ }
+ else
+ {
+ prebuilt->idx_cond_func= NULL;
+ prebuilt->n_index_fields= n_requested_fields;
+ }
+
+ if (index != clust_index && prebuilt->need_to_access_clustered) {
+ /* Change rec_field_no's to correspond to the clustered index
+ record */
+ for (ulint i = do_idx_cond_push? prebuilt->n_index_fields : 0;
+ i < n_requested_fields; i++) {
+ templ = prebuilt->mysql_template + i;
+ templ->rec_field_no = dict_col_get_clust_pos(
+ &index->table->cols[templ->col_no],
+ clust_index);
+ }
+ }
+}
+
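+/* A rough example of what build_template() produces (assuming a table
+t(a INT PRIMARY KEY, b INT, c INT) and a query that only reads column b
+through the clustered index): prebuilt->mysql_template then contains a
+single mysql_row_templ_t entry for b, recording its position in the
+clustered index record (rec_field_no), its offset and length in the MySQL
+row buffer (mysql_col_offset, mysql_col_len) and its NULL byte and bit,
+so that the row search routines can convert just that column. */
+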
+/********************************************************************//**
+This special handling is really to overcome the limitations of MySQL's
+binlogging. We need to eliminate the non-determinism that will arise in
+INSERT ... SELECT type of statements, since MySQL binlog only stores the
+min value of the autoinc interval. Once that is fixed we can get rid of
+the special lock handling.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_lock_autoinc(void)
+/*====================================*/
+{
+ ulint error = DB_SUCCESS;
+
+ switch (innobase_autoinc_lock_mode) {
+ case AUTOINC_NO_LOCKING:
+ /* Acquire only the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ break;
+
+ case AUTOINC_NEW_STYLE_LOCKING:
+ /* For simple (single/multi) row INSERTs, we fall back to the
+ old style only if another transaction has already acquired
+ the AUTOINC lock on behalf of a LOAD DATA INFILE or
+ INSERT ... SELECT etc. type of statement. */
+ if (thd_sql_command(user_thd) == SQLCOM_INSERT
+ || thd_sql_command(user_thd) == SQLCOM_REPLACE) {
+ dict_table_t* table = prebuilt->table;
+
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(table);
+
+ /* We need to check that another transaction isn't
+ already holding the AUTOINC lock on the table. */
+ if (table->n_waiting_or_granted_auto_inc_locks) {
+ /* Release the mutex to avoid deadlocks. */
+ dict_table_autoinc_unlock(table);
+ } else {
+ break;
+ }
+ }
+ /* Fall through to old style locking. */
+
+ case AUTOINC_OLD_STYLE_LOCKING:
+ error = row_lock_table_autoinc_for_mysql(prebuilt);
+
+ if (error == DB_SUCCESS) {
+
+ /* Acquire the AUTOINC mutex. */
+ dict_table_autoinc_lock(prebuilt->table);
+ }
+ break;
+
+ default:
+ ut_error;
+ }
+
+ return(ulong(error));
+}
+
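+/* The cases above correspond to the server option
+innodb_autoinc_lock_mode; as a rule of thumb, 0 selects
+AUTOINC_OLD_STYLE_LOCKING, 1 AUTOINC_NEW_STYLE_LOCKING and 2
+AUTOINC_NO_LOCKING. For example, in my.cnf:
+
+	[mysqld]
+	innodb_autoinc_lock_mode = 1	# new style ("consecutive") locking
+*/
+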
+/********************************************************************//**
+Reset the autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+ ulonglong autoinc) /*!< in: value to store */
+{
+ ulint error;
+
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_initialize(prebuilt->table, autoinc);
+
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+
+ return(ulong(error));
+}
+
+/********************************************************************//**
+Store the autoinc value in the table. The autoinc value is only set if
+it's greater than the existing autoinc value in the table.
+@return DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+ ulonglong auto_inc) /*!< in: value to store */
+{
+ ulint error;
+
+ error = innobase_lock_autoinc();
+
+ if (error == DB_SUCCESS) {
+
+ dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
+
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+
+ return(ulong(error));
+}
+
+/********************************************************************//**
+Stores a row in an InnoDB database, to the table specified in this
+handle.
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::write_row(
+/*===================*/
+ uchar* record) /*!< in: a row in MySQL format */
+{
+ ulint error = 0;
+ int error_result= 0;
+ ibool auto_inc_used= FALSE;
+ ulint sql_command;
+ trx_t* trx = thd_to_trx(user_thd);
+
+ DBUG_ENTER("ha_innobase::write_row");
+
+ if (prebuilt->trx != trx) {
+ sql_print_error("The transaction object for the table handle is at "
+ "%p, but for the current thread it is at %p",
+ (const void*) prebuilt->trx, (const void*) trx);
+
+ fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
+ ut_print_buf(stderr, ((const byte*)prebuilt) - 100, 200);
+ fputs("\n"
+ "InnoDB: Dump of 200 bytes around ha_data: ",
+ stderr);
+ ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ ha_statistic_increment(&SSV::ha_write_count);
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+ table->timestamp_field->set_time();
+
+ sql_command = thd_sql_command(user_thd);
+
+ if ((sql_command == SQLCOM_ALTER_TABLE
+ || sql_command == SQLCOM_OPTIMIZE
+ || sql_command == SQLCOM_CREATE_INDEX
+ || sql_command == SQLCOM_DROP_INDEX)
+ && num_write_row >= 10000) {
+ /* ALTER TABLE is COMMITted at every 10000 copied rows.
+ The IX table lock for the original table has to be re-issued.
+ As this method will be called on a temporary table to which the
+ contents of the original table are being copied, it is
+ a bit tricky to determine the source table. The cursor
+ position in the source table need not be adjusted after the
+ intermediate COMMIT, since writes by other transactions are
+ being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
+
+ dict_table_t* src_table;
+ enum lock_mode mode;
+
+ num_write_row = 0;
+
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
+
+ /* Altering an InnoDB table */
+ /* Get the source table. */
+ src_table = lock_get_src_table(
+ prebuilt->trx, prebuilt->table, &mode);
+ if (!src_table) {
+no_commit:
+ /* Unknown situation: do not commit */
+ /*
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ALTER TABLE is holding lock"
+ " on %lu tables!\n",
+ prebuilt->trx->mysql_n_tables_locked);
+ */
+ ;
+ } else if (src_table == prebuilt->table) {
+ /* Source table is not in InnoDB format:
+ no need to re-acquire locks on it. */
+
+ /* Altering to InnoDB format */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ prebuilt->trx->active_trans = 1;
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ } else {
+ /* Ensure that there are no other table locks than
+ LOCK_IX and LOCK_AUTO_INC on the destination table. */
+
+ if (!lock_is_table_exclusive(prebuilt->table,
+ prebuilt->trx)) {
+ goto no_commit;
+ }
+
+ /* Commit the transaction. This will release the table
+ locks, so they have to be acquired again. */
+ innobase_commit(ht, user_thd, 1);
+ /* Note that this transaction is still active. */
+ prebuilt->trx->active_trans = 1;
+ /* Re-acquire the table lock on the source table. */
+ row_lock_table_for_mysql(prebuilt, src_table, mode);
+ /* We will need an IX lock on the destination table. */
+ prebuilt->sql_stat_start = TRUE;
+ }
+ }
+
+ num_write_row++;
+
+ /* This is the case where the table has an auto-increment column */
+ if (table->next_number_field && record == table->record[0]) {
+
+ /* Reset the error code before calling
+ innobase_get_auto_increment(). */
+ prebuilt->autoinc_error = DB_SUCCESS;
+
+ if ((error = update_auto_increment())) {
+ /* We don't want to mask autoinc overflow errors. */
+
+ /* Handle the case where the AUTOINC sub-system
+ failed during initialization. */
+ if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+ error_result = ER_AUTOINC_READ_FAILED;
+ /* Set the error message to report too. */
+ my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+ goto func_exit;
+ } else if (prebuilt->autoinc_error != DB_SUCCESS) {
+ error = (int) prebuilt->autoinc_error;
+ goto report_error;
+ }
+
+ /* MySQL errors are passed straight back. */
+ error_result = (int) error;
+ goto func_exit;
+ }
+
+ auto_inc_used = TRUE;
+ }
+
+ if (prebuilt->mysql_template == NULL
+ || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
+
+ /* Build the template used in converting quickly between
+ the two database formats */
+
+ build_template(prebuilt, NULL, table, this, ROW_MYSQL_WHOLE_ROW);
+ }
+
+ innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+ error = row_insert_for_mysql((byte*) record, prebuilt);
+
+#ifdef EXTENDED_FOR_USERSTAT
+ if (error == DB_SUCCESS) rows_changed++;
+#endif
+
+ /* Handle duplicate key errors */
+ if (auto_inc_used) {
+ ulint err;
+ ulonglong auto_inc;
+ ulonglong col_max_value;
+
+ /* Note the number of rows processed for this statement, used
+ by get_auto_increment() to determine the number of AUTO-INC
+ values to reserve. This is only useful for a multi-value INSERT
+ and is a statement-level counter. */
+ if (trx->n_autoinc_rows > 0) {
+ --trx->n_autoinc_rows;
+ }
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
+ /* Get the value that MySQL attempted to store in the table.*/
+ auto_inc = table->next_number_field->val_int();
+
+ switch (error) {
+ case DB_DUPLICATE_KEY:
+
+ /* A REPLACE command and LOAD DATA INFILE REPLACE
+ handle a duplicate key error themselves, but we
+ must update the autoinc counter if we are performing
+ those statements. */
+
+ switch (sql_command) {
+ case SQLCOM_LOAD:
+ if ((trx->duplicates
+ & (TRX_DUP_IGNORE | TRX_DUP_REPLACE))) {
+
+ goto set_max_autoinc;
+ }
+ break;
+
+ case SQLCOM_REPLACE:
+ case SQLCOM_INSERT_SELECT:
+ case SQLCOM_REPLACE_SELECT:
+ goto set_max_autoinc;
+
+ default:
+ break;
+ }
+
+ break;
+
+ case DB_SUCCESS:
+ /* If the actual value inserted is greater than
+ the upper limit of the interval, then we try and
+ update the table upper limit. Note: last_value
+ will be 0 if get_auto_increment() was not called.*/
+
+ if (auto_inc >= prebuilt->autoinc_last_value) {
+set_max_autoinc:
+ /* This should filter out the negative
+ values set explicitly by the user. */
+ if (auto_inc <= col_max_value) {
+ ut_a(prebuilt->autoinc_increment > 0);
+
+ ulonglong need;
+ ulonglong offset;
+
+ offset = prebuilt->autoinc_offset;
+ need = prebuilt->autoinc_increment;
+
+ auto_inc = innobase_next_autoinc(
+ auto_inc,
+ need, offset, col_max_value);
+
+ err = innobase_set_max_autoinc(
+ auto_inc);
+
+ if (err != DB_SUCCESS) {
+ error = err;
+ }
+ }
+ }
+ break;
+ }
+ }
+
+ innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+report_error:
+ error_result = convert_error_code_to_mysql((int) error,
+ prebuilt->table->flags,
+ user_thd);
+
+func_exit:
+ innobase_active_small();
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ DBUG_RETURN(error_result);
+}
+
+/**********************************************************************//**
+Checks which fields have changed in a row and stores information
+of them to an update vector.
+@return error number or 0 */
+static
+int
+calc_row_difference(
+/*================*/
+ upd_t* uvect, /*!< in/out: update vector */
+ uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row, /*!< in: new row in MySQL format */
+ struct st_table* table, /*!< in: table in MySQL data
+ dictionary */
+ uchar* upd_buff, /*!< in: buffer to use */
+ ulint buff_len, /*!< in: buffer length */
+ row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */
+ THD* thd) /*!< in: user thread */
+{
+ uchar* original_upd_buff = upd_buff;
+ Field* field;
+ enum_field_types field_mysql_type;
+ uint n_fields;
+ ulint o_len;
+ ulint n_len;
+ ulint col_pack_len;
+ const byte* new_mysql_row_col;
+ const byte* o_ptr;
+ const byte* n_ptr;
+ byte* buf;
+ upd_field_t* ufield;
+ ulint col_type;
+ ulint n_changed = 0;
+ dfield_t dfield;
+ dict_index_t* clust_index;
+ uint sql_idx, innodb_idx= 0;
+
+ n_fields = table->s->fields;
+ clust_index = dict_table_get_first_index(prebuilt->table);
+
+ /* We use upd_buff to convert changed fields */
+ buf = (byte*) upd_buff;
+
+ for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
+ field = table->field[sql_idx];
+ if (!field->stored_in_db)
+ continue;
+
+ o_ptr = (const byte*) old_row + get_field_offset(table, field);
+ n_ptr = (const byte*) new_row + get_field_offset(table, field);
+
+ /* Use new_mysql_row_col and col_pack_len to save the values */
+
+ new_mysql_row_col = n_ptr;
+ col_pack_len = field->pack_length();
+
+ o_len = col_pack_len;
+ n_len = col_pack_len;
+
+ /* We use o_ptr and n_ptr to dig up the actual data for
+ comparison. */
+
+ field_mysql_type = field->type();
+
+ col_type = prebuilt->table->cols[innodb_idx].mtype;
+
+ switch (col_type) {
+
+ case DATA_BLOB:
+ o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
+ n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_VARMYSQL:
+ if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR where
+ the real payload data length is stored in
+ 1 or 2 bytes */
+
+ o_ptr = row_mysql_read_true_varchar(
+ &o_len, o_ptr,
+ (ulint)
+ (((Field_varstring*)field)->length_bytes));
+
+ n_ptr = row_mysql_read_true_varchar(
+ &n_len, n_ptr,
+ (ulint)
+ (((Field_varstring*)field)->length_bytes));
+ }
+
+ break;
+ default:
+ ;
+ }
+
+ if (field->null_ptr) {
+ if (field_in_record_is_null(table, field,
+ (char*) old_row)) {
+ o_len = UNIV_SQL_NULL;
+ }
+
+ if (field_in_record_is_null(table, field,
+ (char*) new_row)) {
+ n_len = UNIV_SQL_NULL;
+ }
+ }
+
+ if (o_len != n_len || (o_len != UNIV_SQL_NULL &&
+ 0 != memcmp(o_ptr, n_ptr, o_len))) {
+ /* The field has changed */
+
+ ufield = uvect->fields + n_changed;
+
+ /* Let us use a dummy dfield to make the conversion
+ from the MySQL column format to the InnoDB format */
+
+ dict_col_copy_type(prebuilt->table->cols + innodb_idx,
+ dfield_get_type(&dfield));
+
+ if (n_len != UNIV_SQL_NULL) {
+ buf = row_mysql_store_col_in_innobase_format(
+ &dfield,
+ (byte*)buf,
+ TRUE,
+ new_mysql_row_col,
+ col_pack_len,
+ dict_table_is_comp(prebuilt->table));
+ dfield_copy_data(&ufield->new_val, &dfield);
+ } else {
+ dfield_set_null(&ufield->new_val);
+ }
+
+ ufield->exp = NULL;
+ ufield->orig_len = 0;
+ ufield->field_no = dict_col_get_clust_pos(
+ &prebuilt->table->cols[innodb_idx], clust_index);
+ n_changed++;
+ }
+ if (field->stored_in_db)
+ innodb_idx++;
+ }
+
+ uvect->n_fields = n_changed;
+ uvect->info_bits = 0;
+
+ ut_a(buf <= (byte*)original_upd_buff + buff_len);
+
+ return(0);
+}
+
+/**********************************************************************//**
+Updates a row given as a parameter to a new value. Note that we are given
+whole rows, not just the fields which are updated: this incurs some
+overhead for CPU when we check which fields are actually updated.
+TODO: currently InnoDB does not prevent the 'Halloween problem':
+in a searched update a single row can get updated several times
+if its index columns are updated!
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::update_row(
+/*====================*/
+ const uchar* old_row, /*!< in: old row in MySQL format */
+ uchar* new_row) /*!< in: new row in MySQL format */
+{
+ upd_t* uvect;
+ int error = 0;
+ trx_t* trx = thd_to_trx(user_thd);
+
+ DBUG_ENTER("ha_innobase::update_row");
+
+ ut_a(prebuilt->trx == trx);
+
+ ha_statistic_increment(&SSV::ha_update_count);
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+ table->timestamp_field->set_time();
+
+ if (prebuilt->upd_node) {
+ uvect = prebuilt->upd_node->update;
+ } else {
+ uvect = row_get_prebuilt_update_vector(prebuilt);
+ }
+
+ /* Build an update vector from the modified fields in the rows
+ (uses upd_buff of the handle) */
+
+ calc_row_difference(uvect, (uchar*) old_row, new_row, table,
+ upd_buff, (ulint)upd_and_key_val_buff_len,
+ prebuilt, user_thd);
+
+ /* This is not a delete */
+ prebuilt->upd_node->is_delete = FALSE;
+
+ ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+
+ innodb_srv_conc_enter_innodb(trx);
+
+ error = row_update_for_mysql((byte*) old_row, prebuilt);
+
+ /* We need to do some special AUTOINC handling for the following case:
+
+ INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ...
+
+ We need to use the AUTOINC counter that was actually used by
+ MySQL in the UPDATE statement, which can be different from the
+ value used in the INSERT statement.*/
+
+ if (error == DB_SUCCESS
+ && table->next_number_field
+ && new_row == table->record[0]
+ && thd_sql_command(user_thd) == SQLCOM_INSERT
+ && (trx->duplicates & (TRX_DUP_IGNORE | TRX_DUP_REPLACE))
+ == TRX_DUP_IGNORE) {
+
+ ulonglong auto_inc;
+ ulonglong col_max_value;
+
+ auto_inc = table->next_number_field->val_int();
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
+ if (auto_inc <= col_max_value && auto_inc != 0) {
+
+ ulonglong need;
+ ulonglong offset;
+
+ offset = prebuilt->autoinc_offset;
+ need = prebuilt->autoinc_increment;
+
+ auto_inc = innobase_next_autoinc(
+ auto_inc, need, offset, col_max_value);
+
+ error = innobase_set_max_autoinc(auto_inc);
+ }
+ }
+
+#ifdef EXTENDED_FOR_USERSTAT
+ if (error == DB_SUCCESS) rows_changed++;
+#endif
+
+ innodb_srv_conc_exit_innodb(trx);
+
+ error = convert_error_code_to_mysql(error,
+ prebuilt->table->flags, user_thd);
+
+ if (error == 0 /* success */
+ && uvect->n_fields == 0 /* no columns were updated */) {
+
+ /* This is the same as success, but instructs
+ MySQL that the row is not really updated and it
+ should not increase the count of updated rows.
+ This is a fix for http://bugs.mysql.com/29157 */
+ error = HA_ERR_RECORD_IS_THE_SAME;
+ }
+
+ /* Tell InnoDB server that there might be work for
+ utility threads: */
+
+ innobase_active_small();
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Deletes a row given as the parameter.
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::delete_row(
+/*====================*/
+ const uchar* record) /*!< in: a row in MySQL format */
+{
+ int error = 0;
+ trx_t* trx = thd_to_trx(user_thd);
+
+ DBUG_ENTER("ha_innobase::delete_row");
+
+ ut_a(prebuilt->trx == trx);
+
+ ha_statistic_increment(&SSV::ha_delete_count);
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ if (!prebuilt->upd_node) {
+ row_get_prebuilt_update_vector(prebuilt);
+ }
+
+ /* This is a delete */
+
+ prebuilt->upd_node->is_delete = TRUE;
+
+ innodb_srv_conc_enter_innodb(trx);
+
+ error = row_update_for_mysql((byte*) record, prebuilt);
+
+#ifdef EXTENDED_FOR_USERSTAT
+ if (error == DB_SUCCESS) rows_changed++;
+#endif
+
+ innodb_srv_conc_exit_innodb(trx);
+
+ error = convert_error_code_to_mysql(
+ error, prebuilt->table->flags, user_thd);
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ innobase_active_small();
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Removes a new lock set on a row, if it was not read optimistically. This can
+be called after a row has been read in the processing of an UPDATE or a DELETE
+query, if the option innodb_locks_unsafe_for_binlog is set. */
+UNIV_INTERN
+void
+ha_innobase::unlock_row(void)
+/*=========================*/
+{
+ DBUG_ENTER("ha_innobase::unlock_row");
+
+ /* Consistent read does not take any locks, thus there is
+ nothing to unlock. */
+
+ if (prebuilt->select_lock_type == LOCK_NONE) {
+ DBUG_VOID_RETURN;
+ }
+
+ switch (prebuilt->row_read_type) {
+ case ROW_READ_WITH_LOCKS:
+ if (!srv_locks_unsafe_for_binlog
+ && prebuilt->trx->isolation_level
+ > TRX_ISO_READ_COMMITTED) {
+ break;
+ }
+ /* fall through */
+ case ROW_READ_TRY_SEMI_CONSISTENT:
+ row_unlock_for_mysql(prebuilt, FALSE);
+ break;
+ case ROW_READ_DID_SEMI_CONSISTENT:
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ break;
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/* See handler.h and row0mysql.h for docs on this function. */
+UNIV_INTERN
+bool
+ha_innobase::was_semi_consistent_read(void)
+/*=======================================*/
+{
+ return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT);
+}
+
+/* See handler.h and row0mysql.h for docs on this function. */
+UNIV_INTERN
+void
+ha_innobase::try_semi_consistent_read(bool yes)
+/*===========================================*/
+{
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ /* Row read type is set to semi-consistent read if this was
+ requested by MySQL and either the innodb_locks_unsafe_for_binlog
+ option is used or this session is using the READ COMMITTED
+ isolation level. */
+
+ if (yes
+ && (srv_locks_unsafe_for_binlog
+ || prebuilt->trx->isolation_level <= TRX_ISO_READ_COMMITTED)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_WITH_LOCKS;
+ }
+}
+
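+/* An illustrative example of the semi-consistent read machinery above:
+with innodb_locks_unsafe_for_binlog set, or under READ COMMITTED, an
+UPDATE t SET ... WHERE non_indexed_col = 5 may first read the latest
+committed version of each row and lock only the rows that actually match
+the WHERE condition; unlock_row() above is what releases the lock on a
+row that turned out not to match. */
+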
+/******************************************************************//**
+Initializes a handle to use an index.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::index_init(
+/*====================*/
+ uint keynr, /*!< in: key (index) number */
+ bool sorted) /*!< in: 1 if result MUST be sorted according to index */
+{
+ DBUG_ENTER("index_init");
+
+ DBUG_RETURN(change_active_index(keynr));
+}
+
+/******************************************************************//**
+Currently does nothing.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::index_end(void)
+/*========================*/
+{
+ int error = 0;
+ DBUG_ENTER("index_end");
+ active_index=MAX_KEY;
+ in_range_check_pushed_down= FALSE;
+ ds_mrr.dsmrr_close();
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB. */
+static inline
+ulint
+convert_search_mode_to_innobase(
+/*============================*/
+ enum ha_rkey_function find_flag)
+{
+ switch (find_flag) {
+ case HA_READ_KEY_EXACT:
+ /* this does not require the index to be UNIQUE */
+ return(PAGE_CUR_GE);
+ case HA_READ_KEY_OR_NEXT:
+ return(PAGE_CUR_GE);
+ case HA_READ_KEY_OR_PREV:
+ return(PAGE_CUR_LE);
+ case HA_READ_AFTER_KEY:
+ return(PAGE_CUR_G);
+ case HA_READ_BEFORE_KEY:
+ return(PAGE_CUR_L);
+ case HA_READ_PREFIX:
+ return(PAGE_CUR_GE);
+ case HA_READ_PREFIX_LAST:
+ return(PAGE_CUR_LE);
+ case HA_READ_PREFIX_LAST_OR_PREV:
+ return(PAGE_CUR_LE);
+ /* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always
+ pass a complete-field prefix of a key value as the search
+ tuple. I.e., it is not allowed that the last field would
+ just contain the first n bytes of the full field value.
+ MySQL uses a 'padding' trick to convert LIKE 'abc%'
+ type queries so that it can use as a search tuple
+ a complete-field-prefix of a key value. Thus, the InnoDB
+ search mode PAGE_CUR_LE_OR_EXTENDS is never used.
+ TODO: when/if MySQL starts to use also partial-field
+ prefixes, we have to deal with stripping of spaces
+ and comparison of non-latin1 char type fields in
+ innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to
+ work correctly. */
+ case HA_READ_MBR_CONTAIN:
+ case HA_READ_MBR_INTERSECT:
+ case HA_READ_MBR_WITHIN:
+ case HA_READ_MBR_DISJOINT:
+ case HA_READ_MBR_EQUAL:
+ return(PAGE_CUR_UNSUPP);
+ /* do not use "default:" in order to produce a gcc warning:
+ enumeration value '...' not handled in switch
+ (if -Wswitch or -Wall is used) */
+ }
+
+ my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
+
+ return(PAGE_CUR_UNSUPP);
+}
+
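+/* For instance (illustrative only): a range scan for WHERE c >= 5 starts
+with HA_READ_KEY_OR_NEXT, which maps to PAGE_CUR_GE above, whereas
+WHERE c > 5 starts with HA_READ_AFTER_KEY, which maps to PAGE_CUR_G. */
+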
+/*
+ BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED
+ ---------------------------------------------------
+The following does not cover all the details, but explains how we determine
+the start of a new SQL statement, and what is associated with it.
+
+For each table in the database the MySQL interpreter may have several
+table handle instances in use, also in a single SQL query. For each table
+handle instance there is an InnoDB 'prebuilt' struct which contains most
+of the InnoDB data associated with this table handle instance.
+
+ A) if the user has not explicitly set any MySQL table level locks:
+
+ 1) MySQL calls ::external_lock to set an 'intention' table level lock on
+the table of the handle instance. There we set
+prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
+true if we are taking this table handle instance to use in a new SQL
+statement issued by the user. We also increment trx->n_mysql_tables_in_use.
+
+ 2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
+instructions to prebuilt->template of the table handle instance in
+::index_read. The template is used to save CPU time in large joins.
+
+ 3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we
+allocate a new consistent read view for the trx if it does not yet have one,
+or in the case of a locking read, set an InnoDB 'intention' table level
+lock on the table.
+
+ 4) We do the SELECT. MySQL may repeatedly call ::index_read for the
+same table handle instance, if it is a join.
+
+ 5) When the SELECT ends, MySQL removes its intention table level locks
+in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
+ (a) we execute a COMMIT there if the autocommit is on,
+ (b) we also release possible 'SQL statement level resources' InnoDB may
+have for this SQL statement. The MySQL interpreter does NOT execute
+autocommit for pure read transactions, though it should. That is why the
+table handler in that case has to execute the COMMIT in ::external_lock.
+
+ B) If the user has explicitly set MySQL table level locks, then MySQL
+does NOT call ::external_lock at the start of the statement. To determine
+when we are at the start of a new SQL statement we at the start of
+::index_read also compare the query id to the latest query id where the
+table handle instance was used. If it has changed, we know we are at the
+start of a new SQL statement. Since the query id can theoretically
+wrap around, we use this test only as a secondary way of determining the
+start of a new SQL statement. */
+
+
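+/* As a concrete sketch of case A) above, an autocommit
+SELECT * FROM t WHERE a = 5 roughly results in the call sequence
+
+	::external_lock(F_RDLCK)	sets prebuilt->sql_stat_start = TRUE
+	::index_read()			builds the template and calls
+					row_search_for_mysql()
+	::index_next_same() ...		fetches further matching rows
+	::external_lock(F_UNLCK)	releases the intention lock and,
+					since autocommit is on, commits
+
+(the exact sequence depends on the optimizer and the access method). */
+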
+/**********************************************************************//**
+Positions an index cursor to the index specified in the handle. Fetches the
+row if any.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_read(
+/*====================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ const uchar* key_ptr, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index; this can
+ also contain an InnoDB row id, in
+ which case key_len is the InnoDB
+ row id length; the key value can
+ also be a prefix of a full key value,
+ and the last column can be a prefix
+ of a full column */
+ uint key_len,/*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+ ulint mode;
+ dict_index_t* index;
+ ulint match_mode = 0;
+ int error;
+ ulint ret;
+
+ DBUG_ENTER("index_read");
+
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ ha_statistic_increment(&SSV::ha_read_key_count);
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ index = prebuilt->index;
+
+ if (UNIV_UNLIKELY(index == NULL)) {
+ prebuilt->index_usable = FALSE;
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ DBUG_RETURN(HA_ERR_TABLE_DEF_CHANGED);
+ }
+
+ /* Note that the index for which the search template is built is not
+ necessarily prebuilt->index; it can also be the clustered index */
+
+ if (prebuilt->sql_stat_start) {
+ build_template(prebuilt, user_thd, table, this,
+ ROW_MYSQL_REC_FIELDS);
+ }
+
+ if (key_ptr) {
+ /* Convert the search key value to InnoDB format into
+ prebuilt->search_tuple */
+
+ row_sel_convert_mysql_key_to_innobase(
+ prebuilt->search_tuple,
+ (byte*) key_val_buff,
+ (ulint)upd_and_key_val_buff_len,
+ index,
+ (byte*) key_ptr,
+ (ulint) key_len,
+ prebuilt->trx);
+ } else {
+ /* We position the cursor to the last or the first entry
+ in the index */
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+ }
+
+ mode = convert_search_mode_to_innobase(find_flag);
+
+ match_mode = 0;
+
+ if (find_flag == HA_READ_KEY_EXACT) {
+
+ match_mode = ROW_SEL_EXACT;
+
+ } else if (find_flag == HA_READ_PREFIX
+ || find_flag == HA_READ_PREFIX_LAST) {
+
+ match_mode = ROW_SEL_EXACT_PREFIX;
+ }
+
+ last_match_mode = (uint) match_mode;
+
+ if (mode != PAGE_CUR_UNSUPP) {
+
+ innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+ ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
+ match_mode, 0);
+
+ innodb_srv_conc_exit_innodb(prebuilt->trx);
+ } else {
+
+ ret = DB_UNSUPPORTED;
+ }
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_KEY_NOT_FOUND;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ default:
+ error = convert_error_code_to_mysql((int) ret,
+ prebuilt->table->flags,
+ user_thd);
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/*******************************************************************//**
+The following function works like index_read, but it finds the last
+row with the current key value or prefix.
+@return 0, HA_ERR_KEY_NOT_FOUND, or an error code */
+UNIV_INTERN
+int
+ha_innobase::index_read_last(
+/*=========================*/
+ uchar* buf, /*!< out: fetched row */
+ const uchar* key_ptr,/*!< in: key value, or a prefix of a full
+ key value */
+ uint key_len)/*!< in: length of the key val or prefix
+ in bytes */
+{
+ return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
+
+/********************************************************************//**
+Get the index for a handle. Does not change active index.
+@return NULL or index instance. */
+UNIV_INTERN
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always
+ clustered index, even if it was internally
+ generated by InnoDB */
+{
+ KEY* key = 0;
+ dict_index_t* index = 0;
+
+ DBUG_ENTER("innobase_get_index");
+ ha_statistic_increment(&SSV::ha_read_key_count);
+
+ if (keynr != MAX_KEY && table->s->keys > 0) {
+ key = table->key_info + keynr;
+
+ index = innobase_index_lookup(share, keynr);
+
+ if (index) {
+ ut_a(ut_strcmp(index->name, key->name) == 0);
+ } else {
+ /* Can't find index with keynr in the translation
+ table. Only print message if the index translation
+ table exists */
+ if (share->idx_trans_tbl.index_mapping) {
+ sql_print_error("InnoDB could not find "
+ "index %s key no %u for "
+ "table %s through its "
+ "index translation table",
+ key ? key->name : "NULL",
+ keynr,
+ prebuilt->table->name);
+ }
+
+ index = dict_table_get_index_on_name(prebuilt->table,
+ key->name);
+ }
+ } else {
+ index = dict_table_get_first_index(prebuilt->table);
+ }
+
+ if (!index) {
+ sql_print_error(
+ "Innodb could not find key n:o %u with name %s "
+ "from dict cache for table %s",
+ keynr, key ? key->name : "NULL",
+ prebuilt->table->name);
+ }
+
+ DBUG_RETURN(index);
+}
+
+/********************************************************************//**
+Changes the active index of a handle.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::change_active_index(
+/*=============================*/
+ uint keynr) /*!< in: use this index; MAX_KEY means always clustered
+ index, even if it was internally generated by
+ InnoDB */
+{
+ DBUG_ENTER("change_active_index");
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ ut_ad(user_thd == ha_thd());
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ active_index = keynr;
+
+ prebuilt->index = innobase_get_index(keynr);
+
+ if (UNIV_UNLIKELY(!prebuilt->index)) {
+ sql_print_warning("InnoDB: change_active_index(%u) failed",
+ keynr);
+ prebuilt->index_usable = FALSE;
+ DBUG_RETURN(1);
+ }
+
+ prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx,
+ prebuilt->index);
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ push_warning_printf(user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: insufficient history for index %u",
+ keynr);
+ /* The caller seems to ignore this. Thus, we must check
+ this again in row_search_for_mysql(). */
+ DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
+ 0, NULL));
+ }
+
+ ut_a(prebuilt->search_tuple != 0);
+
+ dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields);
+
+ dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
+ prebuilt->index->n_fields);
+
+ /* MySQL changes the active index for a handle also during some
+ queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
+ and then calculates the sum. Previously we played safe and used
+ the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
+ copying. Starting from MySQL-4.1 we use a more efficient flag here. */
+
+ build_template(prebuilt, user_thd, table, this, ROW_MYSQL_REC_FIELDS);
+
+ DBUG_RETURN(0);
+}
+
+/**********************************************************************//**
+Positions an index cursor to the index specified in keynr. Fetches the
+row if any.
+??? This is only used to read whole keys ???
+@return error number or 0 */
+UNIV_INTERN
+int
+ha_innobase::index_read_idx(
+/*========================*/
+ uchar* buf, /*!< in/out: buffer for the returned
+ row */
+ uint keynr, /*!< in: use this index */
+ const uchar* key, /*!< in: key value; if this is NULL
+ we position the cursor at the
+ start or end of index */
+ uint key_len, /*!< in: key value length */
+ enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+ if (change_active_index(keynr)) {
+
+ return(1);
+ }
+
+ return(index_read(buf, key, key_len, find_flag));
+}
+
+/***********************************************************************//**
+Reads the next or previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::general_fetch(
+/*=======================*/
+ uchar* buf, /*!< in/out: buffer for next row in MySQL
+ format */
+ uint direction, /*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */
+ uint match_mode) /*!< in: 0, ROW_SEL_EXACT, or
+ ROW_SEL_EXACT_PREFIX */
+{
+ ulint ret;
+ int error = 0;
+
+ DBUG_ENTER("general_fetch");
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+ innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+ ret = row_search_for_mysql(
+ (byte*)buf, 0, prebuilt, match_mode, direction);
+
+ innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ error = 0;
+ table->status = 0;
+#ifdef EXTENDED_FOR_USERSTAT
+ rows_read++;
+ if (active_index >= 0 && active_index < MAX_KEY)
+ index_rows_read[active_index]++;
+#endif
+ break;
+ case DB_RECORD_NOT_FOUND:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ case DB_END_OF_INDEX:
+ error = HA_ERR_END_OF_FILE;
+ table->status = STATUS_NOT_FOUND;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ (int) ret, prebuilt->table->flags, user_thd);
+ table->status = STATUS_NOT_FOUND;
+ break;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/***********************************************************************//**
+Reads the next row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for next row in MySQL
+ format */
+{
+ ha_statistic_increment(&SSV::ha_read_next_count);
+
+ return(general_fetch(buf, ROW_SEL_NEXT, 0));
+}
+
+/*******************************************************************//**
+Reads the next row matching to the key value given as the parameter.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next_same(
+/*=========================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ const uchar* key, /*!< in: key value */
+ uint keylen) /*!< in: key value length */
+{
+ ha_statistic_increment(&SSV::ha_read_next_count);
+
+ return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode));
+}
+
+/***********************************************************************//**
+Reads the previous row from a cursor, which must have previously been
+positioned using index_read.
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_prev(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for previous row in MySQL format */
+{
+ ha_statistic_increment(&SSV::ha_read_prev_count);
+
+ return(general_fetch(buf, ROW_SEL_PREV, 0));
+}
+
+/********************************************************************//**
+Positions a cursor on the first record in an index and reads the
+corresponding row to buf.
+@return 0, HA_ERR_END_OF_FILE, or error code */
+UNIV_INTERN
+int
+ha_innobase::index_first(
+/*=====================*/
+ uchar* buf) /*!< in/out: buffer for the row */
+{
+ int error;
+
+ DBUG_ENTER("index_first");
+ ha_statistic_increment(&SSV::ha_read_first_count);
+
+ error = index_read(buf, NULL, 0, HA_READ_AFTER_KEY);
+
+ /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/********************************************************************//**
+Positions a cursor on the last record in an index and reads the
+corresponding row to buf.
+@return 0, HA_ERR_END_OF_FILE, or error code */
+UNIV_INTERN
+int
+ha_innobase::index_last(
+/*====================*/
+ uchar* buf) /*!< in/out: buffer for the row */
+{
+ int error;
+
+ DBUG_ENTER("index_last");
+ ha_statistic_increment(&SSV::ha_read_last_count);
+
+ error = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY);
+
+ /* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND */
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ DBUG_RETURN(error);
+}
+
+/****************************************************************//**
+Initialize a table scan.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_init(
+/*==================*/
+ bool scan) /*!< in: TRUE if table/index scan FALSE otherwise */
+{
+ int err;
+
+ /* Store the active index value so that we can restore the original
+ value after a scan */
+
+ if (prebuilt->clust_index_was_generated) {
+ err = change_active_index(MAX_KEY);
+ } else {
+ err = change_active_index(primary_key);
+ }
+
+ /* Don't use semi-consistent read in random row reads (by position).
+ This means we must disable semi_consistent_read if scan is false */
+
+ if (!scan) {
+ try_semi_consistent_read(0);
+ }
+
+ start_of_scan = 1;
+
+ return(err);
+}
+
+/*****************************************************************//**
+Ends a table scan.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_end(void)
+/*======================*/
+{
+ return(index_end());
+}
+
+/*****************************************************************//**
+Reads the next row in a table scan (also used to read the FIRST row
+in a table scan).
+@return 0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_next(
+/*==================*/
+ uchar* buf) /*!< in/out: returns the row in this buffer,
+ in MySQL format */
+{
+ int error;
+
+ DBUG_ENTER("rnd_next");
+ ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+
+ if (start_of_scan) {
+ error = index_first(buf);
+
+ if (error == HA_ERR_KEY_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+ }
+
+ start_of_scan = 0;
+ } else {
+ error = general_fetch(buf, ROW_SEL_NEXT, 0);
+ }
+
+ DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Fetches a row from the table based on a row reference.
+@return 0, HA_ERR_KEY_NOT_FOUND, or error code */
+UNIV_INTERN
+int
+ha_innobase::rnd_pos(
+/*=================*/
+ uchar* buf, /*!< in/out: buffer for the row */
+ uchar* pos) /*!< in: primary key value of the row in the
+ MySQL format, or the row id if the clustered
+ index was internally generated by InnoDB; the
+ length of data in pos has to be ref_length */
+{
+ int error;
+ uint keynr = active_index;
+ DBUG_ENTER("rnd_pos");
+ DBUG_DUMP("key", pos, ref_length);
+
+ ha_statistic_increment(&SSV::ha_read_rnd_count);
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ if (prebuilt->clust_index_was_generated) {
+ /* No primary key was defined for the table and we
+ generated the clustered index from the row id: the
+ row reference is the row id, not any key value
+ that MySQL knows of */
+
+ error = change_active_index(MAX_KEY);
+ } else {
+ error = change_active_index(primary_key);
+ }
+
+ if (error) {
+ DBUG_PRINT("error", ("Got error: %d", error));
+ DBUG_RETURN(error);
+ }
+
+ /* Note that we assume the length of the row reference is fixed
+ for the table, and it is == ref_length */
+
+ error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
+
+ if (error) {
+ DBUG_PRINT("error", ("Got error: %d", error));
+ }
+
+ change_active_index(keynr);
+
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Stores a reference to the current row to 'ref' field of the handle. Note
+that in the case where we have generated the clustered index for the
+table, the function parameter is illogical: we MUST ASSUME that 'record'
+is the current 'position' of the handle, because if row ref is actually
+the row id internally generated in InnoDB, then 'record' does not contain
+it. We just guess that the row id must be for the record where the handle
+was positioned the last time. */
+UNIV_INTERN
+void
+ha_innobase::position(
+/*==================*/
+ const uchar* record) /*!< in: row in MySQL format */
+{
+ uint len;
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ if (prebuilt->clust_index_was_generated) {
+ /* No primary key was defined for the table and we
+ generated the clustered index from row id: the
+ row reference will be the row id, not any key value
+ that MySQL knows of */
+
+ len = DATA_ROW_ID_LEN;
+
+ memcpy(ref, prebuilt->row_id, len);
+ } else {
+ len = store_key_val_for_row(primary_key, (char*)ref,
+ ref_length, record);
+ }
+
+ /* We assume that the 'ref' value len is always fixed for the same
+ table. */
+
+ if (len != ref_length) {
+ sql_print_error("Stored ref len is %lu, but table ref len is %lu",
+ (ulong) len, (ulong) ref_length);
+ }
+}
+
+/* Limit InnoDB monitor access to users with the PROCESS privilege.
+See http://bugs.mysql.com/32710 for an explanation of why PROCESS was chosen. */
+#define IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name, thd) \
+ (row_is_magic_monitor_table(table_name) \
+ && check_global_access(thd, PROCESS_ACL))
+
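+/* Example of such a magic table (illustrative): issuing
+
+	CREATE TABLE innodb_monitor (a INT) ENGINE=InnoDB;
+
+turns on the periodic InnoDB monitor output, so with this check only
+users holding the PROCESS privilege can create or drop these monitor
+tables. */
+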
+/*****************************************************************//**
+Creates a table definition to an InnoDB database. */
+static
+int
+create_table_def(
+/*=============*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ const char* table_name, /*!< in: table name */
+ const char* path_of_temp_table,/*!< in: if this is a table explicitly
+ created by the user with the
+ TEMPORARY keyword, then this
+ parameter is the dir path where the
+ table should be placed if we create
+ an .ibd file for it (no .ibd extension
+ in the path, though); otherwise this
+ is NULL */
+ ulint flags) /*!< in: table flags */
+{
+ Field* field;
+ dict_table_t* table;
+ ulint n_cols;
+ int error;
+ ulint col_type;
+ ulint col_len;
+ ulint nulls_allowed;
+ ulint unsigned_type;
+ ulint binary_type;
+ ulint long_true_varchar;
+ ulint charset_no;
+ ulint i;
+
+ DBUG_ENTER("create_table_def");
+ DBUG_PRINT("enter", ("table_name: %s", table_name));
+
+ ut_a(trx->mysql_thd != NULL);
+ if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name,
+ (THD*) trx->mysql_thd)) {
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+
+ n_cols = form->s->fields;
+
+ /* We pass 0 as the space id, and determine at a lower level the space
+ id where to store the table */
+
+ table = dict_mem_table_create(table_name, 0, form->s->stored_fields, flags);
+
+ if (path_of_temp_table) {
+ table->dir_path_of_temp_table =
+ mem_heap_strdup(table->heap, path_of_temp_table);
+ }
+
+ for (i = 0; i < n_cols; i++) {
+ field = form->field[i];
+ if (!field->stored_in_db)
+ continue;
+
+ col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+ field);
+
+ if (!col_type) {
+ push_warning_printf(
+ (THD*) trx->mysql_thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "Error creating table '%s' with "
+ "column '%s'. Please check its "
+ "column type and try to re-create "
+ "the table with an appropriate "
+ "column type.",
+ table->name, (char*) field->field_name);
+ goto err_col;
+ }
+
+ if (field->null_ptr) {
+ nulls_allowed = 0;
+ } else {
+ nulls_allowed = DATA_NOT_NULL;
+ }
+
+ if (field->binary()) {
+ binary_type = DATA_BINARY_TYPE;
+ } else {
+ binary_type = 0;
+ }
+
+ charset_no = 0;
+
+ if (dtype_is_string_type(col_type)) {
+
+ charset_no = (ulint)field->charset()->number;
+
+ if (UNIV_UNLIKELY(charset_no >= 256)) {
+ /* in data0type.h we assume that the
+ number fits in one byte in prtype */
+ push_warning_printf(
+ (THD*) trx->mysql_thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_CANT_CREATE_TABLE,
+ "In InnoDB, charset-collation codes"
+ " must be below 256."
+ " Unsupported code %lu.",
+ (ulong) charset_no);
+ DBUG_RETURN(ER_CANT_CREATE_TABLE);
+ }
+ }
+
+ ut_a(field->type() < 256); /* we assume in dtype_form_prtype()
+ that this fits in one byte */
+ col_len = field->pack_length();
+
+ /* The MySQL pack length contains 1 or 2 bytes length field
+ for a true VARCHAR. Let us subtract that, so that the InnoDB
+ column length in the InnoDB data dictionary is the real
+ maximum byte length of the actual data. */
+
+ long_true_varchar = 0;
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ col_len -= ((Field_varstring*)field)->length_bytes;
+
+ if (((Field_varstring*)field)->length_bytes == 2) {
+ long_true_varchar = DATA_LONG_TRUE_VARCHAR;
+ }
+ }
+
+ /* First check whether the column to be added has a
+ system reserved name. */
+ if (dict_col_name_is_reserved(field->field_name)){
+ my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+ field->field_name);
+err_col:
+ dict_mem_table_free(table);
+ trx_commit_for_mysql(trx);
+
+ error = DB_ERROR;
+ goto error_ret;
+ }
+
+ dict_mem_table_add_col(table, table->heap,
+ (char*) field->field_name,
+ col_type,
+ dtype_form_prtype(
+ (ulint)field->type()
+ | nulls_allowed | unsigned_type
+ | binary_type | long_true_varchar,
+ charset_no),
+ col_len);
+ }
+
+ error = row_create_table_for_mysql(table, trx);
+
+ if (error == DB_DUPLICATE_KEY) {
+ char buf[100];
+ char* buf_end = innobase_convert_identifier(
+ buf, sizeof buf - 1, table_name, strlen(table_name),
+ trx->mysql_thd, TRUE);
+
+ *buf_end = '\0';
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), buf);
+ }
+
+error_ret:
+ error = convert_error_code_to_mysql(error, flags, NULL);
+
+ DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Creates an index in an InnoDB database. */
+static
+int
+create_index(
+/*=========*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ ulint flags, /*!< in: InnoDB table flags */
+ const char* table_name, /*!< in: table name */
+ uint key_num) /*!< in: index number */
+{
+ Field* field;
+ dict_index_t* index;
+ int error;
+ ulint n_fields;
+ KEY* key;
+ KEY_PART_INFO* key_part;
+ ulint ind_type;
+ ulint col_type;
+ ulint prefix_len;
+ ulint is_unsigned;
+ ulint i;
+ ulint j;
+ ulint* field_lengths;
+
+ DBUG_ENTER("create_index");
+
+ key = form->key_info + key_num;
+
+ n_fields = key->key_parts;
+
+ /* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */
+ ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0);
+
+ ind_type = 0;
+
+ if (key_num == form->s->primary_key) {
+ ind_type = ind_type | DICT_CLUSTERED;
+ }
+
+ if (key->flags & HA_NOSAME ) {
+ ind_type = ind_type | DICT_UNIQUE;
+ }
+
+ /* We pass 0 as the space id, and determine at a lower level the space
+ id where to store the table */
+
+ index = dict_mem_index_create(table_name, key->name, 0,
+ ind_type, n_fields);
+
+ field_lengths = (ulint*) my_malloc(sizeof(ulint) * n_fields,
+ MYF(MY_FAE));
+
+ for (i = 0; i < n_fields; i++) {
+ key_part = key->key_part + i;
+
+ /* (The flag HA_PART_KEY_SEG denotes in MySQL a column prefix
+ field in an index: we only store a specified number of the
+ column's first bytes in the index field.) The flag does not
+ seem to be set reliably by MySQL, so we fall back on comparing
+ the length of the key part with the length of the column. */
+
+ field = NULL;
+ for (j = 0; j < form->s->fields; j++) {
+
+ field = form->field[j];
+
+ if (0 == innobase_strcasecmp(
+ field->field_name,
+ key_part->field->field_name)) {
+ /* Found the corresponding column */
+
+ break;
+ }
+ }
+
+ ut_a(j < form->s->fields);
+
+ col_type = get_innobase_type_from_mysql_type(
+ &is_unsigned, key_part->field);
+
+ if (DATA_BLOB == col_type
+ || (key_part->length < field->pack_length()
+ && field->type() != MYSQL_TYPE_VARCHAR)
+ || (field->type() == MYSQL_TYPE_VARCHAR
+ && key_part->length < field->pack_length()
+ - ((Field_varstring*)field)->length_bytes)) {
+
+ prefix_len = key_part->length;
+
+ if (col_type == DATA_INT
+ || col_type == DATA_FLOAT
+ || col_type == DATA_DOUBLE
+ || col_type == DATA_DECIMAL) {
+ sql_print_error(
+ "MySQL is trying to create a column "
+ "prefix index field, on an "
+ "inappropriate data type. Table "
+ "name %s, column name %s.",
+ table_name,
+ key_part->field->field_name);
+
+ prefix_len = 0;
+ }
+ } else {
+ prefix_len = 0;
+ }
+
+ field_lengths[i] = key_part->length;
+
+ dict_mem_index_add_field(index,
+ (char*) key_part->field->field_name, prefix_len);
+ }
+
+ /* Even though we've defined max_supported_key_part_length, we
+ still do our own checking using field_lengths to be absolutely
+ sure we don't create indexes that are too long. */
+ error = row_create_index_for_mysql(index, trx, field_lengths);
+
+ error = convert_error_code_to_mysql(error, flags, NULL);
+
+ my_free(field_lengths, MYF(0));
+
+ DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Creates an index in an InnoDB table when the user has not defined a
+primary index. */
+static
+int
+create_clustered_index_when_no_primary(
+/*===================================*/
+ trx_t* trx, /*!< in: InnoDB transaction handle */
+ ulint flags, /*!< in: InnoDB table flags */
+ const char* table_name) /*!< in: table name */
+{
+ dict_index_t* index;
+ int error;
+
+ /* We pass 0 as the space id, and determine at a lower level the space
+ id where to store the table */
+ index = dict_mem_index_create(table_name,
+ innobase_index_reserve_name,
+ 0, DICT_CLUSTERED, 0);
+
+ error = row_create_index_for_mysql(index, trx, NULL);
+
+ error = convert_error_code_to_mysql(error, flags, NULL);
+
+ return(error);
+}
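+
+/* Note added for clarity (not in the original source): the generated
+clustered index uses the reserved name referred to above as
+innobase_index_reserve_name ("GEN_CLUST_INDEX", per the assertion in
+create_index()), and the rows are ordered by the internally generated
+row id, as described at the call site in ha_innobase::create(). */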
+
+/*****************************************************************//**
+Validates the create options. We may build on this function
+in the future. For now, it checks two specifiers:
+KEY_BLOCK_SIZE and ROW_FORMAT.
+If innodb_strict_mode is not set, this function is a no-op.
+@return TRUE if valid. */
+static
+ibool
+create_options_are_valid(
+/*=====================*/
+ THD* thd, /*!< in: connection thread. */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info) /*!< in: create info. */
+{
+ ibool kbs_specified = FALSE;
+ ibool ret = TRUE;
+
+
+ ut_ad(thd != NULL);
+
+ /* If innodb_strict_mode is not set don't do any validation. */
+ if (!(THDVAR(thd, strict_mode))) {
+ return(TRUE);
+ }
+
+ ut_ad(form != NULL);
+ ut_ad(create_info != NULL);
+
+ /* First check if KEY_BLOCK_SIZE was specified. */
+ if (create_info->key_block_size
+ || (create_info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+
+ kbs_specified = TRUE;
+ switch (create_info->key_block_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ case 16:
+ /* Valid value. */
+ break;
+ default:
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: invalid"
+ " KEY_BLOCK_SIZE = %lu."
+ " Valid values are"
+ " [1, 2, 4, 8, 16]",
+ create_info->key_block_size);
+ ret = FALSE;
+ }
+ }
+
+ /* If KEY_BLOCK_SIZE was specified, check for its
+ dependencies. */
+ if (kbs_specified && !srv_file_per_table) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE"
+ " requires innodb_file_per_table.");
+ ret = FALSE;
+ }
+
+ if (kbs_specified && srv_file_format < DICT_TF_FORMAT_ZIP) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE"
+ " requires innodb_file_format >"
+ " Antelope.");
+ ret = FALSE;
+ }
+
+ /* Now check for ROW_FORMAT specifier. */
+ if (create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) {
+ switch (form->s->row_type) {
+ const char* row_format_name;
+ case ROW_TYPE_COMPRESSED:
+ case ROW_TYPE_DYNAMIC:
+ row_format_name
+ = form->s->row_type == ROW_TYPE_COMPRESSED
+ ? "COMPRESSED"
+ : "DYNAMIC";
+
+ /* These two ROW_FORMATs require
+ srv_file_per_table and srv_file_format */
+ if (!srv_file_per_table) {
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s"
+ " requires innodb_file_per_table.",
+ row_format_name);
+ ret = FALSE;
+
+ }
+
+ if (srv_file_format < DICT_TF_FORMAT_ZIP) {
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s"
+ " requires innodb_file_format >"
+ " Antelope.",
+ row_format_name);
+ ret = FALSE;
+ }
+
+ /* Cannot specify KEY_BLOCK_SIZE with
+ ROW_FORMAT = DYNAMIC.
+ However, we do allow COMPRESSED to be
+ specified with KEY_BLOCK_SIZE. */
+ if (kbs_specified
+ && form->s->row_type == ROW_TYPE_DYNAMIC) {
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: cannot specify"
+ " ROW_FORMAT = DYNAMIC with"
+ " KEY_BLOCK_SIZE.");
+ ret = FALSE;
+ }
+
+ break;
+
+ case ROW_TYPE_REDUNDANT:
+ case ROW_TYPE_COMPACT:
+ case ROW_TYPE_DEFAULT:
+ /* Default is COMPACT. */
+ row_format_name
+ = form->s->row_type == ROW_TYPE_REDUNDANT
+ ? "REDUNDANT"
+ : "COMPACT";
+
+ /* Cannot specify KEY_BLOCK_SIZE with these
+ format specifiers. */
+ if (kbs_specified) {
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: cannot specify"
+ " ROW_FORMAT = %s with"
+ " KEY_BLOCK_SIZE.",
+ row_format_name);
+ ret = FALSE;
+ }
+
+ break;
+
+ default:
+ push_warning(thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: invalid ROW_FORMAT specifier.");
+ ret = FALSE;
+
+ }
+ }
+
+ return(ret);
+}
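+
+/* Illustrative example (added; not part of the original source): with
+innodb_strict_mode enabled, a hypothetical statement such as
+    CREATE TABLE t (a INT) ENGINE=InnoDB
+        ROW_FORMAT=DYNAMIC KEY_BLOCK_SIZE=8;
+would make this function push the "cannot specify ROW_FORMAT = DYNAMIC
+with KEY_BLOCK_SIZE" warning and return FALSE, which in turn makes
+ha_innobase::create() fail with ER_ILLEGAL_HA_CREATE_OPTION. */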
+
+/*****************************************************************//**
+Update create_info. Used in SHOW CREATE TABLE et al. */
+UNIV_INTERN
+void
+ha_innobase::update_create_info(
+/*============================*/
+ HA_CREATE_INFO* create_info) /*!< in/out: create info */
+{
+ if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
+ ha_innobase::info(HA_STATUS_AUTO);
+ create_info->auto_increment_value = stats.auto_increment_value;
+ }
+}
+
+/*****************************************************************//**
+Creates a new table in an InnoDB database.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::create(
+/*================*/
+ const char* name, /*!< in: table name */
+ TABLE* form, /*!< in: information on table
+ columns and indexes */
+ HA_CREATE_INFO* create_info) /*!< in: more information of the
+ created table, contains also the
+ create statement string */
+{
+ int error;
+ dict_table_t* innobase_table;
+ trx_t* parent_trx;
+ trx_t* trx;
+ int primary_key_no;
+ uint i;
+ char name2[FN_REFLEN];
+ char norm_name[FN_REFLEN];
+ THD* thd = ha_thd();
+ ib_int64_t auto_inc_value;
+ ulint flags;
+ /* Cache the value of innodb_file_format, in case it is
+ modified by another thread while the table is being created. */
+ const ulint file_format = srv_file_format;
+ const char* stmt;
+ size_t stmt_len;
+ enum row_type row_type;
+
+ DBUG_ENTER("ha_innobase::create");
+
+ DBUG_ASSERT(thd != NULL);
+ DBUG_ASSERT(create_info != NULL);
+
+#ifdef __WIN__
+ /* Names passed in from the server are in two formats:
+ 1. <database_name>/<table_name>: for normal table creation
+ 2. full path: for temporary table creation, or a symbolic link
+
+ When srv_file_per_table is on and mysqld_embedded is off,
+ check for the full path pattern, i.e.
+ X:\dir\..., where X is a drive letter, or
+ \\dir1\dir2\..., a UNC path,
+ and return an error if the name is in full path format but we are
+ not creating a temporary table. Currently InnoDB does not support
+ symbolic links on Windows. */
+
+ if (srv_file_per_table
+ && !mysqld_embedded
+	    && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
+
+ if ((name[1] == ':')
+ || (name[0] == '\\' && name[1] == '\\')) {
+ sql_print_error("Cannot create table %s\n", name);
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+ }
+#endif
+
+ if (form->s->stored_fields > 1000) {
+ /* The limit probably should be REC_MAX_N_FIELDS - 3 = 1020,
+ but we play safe here */
+
+ DBUG_RETURN(HA_ERR_TO_BIG_ROW);
+ }
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ if (lower_case_table_names) {
+ srv_lower_case_table_names = TRUE;
+ } else {
+ srv_lower_case_table_names = FALSE;
+ }
+
+ strcpy(name2, name);
+
+ normalize_table_name(norm_name, name2);
+
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during a table create operation.
+ Drop table etc. do this latching in row0mysql.c. */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ /* Create the table definition in InnoDB */
+
+ flags = 0;
+
+ /* Validate create options if innodb_strict_mode is set. */
+ if (!create_options_are_valid(thd, form, create_info)) {
+ error = ER_ILLEGAL_HA_CREATE_OPTION;
+ goto cleanup;
+ }
+
+ if (create_info->key_block_size
+ || (create_info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+ /* Determine the page_zip.ssize corresponding to the
+ requested page size (key_block_size) in kilobytes. */
+
+ ulint ssize, ksize;
+ ulint key_block_size = create_info->key_block_size;
+
+ for (ssize = ksize = 1; ssize <= DICT_TF_ZSSIZE_MAX;
+ ssize++, ksize <<= 1) {
+ if (key_block_size == ksize) {
+ flags = ssize << DICT_TF_ZSSIZE_SHIFT
+ | DICT_TF_COMPACT
+ | DICT_TF_FORMAT_ZIP
+ << DICT_TF_FORMAT_SHIFT;
+ break;
+ }
+ }
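+
+		/* Illustrative mapping (added comment, not in the original
+		source): because ksize doubles while ssize increments, the
+		loop above resolves KEY_BLOCK_SIZE=1,2,4,8,16 to
+		page_zip.ssize=1,2,3,4,5 respectively; any other value leaves
+		flags == 0 and is reported as ignored further below. */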
+
+ if (!srv_file_per_table) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE"
+ " requires innodb_file_per_table.");
+ flags = 0;
+ }
+
+ if (file_format < DICT_TF_FORMAT_ZIP) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: KEY_BLOCK_SIZE"
+ " requires innodb_file_format >"
+ " Antelope.");
+ flags = 0;
+ }
+
+ if (!flags) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ignoring"
+ " KEY_BLOCK_SIZE=%lu.",
+ create_info->key_block_size);
+ }
+ }
+
+ row_type = form->s->row_type;
+
+ if (flags) {
+ /* KEY_BLOCK_SIZE was specified. */
+ if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
+ /* ROW_FORMAT was not specified;
+ default to ROW_FORMAT=COMPRESSED */
+ row_type = ROW_TYPE_COMPRESSED;
+ } else if (row_type != ROW_TYPE_COMPRESSED) {
+ /* ROW_FORMAT other than COMPRESSED
+ ignores KEY_BLOCK_SIZE. It does not
+ make sense to reject conflicting
+ KEY_BLOCK_SIZE and ROW_FORMAT, because
+ such combinations can be obtained
+ with ALTER TABLE anyway. */
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ignoring KEY_BLOCK_SIZE=%lu"
+ " unless ROW_FORMAT=COMPRESSED.",
+ create_info->key_block_size);
+ flags = 0;
+ }
+ } else {
+ /* No KEY_BLOCK_SIZE */
+ if (row_type == ROW_TYPE_COMPRESSED) {
+ /* ROW_FORMAT=COMPRESSED without
+ KEY_BLOCK_SIZE implies half the
+ maximum KEY_BLOCK_SIZE. */
+ flags = (DICT_TF_ZSSIZE_MAX - 1)
+ << DICT_TF_ZSSIZE_SHIFT
+ | DICT_TF_COMPACT
+ | DICT_TF_FORMAT_ZIP
+ << DICT_TF_FORMAT_SHIFT;
+//#if DICT_TF_ZSSIZE_MAX < 1
+//# error "DICT_TF_ZSSIZE_MAX < 1"
+//#endif
+ }
+ }
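+
+	/* Added explanatory comment (not in the original source): since
+	ksize doubles per ssize step in the loop earlier in this function,
+	DICT_TF_ZSSIZE_MAX - 1 corresponds to half of the largest supported
+	KEY_BLOCK_SIZE; assuming the usual 16K maximum, ROW_FORMAT=COMPRESSED
+	without KEY_BLOCK_SIZE therefore behaves like KEY_BLOCK_SIZE=8. */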
+
+ switch (row_type) {
+ const char* row_format_name;
+ case ROW_TYPE_REDUNDANT:
+ break;
+ case ROW_TYPE_COMPRESSED:
+ case ROW_TYPE_DYNAMIC:
+ row_format_name
+ = row_type == ROW_TYPE_COMPRESSED
+ ? "COMPRESSED"
+ : "DYNAMIC";
+
+ if (!srv_file_per_table) {
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s"
+ " requires innodb_file_per_table.",
+ row_format_name);
+ } else if (file_format < DICT_TF_FORMAT_ZIP) {
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: ROW_FORMAT=%s"
+ " requires innodb_file_format >"
+ " Antelope.",
+ row_format_name);
+ } else {
+ flags |= DICT_TF_COMPACT
+ | (DICT_TF_FORMAT_ZIP
+ << DICT_TF_FORMAT_SHIFT);
+ break;
+ }
+
+ /* fall through */
+ case ROW_TYPE_NOT_USED:
+ case ROW_TYPE_FIXED:
+ default:
+ push_warning(thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_ILLEGAL_HA_CREATE_OPTION,
+ "InnoDB: assuming ROW_FORMAT=COMPACT.");
+ case ROW_TYPE_DEFAULT:
+ case ROW_TYPE_COMPACT:
+ flags = DICT_TF_COMPACT;
+ break;
+ }
+
+ /* Look for a primary key */
+
+ primary_key_no= (form->s->primary_key != MAX_KEY ?
+ (int) form->s->primary_key :
+ -1);
+
+ /* Our function innobase_get_mysql_key_number_for_index assumes
+ the primary key is always number 0, if it exists */
+
+ ut_a(primary_key_no == -1 || primary_key_no == 0);
+
+ /* Check for name conflicts (with reserved name) for
+ any user indices to be created. */
+ if (innobase_index_name_is_reserved(trx, form->key_info,
+ form->s->keys)) {
+ error = -1;
+ goto cleanup;
+ }
+
+ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+ flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
+ }
+
+ error = create_table_def(trx, form, norm_name,
+ create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
+ flags);
+
+ if (error) {
+ goto cleanup;
+ }
+
+
+ /* Create the keys */
+
+ if (form->s->keys == 0 || primary_key_no == -1) {
+ /* Create an index which is used as the clustered index;
+ order the rows by their row id which is internally generated
+ by InnoDB */
+
+ error = create_clustered_index_when_no_primary(
+ trx, flags, norm_name);
+ if (error) {
+ goto cleanup;
+ }
+ }
+
+ if (primary_key_no != -1) {
+ /* In InnoDB the clustered index must always be created
+ first */
+ if ((error = create_index(trx, form, flags, norm_name,
+ (uint) primary_key_no))) {
+ goto cleanup;
+ }
+ }
+
+ for (i = 0; i < form->s->keys; i++) {
+
+ if (i != (uint) primary_key_no) {
+
+ if ((error = create_index(trx, form, flags, norm_name,
+ i))) {
+ goto cleanup;
+ }
+ }
+ }
+
+ stmt = innobase_get_stmt(thd, &stmt_len);
+
+ if (stmt) {
+ error = row_table_add_foreign_constraints(
+ trx, stmt, stmt_len, norm_name,
+ create_info->options & HA_LEX_CREATE_TMP_TABLE);
+
+ error = convert_error_code_to_mysql(error, flags, NULL);
+
+ if (error) {
+ goto cleanup;
+ }
+ }
+
+ innobase_commit_low(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ innobase_table = dict_table_get(norm_name, FALSE);
+
+ DBUG_ASSERT(innobase_table != 0);
+
+ if (innobase_table) {
+ /* We update the highest file format in the system table
+ space, if this table has higher file format setting. */
+
+ trx_sys_file_format_max_upgrade(
+ (const char**) &innobase_file_format_check,
+ dict_table_get_format(innobase_table));
+ }
+
+ /* Note: We can't call update_thd() as prebuilt will not be
+ setup at this stage and so we use thd. */
+
+ /* We need to copy the AUTOINC value from the old table if
+ this is an ALTER TABLE or CREATE INDEX because CREATE INDEX
+ does a table copy too. */
+
+ if (((create_info->used_fields & HA_CREATE_USED_AUTO)
+ || thd_sql_command(thd) == SQLCOM_ALTER_TABLE
+ || thd_sql_command(thd) == SQLCOM_CREATE_INDEX)
+ && create_info->auto_increment_value > 0) {
+
+ /* Query was one of :
+ CREATE TABLE ...AUTO_INCREMENT = x; or
+ ALTER TABLE...AUTO_INCREMENT = x; or
+ CREATE INDEX x on t(...);
+ Find out a table definition from the dictionary and get
+ the current value of the auto increment field. Set a new
+ value to the auto increment field if the value is greater
+ than the maximum value in the column. */
+
+ auto_inc_value = create_info->auto_increment_value;
+
+ dict_table_autoinc_lock(innobase_table);
+ dict_table_autoinc_initialize(innobase_table, auto_inc_value);
+ dict_table_autoinc_unlock(innobase_table);
+ }
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(0);
+
+cleanup:
+ innobase_commit_low(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+
+ DBUG_RETURN(error);
+}
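+
+/* Summary note (added for readability; not in the original source):
+ha_innobase::create() proceeds as follows: validate the create options,
+derive the table flags from KEY_BLOCK_SIZE/ROW_FORMAT, create the table
+definition, create the clustered index first (generating one if no
+primary key exists), create the secondary indexes, add any foreign key
+constraints parsed from the CREATE statement, commit, and flush the log. */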
+
+/*****************************************************************//**
+Discards or imports an InnoDB tablespace.
+@return 0 == success, -1 == error */
+UNIV_INTERN
+int
+ha_innobase::discard_or_import_tablespace(
+/*======================================*/
+ my_bool discard) /*!< in: TRUE if discard, else import */
+{
+ dict_table_t* dict_table;
+ trx_t* trx;
+ int err;
+
+ DBUG_ENTER("ha_innobase::discard_or_import_tablespace");
+
+ ut_a(prebuilt->trx);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ dict_table = prebuilt->table;
+ trx = prebuilt->trx;
+
+ if (discard) {
+ err = row_discard_tablespace_for_mysql(dict_table->name, trx);
+ } else {
+ err = row_import_tablespace_for_mysql(dict_table->name, trx);
+ }
+
+ err = convert_error_code_to_mysql(err, dict_table->flags, NULL);
+
+ DBUG_RETURN(err);
+}
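+
+/* Usage note (added; not part of the original source): this handler
+method is reached through statements of the form
+    ALTER TABLE t DISCARD TABLESPACE;
+    ALTER TABLE t IMPORT TABLESPACE;
+where the 'discard' argument distinguishes the two cases. */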
+
+/*****************************************************************//**
+Deletes all rows of an InnoDB table.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::delete_all_rows(void)
+/*==============================*/
+{
+ int error;
+
+ DBUG_ENTER("ha_innobase::delete_all_rows");
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created, and update prebuilt->trx */
+
+ update_thd(ha_thd());
+
+ if (thd_sql_command(user_thd) != SQLCOM_TRUNCATE) {
+ fallback:
+ /* We only handle TRUNCATE TABLE t as a special case.
+ DELETE FROM t will have to use ha_innobase::delete_row(),
+ because DELETE is transactional while TRUNCATE is not. */
+ DBUG_RETURN(my_errno=HA_ERR_WRONG_COMMAND);
+ }
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ /* Truncate the table in InnoDB */
+
+ error = row_truncate_table_for_mysql(prebuilt->table, prebuilt->trx);
+ if (error == DB_ERROR) {
+ /* Cannot truncate; resort to ha_innobase::delete_row() */
+ goto fallback;
+ }
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ error = convert_error_code_to_mysql(error, prebuilt->table->flags,
+ NULL);
+
+ DBUG_RETURN(error);
+}
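+
+/* Note added for clarity (not in the original source): only
+TRUNCATE TABLE reaches the fast path above; for any other command,
+or when row_truncate_table_for_mysql() returns DB_ERROR, the method
+returns HA_ERR_WRONG_COMMAND so that the server falls back to deleting
+rows one by one through ha_innobase::delete_row(). */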
+
+/*****************************************************************//**
+Drops a table from an InnoDB database. Before calling this function,
+MySQL calls innobase_commit to commit the transaction of the current user.
+Then the current user cannot have locks set on the table. Drop table
+operation inside InnoDB will remove all locks any user has on the table
+inside InnoDB.
+@return error number */
+UNIV_INTERN
+int
+ha_innobase::delete_table(
+/*======================*/
+ const char* name) /*!< in: table name */
+{
+ ulint name_len;
+ int error;
+ trx_t* parent_trx;
+ trx_t* trx;
+ THD *thd = ha_thd();
+ char norm_name[1000];
+
+ DBUG_ENTER("ha_innobase::delete_table");
+
+ /* Strangely, MySQL passes the table name without the '.frm'
+ extension, in contrast to ::create */
+ normalize_table_name(norm_name, name);
+
+ if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) {
+ DBUG_RETURN(HA_ERR_GENERIC);
+ }
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ if (lower_case_table_names) {
+ srv_lower_case_table_names = TRUE;
+ } else {
+ srv_lower_case_table_names = FALSE;
+ }
+
+ name_len = strlen(name);
+
+ ut_a(name_len < 1000);
+
+ /* Drop the table in InnoDB */
+
+ error = row_drop_table_for_mysql(norm_name, trx,
+ thd_sql_command(thd)
+ == SQLCOM_DROP_DB);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ innobase_commit_low(trx);
+
+ trx_free_for_mysql(trx);
+
+ error = convert_error_code_to_mysql(error, 0, NULL);
+
+ DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+ handlerton *hton, /*!< in: handlerton of Innodb */
+ char* path) /*!< in: database path; inside InnoDB the name
+ of the last directory in the path is used as
+ the database name: for example, in 'mysql/data/test'
+ the database name is 'test' */
+{
+ ulint len = 0;
+ trx_t* trx;
+ char* ptr;
+ int error;
+ char* namebuf;
+ THD* thd = current_thd;
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* In the Windows plugin, thd = current_thd is always NULL */
+ if (thd) {
+ trx_t* parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT
+ query, release possible adaptive hash latch to avoid
+ deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+ }
+
+ ptr = strend(path) - 2;
+
+ while (ptr >= path && *ptr != '\\' && *ptr != '/') {
+ ptr--;
+ len++;
+ }
+
+ ptr++;
+ namebuf = (char*) my_malloc((uint) len + 2, MYF(0));
+
+ memcpy(namebuf, ptr, len);
+ namebuf[len] = '/';
+ namebuf[len + 1] = '\0';
+#ifdef __WIN__
+ innobase_casedn_str(namebuf);
+#endif
+#if defined __WIN__ && !defined MYSQL_SERVER
+ /* In the Windows plugin, thd = current_thd is always NULL */
+ trx = trx_allocate_for_mysql();
+ trx->mysql_thd = NULL;
+#else
+ trx = innobase_trx_allocate(thd);
+#endif
+ error = row_drop_database_for_mysql(namebuf, trx);
+ my_free(namebuf, MYF(0));
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+}
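+
+/* Note added for clarity (not in the original source): the code above
+extracts the last directory component of 'path' and appends a '/', so a
+hypothetical path such as '.../data/test' yields the InnoDB database
+prefix "test/", which row_drop_database_for_mysql() then matches against
+table names of the form "test/t1". */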
+/*********************************************************************//**
+Renames an InnoDB table.
+@return 0 or error code */
+static
+int
+innobase_rename_table(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* from, /*!< in: old name of the table */
+ const char* to, /*!< in: new name of the table */
+ ibool lock_and_commit)
+ /*!< in: TRUE=lock data dictionary and commit */
+{
+ int error;
+ char* norm_to;
+ char* norm_from;
+ DBUG_ENTER("innobase_rename_table");
+
+ if (lower_case_table_names) {
+ srv_lower_case_table_names = TRUE;
+ } else {
+ srv_lower_case_table_names = FALSE;
+ }
+
+	// The extra 64 bytes is an arbitrary safety margin for the normalized names
+ norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0));
+ norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0));
+
+ normalize_table_name(norm_to, to);
+ normalize_table_name(norm_from, from);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ if (lock_and_commit) {
+ row_mysql_lock_data_dictionary(trx);
+ }
+
+ error = row_rename_table_for_mysql(
+ norm_from, norm_to, trx, lock_and_commit);
+
+ if (error != DB_SUCCESS) {
+ FILE* ef = dict_foreign_err_file;
+
+ DBUG_PRINT("info", ("rename failed: %d", error));
+ fputs("InnoDB: Renaming table ", ef);
+ ut_print_name(ef, trx, TRUE, norm_from);
+ fputs(" to ", ef);
+ ut_print_name(ef, trx, TRUE, norm_to);
+ fputs(" failed!\n", ef);
+ }
+
+ if (lock_and_commit) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ /* Flush the log to reduce probability that the .frm
+ files and the InnoDB data dictionary get out-of-sync
+ if the user runs with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+ }
+
+ my_free(norm_to, MYF(0));
+ my_free(norm_from, MYF(0));
+
+ DBUG_RETURN(error);
+}
+/*********************************************************************//**
+Renames an InnoDB table.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::rename_table(
+/*======================*/
+ const char* from, /*!< in: old name of the table */
+ const char* to) /*!< in: new name of the table */
+{
+ trx_t* trx;
+ int error;
+ trx_t* parent_trx;
+ THD* thd = ha_thd();
+
+ DBUG_ENTER("ha_innobase::rename_table");
+
+ /* Get the transaction associated with the current thd, or create one
+ if not yet created */
+
+ parent_trx = check_trx_exists(thd);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(parent_trx);
+
+ trx = innobase_trx_allocate(thd);
+
+ error = innobase_rename_table(trx, from, to, TRUE);
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ innobase_commit_low(trx);
+ trx_free_for_mysql(trx);
+
+ /* Add a special case to handle the Duplicated Key error
+ and return DB_ERROR instead.
+ This is to avoid a possible SIGSEGV error from mysql error
+ handling code. Currently, mysql handles the Duplicated Key
+ error by re-entering the storage layer and getting dup key
+ info by calling get_dup_key(). This operation requires a valid
+ table handle ('row_prebuilt_t' structure) which could no
+ longer be available in the error handling stage. The suggested
+ solution is to report a 'table exists' error message (since
+ the dup key error here is due to an existing table whose name
+ is the one we are trying to rename to) and return the generic
+ error code. */
+ if (error == (int) DB_DUPLICATE_KEY) {
+ my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
+
+ error = DB_ERROR;
+ }
+
+ error = convert_error_code_to_mysql(error, 0, NULL);
+
+ DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Estimates the number of index records in a range.
+@return estimated number of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+ uint keynr, /*!< in: index number */
+ key_range *min_key, /*!< in: start key value of the
+ range, may also be 0 */
+ key_range *max_key) /*!< in: range end key val, may
+ also be 0 */
+{
+ KEY* key;
+ dict_index_t* index;
+ uchar* key_val_buff2 = (uchar*) my_malloc(
+ table->s->stored_rec_length
+ + table->s->max_key_length + 100,
+ MYF(MY_FAE));
+ ulint buff2_len = table->s->stored_rec_length
+ + table->s->max_key_length + 100;
+ dtuple_t* range_start;
+ dtuple_t* range_end;
+ ib_int64_t n_rows;
+ ulint mode1;
+ ulint mode2;
+ mem_heap_t* heap;
+
+ DBUG_ENTER("records_in_range");
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ prebuilt->trx->op_info = (char*)"estimating records in index range";
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ active_index = keynr;
+
+ key = table->key_info + active_index;
+
+ index = innobase_get_index(keynr);
+
+	/* It is possible that the requested index cannot be found, due to
+	an inconsistency between the MySQL and InnoDB dictionary info.
+	The necessary message should already have been printed in
+	innobase_get_index(). */
+ if (UNIV_UNLIKELY(!index)) {
+ n_rows = HA_POS_ERROR;
+ goto func_exit;
+ }
+ if (UNIV_UNLIKELY(!row_merge_is_index_usable(prebuilt->trx, index))) {
+ n_rows = HA_ERR_TABLE_DEF_CHANGED;
+ goto func_exit;
+ }
+
+ heap = mem_heap_create(2 * (key->key_parts * sizeof(dfield_t)
+ + sizeof(dtuple_t)));
+
+ range_start = dtuple_create(heap, key->key_parts);
+ dict_index_copy_types(range_start, index, key->key_parts);
+
+ range_end = dtuple_create(heap, key->key_parts);
+ dict_index_copy_types(range_end, index, key->key_parts);
+
+ row_sel_convert_mysql_key_to_innobase(
+ range_start, (byte*) key_val_buff,
+ (ulint)upd_and_key_val_buff_len,
+ index,
+ (byte*) (min_key ? min_key->key :
+ (const uchar*) 0),
+ (ulint) (min_key ? min_key->length : 0),
+ prebuilt->trx);
+
+ row_sel_convert_mysql_key_to_innobase(
+ range_end, (byte*) key_val_buff2,
+ buff2_len, index,
+ (byte*) (max_key ? max_key->key :
+ (const uchar*) 0),
+ (ulint) (max_key ? max_key->length : 0),
+ prebuilt->trx);
+
+ mode1 = convert_search_mode_to_innobase(min_key ? min_key->flag :
+ HA_READ_KEY_EXACT);
+ mode2 = convert_search_mode_to_innobase(max_key ? max_key->flag :
+ HA_READ_KEY_EXACT);
+
+ if (mode1 != PAGE_CUR_UNSUPP && mode2 != PAGE_CUR_UNSUPP) {
+
+ n_rows = btr_estimate_n_rows_in_range(index, range_start,
+ mode1, range_end,
+ mode2);
+ } else {
+
+ n_rows = HA_POS_ERROR;
+ }
+
+ mem_heap_free(heap);
+
+func_exit:
+ my_free(key_val_buff2, MYF(0));
+
+ prebuilt->trx->op_info = (char*)"";
+
+ /* The MySQL optimizer seems to believe an estimate of 0 rows is
+ always accurate and may return the result 'Empty set' based on that.
+ The accuracy is not guaranteed, and even if it were, for a locking
+ read we should anyway perform the search to set the next-key lock.
+ Add 1 to the value to make sure MySQL does not make the assumption! */
+
+ if (n_rows == 0) {
+ n_rows = 1;
+ }
+
+ DBUG_RETURN((ha_rows) n_rows);
+}
+
+/*********************************************************************//**
+Gives an UPPER BOUND to the number of rows in a table. This is used in
+filesort.cc.
+@return upper bound of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::estimate_rows_upper_bound(void)
+/*======================================*/
+{
+ dict_index_t* index;
+ ulonglong estimate;
+ ulonglong local_data_file_length;
+
+ DBUG_ENTER("estimate_rows_upper_bound");
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = (char*)
+ "calculating upper bound for table rows";
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ index = dict_table_get_first_index(prebuilt->table);
+
+ ut_a(index->stat_n_leaf_pages > 0);
+
+ local_data_file_length =
+ ((ulonglong) index->stat_n_leaf_pages) * UNIV_PAGE_SIZE;
+
+
+ /* Calculate a minimum length for a clustered index record and from
+ that an upper bound for the number of rows. Since we only calculate
+ new statistics in row0mysql.c when a table has grown by a threshold
+ factor, we must add a safety factor 2 in front of the formula below. */
+
+ estimate = 2 * local_data_file_length /
+ dict_index_calc_min_rec_len(index);
+
+ prebuilt->trx->op_info = (char*)"";
+
+ DBUG_RETURN((ha_rows) estimate);
+}
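+
+/* Worked example (added; the numbers are hypothetical): with
+stat_n_leaf_pages = 1000, UNIV_PAGE_SIZE = 16384 and
+dict_index_calc_min_rec_len() returning 20 bytes, the bound would be
+2 * 1000 * 16384 / 20 = 1638400 rows. */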
+
+/*********************************************************************//**
+How many seeks it will take to read through the table. This is to be
+comparable to the number returned by records_in_range so that we can
+decide if we should scan the table or use keys.
+@return estimated time measured in disk seeks */
+UNIV_INTERN
+double
+ha_innobase::scan_time()
+/*====================*/
+{
+ /* Since MySQL seems to favor table scans too much over index
+ searches, we pretend that a sequential read takes the same time
+ as a random disk read, that is, we do not divide the following
+ by 10, which would be physically realistic. */
+
+ return((double) (prebuilt->table->stat_clustered_index_size));
+}
+
+/******************************************************************//**
+Calculates the time it takes to read a set of ranges through an index.
+This enables us to optimise reads for clustered indexes.
+@return estimated time measured in disk seeks */
+UNIV_INTERN
+double
+ha_innobase::read_time(
+/*===================*/
+ uint index, /*!< in: key number */
+ uint ranges, /*!< in: how many ranges */
+ ha_rows rows) /*!< in: estimated number of rows in the ranges */
+{
+ ha_rows total_rows;
+ double time_for_scan;
+
+ if (index != table->s->primary_key) {
+ /* Not clustered */
+ return(handler::read_time(index, ranges, rows));
+ }
+
+ if (rows <= 2) {
+
+ return((double) rows);
+ }
+
+ /* Assume that the read time is proportional to the scan time for all
+ rows + at most one seek per range. */
+
+ time_for_scan = scan_time();
+
+ if ((total_rows = estimate_rows_upper_bound()) < rows) {
+
+ return(time_for_scan);
+ }
+
+ return(ranges + (double) rows / (double) total_rows * time_for_scan);
+}
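+
+/* Worked example (added; the numbers are hypothetical): for a clustered
+index read of ranges = 2 and rows = 100 out of an estimated
+total_rows = 1000, with scan_time() = 50, the estimate is
+2 + 100.0 / 1000.0 * 50 = 7 seeks. */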
+
+UNIV_INTERN
+bool
+ha_innobase::is_corrupt() const
+{
+ if (share->ib_table)
+ return ((bool)share->ib_table->is_corrupt);
+ else
+ return (FALSE);
+}
+
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index. We
+first check the "index translation table" for a match of the index to get
+the index number. If there is no "index translation table", or the index
+cannot be found in it, we fall back to the traditional way of looping
+through the dict_index_t list to find a match. In that case, we have to
+take into account whether we generated a default clustered index for the
+table.
+@return the key number used inside MySQL */
+static
+unsigned int
+innobase_get_mysql_key_number_for_index(
+/*====================================*/
+ INNOBASE_SHARE* share, /*!< in: share structure for index
+ translation table. */
+ const TABLE* table, /*!< in: table in MySQL data
+ dictionary */
+ dict_table_t* ib_table,/*!< in: table in Innodb data
+ dictionary */
+ const dict_index_t* index) /*!< in: index */
+{
+ const dict_index_t* ind;
+ unsigned int i;
+
+ ut_ad(index);
+ ut_ad(ib_table);
+ ut_ad(table);
+ ut_ad(share);
+
+	/* If the index does not belong to the table of the share structure,
+	search index->table instead. */
+ if (index->table != ib_table) {
+ i = 0;
+ ind = dict_table_get_first_index(index->table);
+
+ while (index != ind) {
+ ind = dict_table_get_next_index(ind);
+ i++;
+ }
+
+ if (row_table_got_default_clust_index(index->table)) {
+ ut_a(i > 0);
+ i--;
+ }
+
+ return(i);
+ }
+
+ /* If index translation table exists, we will first check
+ the index through index translation table for a match. */
+ if (share->idx_trans_tbl.index_mapping) {
+ for (i = 0; i < share->idx_trans_tbl.index_count; i++) {
+ if (share->idx_trans_tbl.index_mapping[i] == index) {
+ return(i);
+ }
+ }
+
+		/* Print an error message if we cannot find the index
+		in the "index translation table". */
+ sql_print_error("Cannot find index %s in InnoDB index "
+ "translation table.", index->name);
+ }
+
+	/* If we do not have an "index translation table", or cannot find
+	the index in it, look directly for a matching index in the
+	dict_index_t list. */
+ for (i = 0; i < table->s->keys; i++) {
+ ind = dict_table_get_index_on_name(
+ ib_table, table->key_info[i].name);
+
+ if (index == ind) {
+ return(i);
+ }
+ }
+
+ sql_print_error("Cannot find matching index number for index %s "
+ "in InnoDB index list.", index->name);
+
+ return(0);
+}
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object. */
+UNIV_INTERN
+int
+ha_innobase::info(
+/*==============*/
+ uint flag) /*!< in: what information MySQL requests */
+{
+ dict_table_t* ib_table;
+ dict_index_t* index;
+ ha_rows rec_per_key;
+ ib_int64_t n_rows;
+ char path[FN_REFLEN];
+ os_file_stat_t stat_info;
+
+ DBUG_ENTER("info");
+
+ /* If we are forcing recovery at a high level, we will suppress
+ statistics calculation on tables, because that may crash the
+ server if an index is badly corrupted. */
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ prebuilt->trx->op_info = (char*)"returning various info to MySQL";
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ ib_table = prebuilt->table;
+
+ if (flag & HA_STATUS_TIME) {
+ if ((innobase_stats_on_metadata
+ || thd_sql_command(user_thd) == SQLCOM_ANALYZE)
+ && !share->ib_table->is_corrupt) {
+			/* In sql_show we call this with the flag set: update
+			the statistics then, so that they are up-to-date */
+
+ if (srv_use_sys_stats_table && !((ib_table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)
+ && thd_sql_command(user_thd) == SQLCOM_ANALYZE) {
+				/* If the indexes on the table do not have enough rows in the
+				SYS_STATS system table, the missing rows need to be created. */
+ dict_index_t* index;
+
+ prebuilt->trx->op_info = "confirming rows of SYS_STATS to store statistics";
+
+ ut_a(prebuilt->trx->conc_state == TRX_NOT_STARTED);
+
+ for (index = dict_table_get_first_index(ib_table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ row_insert_stats_for_mysql(index, prebuilt->trx);
+ innobase_commit_low(prebuilt->trx);
+ }
+
+ ut_a(prebuilt->trx->conc_state == TRX_NOT_STARTED);
+ }
+
+ prebuilt->trx->op_info = "updating table statistics";
+
+ dict_update_statistics(ib_table,
+ (thd_sql_command(user_thd) == SQLCOM_ANALYZE)?TRUE:FALSE);
+
+ prebuilt->trx->op_info = "returning various info to MySQL";
+ }
+
+ my_snprintf(path, sizeof(path), "%s/%s%s",
+ mysql_data_home, ib_table->name, reg_ext);
+
+ unpack_filename(path,path);
+
+ /* Note that we do not know the access time of the table,
+ nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
+
+ if (os_file_get_status(path,&stat_info)) {
+ stats.create_time = (ulong) stat_info.ctime;
+ }
+ }
+
+ if (flag & HA_STATUS_VARIABLE) {
+ n_rows = ib_table->stat_n_rows;
+
+ /* Because we do not protect stat_n_rows by any mutex in a
+ delete, it is theoretically possible that the value can be
+ smaller than zero! TODO: fix this race.
+
+ The MySQL optimizer seems to assume in a left join that n_rows
+ is an accurate estimate if it is zero. Of course, it is not,
+ since we do not have any locks on the rows yet at this phase.
+ Since SHOW TABLE STATUS seems to call this function with the
+ HA_STATUS_TIME flag set, while the left join optimizer does not
+ set that flag, we add one to a zero value if the flag is not
+ set. That way SHOW TABLE STATUS will show the best estimate,
+ while the optimizer never sees the table empty. */
+
+ if (n_rows < 0) {
+ n_rows = 0;
+ }
+
+ if (n_rows == 0 && !(flag & HA_STATUS_TIME)) {
+ n_rows++;
+ }
+
+ /* Fix bug#40386: Not flushing query cache after truncate.
+ n_rows can not be 0 unless the table is empty, set to 1
+ instead. The original problem of bug#29507 is actually
+ fixed in the server code. */
+ if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) {
+
+ n_rows = 1;
+
+ /* We need to reset the prebuilt value too, otherwise
+ checks for values greater than the last value written
+ to the table will fail and the autoinc counter will
+ not be updated. This will force write_row() into
+ attempting an update of the table's AUTOINC counter. */
+
+ prebuilt->autoinc_last_value = 0;
+ }
+
+ stats.records = (ha_rows)n_rows;
+ stats.deleted = 0;
+ stats.data_file_length = ((ulonglong)
+ ib_table->stat_clustered_index_size)
+ * UNIV_PAGE_SIZE;
+ stats.index_file_length = ((ulonglong)
+ ib_table->stat_sum_of_other_index_sizes)
+ * UNIV_PAGE_SIZE;
+
+ /* Since fsp_get_available_space_in_free_extents() is
+ acquiring latches inside InnoDB, we do not call it if we
+ are asked by MySQL to avoid locking. Another reason to
+ avoid the call is that it uses quite a lot of CPU.
+ See Bug#38185. */
+ if (flag & HA_STATUS_NO_LOCK) {
+ /* We do not update delete_length if no
+ locking is requested so the "old" value can
+ remain. delete_length is initialized to 0 in
+ the ha_statistics' constructor. */
+ } else if (UNIV_UNLIKELY
+ (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) {
+ /* Avoid accessing the tablespace if
+ innodb_crash_recovery is set to a high value. */
+ stats.delete_length = 0;
+ } else if (srv_stats_update_need_lock) {
+
+ /* lock the data dictionary to avoid races with
+ ibd_file_missing and tablespace_discarded */
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ /* ib_table->space must be an existent tablespace */
+ if (!ib_table->ibd_file_missing
+ && !ib_table->tablespace_discarded) {
+
+ stats.delete_length =
+ fsp_get_available_space_in_free_extents(
+ ib_table->space) * 1024;
+ } else {
+
+ THD* thd;
+
+ thd = ha_thd();
+
+ push_warning_printf(
+ thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_CANT_GET_STAT,
+ "InnoDB: Trying to get the free "
+ "space for table %s but its "
+ "tablespace has been discarded or "
+ "the .ibd file is missing. Setting "
+ "the free space to zero.",
+ ib_table->name);
+
+ stats.delete_length = 0;
+ }
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ }
+
+ stats.check_time = 0;
+ stats.mrr_length_per_rec= ref_length + 8; // 8 = max(sizeof(void *));
+
+ if (stats.records == 0) {
+ stats.mean_rec_length = 0;
+ } else {
+ stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records);
+ }
+ }
+
+ if (flag & HA_STATUS_CONST) {
+ ulong i;
+ /* Verify the number of index in InnoDB and MySQL
+ matches up. If prebuilt->clust_index_was_generated
+ holds, InnoDB defines GEN_CLUST_INDEX internally */
+ ulint num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
+ - prebuilt->clust_index_was_generated;
+
+ if (table->s->keys != num_innodb_index) {
+ sql_print_error("Table %s contains %lu "
+ "indexes inside InnoDB, which "
+ "is different from the number of "
+ "indexes %u defined in the MySQL ",
+ ib_table->name, num_innodb_index,
+ table->s->keys);
+ }
+
+ for (i = 0; i < table->s->keys; i++) {
+ ulong j;
+ /* We could get index quickly through internal
+ index mapping with the index translation table.
+ The identity of index (match up index name with
+ that of table->key_info[i]) is already verified in
+ innobase_get_index(). */
+ index = innobase_get_index(i);
+
+ if (index == NULL) {
+ sql_print_error("Table %s contains fewer "
+ "indexes inside InnoDB than "
+ "are defined in the MySQL "
+ ".frm file. Have you mixed up "
+ ".frm files from different "
+ "installations? See "
+ REFMAN
+ "innodb-troubleshooting.html\n",
+ ib_table->name);
+ break;
+ }
+
+ for (j = 0; j < table->key_info[i].key_parts; j++) {
+
+ if (j + 1 > index->n_uniq) {
+ sql_print_error(
+"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking "
+"statistics for %lu columns. Have you mixed up .frm files from different "
+"installations? "
+"See " REFMAN "innodb-troubleshooting.html\n",
+ index->name,
+ ib_table->name,
+ (unsigned long)
+ index->n_uniq, j + 1);
+ break;
+ }
+
+ dict_index_stat_mutex_enter(index);
+
+ if (index->stat_n_diff_key_vals[j + 1] == 0) {
+
+ rec_per_key = stats.records;
+ } else {
+ rec_per_key = (ha_rows)(stats.records /
+ index->stat_n_diff_key_vals[j + 1]);
+ }
+
+ dict_index_stat_mutex_exit(index);
+
+ /* Since MySQL seems to favor table scans
+ too much over index searches, we pretend
+ index selectivity is 2 times better than
+ our estimate: */
+
+ rec_per_key = rec_per_key / 2;
+
+ if (rec_per_key == 0) {
+ rec_per_key = 1;
+ }
+
+ table->key_info[i].rec_per_key[j]=
+ rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
+ (ulong) rec_per_key;
+ }
+ }
+ }
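+
+	/* Added explanatory comment (not in the original source): the loop
+	above reports rec_per_key ~= stats.records / stat_n_diff_key_vals,
+	halved to bias the optimizer towards index use, and clamped to a
+	minimum of 1. For example (hypothetical numbers), 10000 records with
+	500 distinct prefix values give 10000 / 500 / 2 = 10. */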
+
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+
+ goto func_exit;
+ }
+
+ if (flag & HA_STATUS_ERRKEY) {
+ const dict_index_t* err_index;
+
+ ut_a(prebuilt->trx);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+ err_index = trx_get_error_info(prebuilt->trx);
+
+ if (err_index) {
+ errkey = innobase_get_mysql_key_number_for_index(
+ share, table, ib_table, err_index);
+ } else {
+ errkey = (unsigned int) prebuilt->trx->error_key_num;
+ }
+ }
+
+ if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
+ stats.auto_increment_value = innobase_peek_autoinc();
+ }
+
+func_exit:
+ prebuilt->trx->op_info = (char*)"";
+
+ DBUG_RETURN(0);
+}
+
+/**********************************************************************//**
+Updates index cardinalities of the table, based on 8 random dives into
+each index tree. This does NOT calculate exact statistics on the table.
+@return always 0 (success) */
+UNIV_INTERN
+int
+ha_innobase::analyze(
+/*=================*/
+ THD* thd, /*!< in: connection thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: currently ignored */
+{
+ if (share->ib_table->is_corrupt) {
+ return(HA_ADMIN_CORRUPT);
+ }
+
+ /* Simply call ::info() with all the flags */
+ info(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE);
+
+ if (share->ib_table->is_corrupt) {
+ return(HA_ADMIN_CORRUPT);
+ }
+
+ return(0);
+}
+
+/**********************************************************************//**
+This is mapped to "ALTER TABLE tablename ENGINE=InnoDB", which rebuilds
+the table in MySQL. */
+UNIV_INTERN
+int
+ha_innobase::optimize(
+/*==================*/
+ THD* thd, /*!< in: connection thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: currently ignored */
+{
+ return(HA_ADMIN_TRY_ALTER);
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. If corruption is
+noticed, prints information about it to stderr. In case of corruption
+it may also trigger an assertion failure and crash the server.
+@return HA_ADMIN_CORRUPT or HA_ADMIN_OK */
+UNIV_INTERN
+int
+ha_innobase::check(
+/*===============*/
+ THD* thd, /*!< in: user thread handle */
+ HA_CHECK_OPT* check_opt) /*!< in: check options, currently
+ ignored */
+{
+ dict_index_t* index;
+ ulint n_rows;
+ ulint n_rows_in_table = ULINT_UNDEFINED;
+ ibool is_ok = TRUE;
+ ulint old_isolation_level;
+
+ DBUG_ENTER("ha_innobase::check");
+ DBUG_ASSERT(thd == ha_thd());
+ ut_a(prebuilt->trx);
+ ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+ ut_a(prebuilt->trx == thd_to_trx(thd));
+
+ if (prebuilt->mysql_template == NULL) {
+ /* Build the template; we will use a dummy template
+ in index scans done in checking */
+
+ build_template(prebuilt, NULL, table, this, ROW_MYSQL_WHOLE_ROW);
+ }
+
+ if (prebuilt->table->ibd_file_missing) {
+ sql_print_error("InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Please refer to\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ DBUG_RETURN(HA_ADMIN_CORRUPT);
+ }
+
+ prebuilt->trx->op_info = "checking table";
+
+ old_isolation_level = prebuilt->trx->isolation_level;
+
+	/* We must run the index record counts at an isolation level
+	>= READ COMMITTED, because a dirty read can see a wrong number
+	of records in some index; to play it safe, we always use
+	REPEATABLE READ here */
+
+ prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ /* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+#if 0
+ fputs("Validating index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ putc('\n', stderr);
+#endif
+
+ if (!btr_validate_index(index, prebuilt->trx)) {
+ is_ok = FALSE;
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index '%-.200s' is corrupted.",
+ index->name);
+ continue;
+ }
+
+ /* Instead of invoking change_active_index(), set up
+ a dummy template for non-locking reads, disabling
+ access to the clustered index. */
+ prebuilt->index = index;
+
+ prebuilt->index_usable = row_merge_is_index_usable(
+ prebuilt->trx, prebuilt->index);
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ HA_ERR_TABLE_DEF_CHANGED,
+ "InnoDB: Insufficient history for"
+ " index '%-.200s'",
+ index->name);
+ continue;
+ }
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+ prebuilt->n_template = 0;
+ prebuilt->need_to_access_clustered = FALSE;
+
+ dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+ prebuilt->select_lock_type = LOCK_NONE;
+
+ if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The B-tree of"
+ " index '%-.200s' is corrupted.",
+ index->name);
+ is_ok = FALSE;
+ }
+
+ if (thd_killed(user_thd)) {
+ break;
+ }
+
+#if 0
+ fprintf(stderr, "%lu entries in index %s\n", n_rows,
+ index->name);
+#endif
+
+ if (index == dict_table_get_first_index(prebuilt->table)) {
+ n_rows_in_table = n_rows;
+ } else if (n_rows != n_rows_in_table) {
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: Index '%-.200s'"
+ " contains %lu entries,"
+ " should be %lu.",
+ index->name,
+ (ulong) n_rows,
+ (ulong) n_rows_in_table);
+ is_ok = FALSE;
+ }
+ }
+
+ /* Restore the original isolation level */
+ prebuilt->trx->isolation_level = old_isolation_level;
+
+	/* We also validate the whole adaptive hash index for all tables
+	at every CHECK TABLE */
+
+ if (!btr_search_validate()) {
+ push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_NOT_KEYFILE,
+ "InnoDB: The adaptive hash index is corrupted.");
+ is_ok = FALSE;
+ }
+
+ /* Restore the fatal lock wait timeout after CHECK TABLE. */
+ mutex_enter(&kernel_mutex);
+ srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+ mutex_exit(&kernel_mutex);
+
+ prebuilt->trx->op_info = "";
+ if (thd_killed(user_thd)) {
+ my_error(ER_QUERY_INTERRUPTED, MYF(0));
+ }
+
+ if (share->ib_table->is_corrupt) {
+ return(HA_ADMIN_CORRUPT);
+ }
+
+ DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
+}
+
+/*************************************************************//**
+Adds information about free space in the InnoDB tablespace to a table comment
+which is printed out when a user calls SHOW TABLE STATUS. Also adds info on
+foreign keys.
+@return table comment + InnoDB free space + info on foreign keys */
+UNIV_INTERN
+char*
+ha_innobase::update_table_comment(
+/*==============================*/
+ const char* comment)/*!< in: table comment defined by user */
+{
+ uint length = (uint) strlen(comment);
+ char* str;
+ long flen;
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ if (length > 64000 - 3) {
+ return((char*)comment); /* string too long */
+ }
+
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = (char*)"returning table comment";
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+ str = NULL;
+
+ /* output the data to a temporary file */
+
+ mutex_enter(&srv_dict_tmpfile_mutex);
+ rewind(srv_dict_tmpfile);
+
+ fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB",
+ fsp_get_available_space_in_free_extents(
+ prebuilt->table->space));
+
+ dict_print_info_on_foreign_keys(FALSE, srv_dict_tmpfile,
+ prebuilt->trx, prebuilt->table);
+ flen = ftell(srv_dict_tmpfile);
+ if (flen < 0) {
+ flen = 0;
+ } else if (length + flen + 3 > 64000) {
+ flen = 64000 - 3 - length;
+ }
+
+ /* allocate buffer for the full string, and
+ read the contents of the temporary file */
+
+ str = (char*) my_malloc(length + flen + 3, MYF(0));
+
+ if (str) {
+ char* pos = str + length;
+ if (length) {
+ memcpy(str, comment, length);
+ *pos++ = ';';
+ *pos++ = ' ';
+ }
+ rewind(srv_dict_tmpfile);
+ flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile);
+ pos[flen] = 0;
+ }
+
+ mutex_exit(&srv_dict_tmpfile_mutex);
+
+ prebuilt->trx->op_info = (char*)"";
+
+ return(str ? str : (char*) comment);
+}
+
+/*******************************************************************//**
+Gets the foreign key create info for a table stored in InnoDB.
+@return own: character string in the form which can be inserted to the
+CREATE TABLE statement, MUST be freed with
+ha_innobase::free_foreign_key_create_info */
+UNIV_INTERN
+char*
+ha_innobase::get_foreign_key_create_info(void)
+/*==========================================*/
+{
+ char* str = 0;
+ long flen;
+
+ ut_a(prebuilt != NULL);
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(ha_thd());
+
+ prebuilt->trx->op_info = (char*)"getting info on foreign keys";
+
+ /* In case MySQL calls this in the middle of a SELECT query,
+ release possible adaptive hash latch to avoid
+ deadlocks of threads */
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+
+ mutex_enter(&srv_dict_tmpfile_mutex);
+ rewind(srv_dict_tmpfile);
+
+ /* output the data to a temporary file */
+ dict_print_info_on_foreign_keys(TRUE, srv_dict_tmpfile,
+ prebuilt->trx, prebuilt->table);
+ prebuilt->trx->op_info = (char*)"";
+
+ flen = ftell(srv_dict_tmpfile);
+ if (flen < 0) {
+ flen = 0;
+ } else if (flen > 64000 - 1) {
+ flen = 64000 - 1;
+ }
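+	/* (flen is capped at 64000 - 1 so that the buffer, including the
+	terminating NUL written below, stays within 64000 bytes.) */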
+
+ /* allocate buffer for the string, and
+ read the contents of the temporary file */
+
+ str = (char*) my_malloc(flen + 1, MYF(0));
+
+ if (str) {
+ rewind(srv_dict_tmpfile);
+ flen = (uint) fread(str, 1, flen, srv_dict_tmpfile);
+ str[flen] = 0;
+ }
+
+ mutex_exit(&srv_dict_tmpfile_mutex);
+
+ return(str);
+}
+
+
+UNIV_INTERN
+int
+ha_innobase::get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list)
+{
+ dict_foreign_t* foreign;
+
+ DBUG_ENTER("get_foreign_key_list");
+ ut_a(prebuilt != NULL);
+ update_thd(ha_thd());
+ prebuilt->trx->op_info = (char*)"getting list of foreign keys";
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+ mutex_enter(&(dict_sys->mutex));
+ foreign = UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
+
+ while (foreign != NULL) {
+ uint i;
+ FOREIGN_KEY_INFO f_key_info;
+ LEX_STRING *name= 0;
+ uint ulen;
+ char uname[NAME_LEN+1]; /* Unencoded name */
+ char db_name[NAME_LEN+1];
+ const char *tmp_buff;
+
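+		/* foreign->id is of the form "databasename/constraintname";
+		skip past the '/' so that only the constraint name is
+		reported to MySQL. */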
+ tmp_buff= foreign->id;
+ i= 0;
+ while (tmp_buff[i] != '/')
+ i++;
+ tmp_buff+= i + 1;
+ f_key_info.forein_id = thd_make_lex_string(thd, 0,
+ tmp_buff, (uint) strlen(tmp_buff), 1);
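+		/* (Note: "forein_id" is the actual spelling of this member
+		in MySQL's FOREIGN_KEY_INFO struct.) */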
+ tmp_buff= foreign->referenced_table_name;
+
+ /* Database name */
+ i= 0;
+ while (tmp_buff[i] != '/')
+ {
+ db_name[i]= tmp_buff[i];
+ i++;
+ }
+ db_name[i]= 0;
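+		/* filename_to_tablename() decodes the filesystem-safe
+		encoding used on disk back to the original name. */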
+ ulen= filename_to_tablename(db_name, uname, sizeof(uname));
+ f_key_info.referenced_db = thd_make_lex_string(thd, 0,
+ uname, ulen, 1);
+
+ /* Table name */
+ tmp_buff+= i + 1;
+ ulen= filename_to_tablename(tmp_buff, uname, sizeof(uname));
+ f_key_info.referenced_table = thd_make_lex_string(thd, 0,
+ uname, ulen, 1);
+
+ for (i= 0;;) {
+ tmp_buff= foreign->foreign_col_names[i];
+ name = thd_make_lex_string(thd, name,
+ tmp_buff, (uint) strlen(tmp_buff), 1);
+ f_key_info.foreign_fields.push_back(name);
+ tmp_buff= foreign->referenced_col_names[i];
+ name = thd_make_lex_string(thd, name,
+ tmp_buff, (uint) strlen(tmp_buff), 1);
+ f_key_info.referenced_fields.push_back(name);
+ if (++i >= foreign->n_fields)
+ break;
+ }
+
+ ulong length;
+ if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)
+ {
+ length=7;
+ tmp_buff= "CASCADE";
+ }
+ else if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+ {
+ length=8;
+ tmp_buff= "SET NULL";
+ }
+ else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION)
+ {
+ length=9;
+ tmp_buff= "NO ACTION";
+ }
+ else
+ {
+ length=8;
+ tmp_buff= "RESTRICT";
+ }
+ f_key_info.delete_method = thd_make_lex_string(
+ thd, f_key_info.delete_method, tmp_buff, length, 1);
+
+
+ if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)
+ {
+ length=7;
+ tmp_buff= "CASCADE";
+ }
+ else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)
+ {
+ length=8;
+ tmp_buff= "SET NULL";
+ }
+ else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION)
+ {
+ length=9;
+ tmp_buff= "NO ACTION";
+ }
+ else
+ {
+ length=8;
+ tmp_buff= "RESTRICT";
+ }
+ f_key_info.update_method = thd_make_lex_string(
+ thd, f_key_info.update_method, tmp_buff, length, 1);
+ if (foreign->referenced_index &&
+ foreign->referenced_index->name)
+ {
+ f_key_info.referenced_key_name = thd_make_lex_string(
+ thd, f_key_info.referenced_key_name,
+ foreign->referenced_index->name,
+ (uint) strlen(foreign->referenced_index->name), 1);
+ }
+ else
+ f_key_info.referenced_key_name= 0;
+
+ FOREIGN_KEY_INFO *pf_key_info = (FOREIGN_KEY_INFO *)
+ thd_memdup(thd, &f_key_info, sizeof(FOREIGN_KEY_INFO));
+ f_key_list->push_back(pf_key_info);
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+ mutex_exit(&(dict_sys->mutex));
+ prebuilt->trx->op_info = (char*)"";
+
+ DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Checks if ALTER TABLE may change the storage engine of the table.
+Changing storage engines is not allowed for tables for which there
+are foreign key constraints (parent or child tables).
+@return TRUE if can switch engines */
+UNIV_INTERN
+bool
+ha_innobase::can_switch_engines(void)
+/*=================================*/
+{
+ bool can_switch;
+
+ DBUG_ENTER("ha_innobase::can_switch_engines");
+
+ ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+ prebuilt->trx->op_info =
+ "determining if there are foreign key constraints";
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ can_switch = !UT_LIST_GET_FIRST(prebuilt->table->referenced_list)
+ && !UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ prebuilt->trx->op_info = "";
+
+ DBUG_RETURN(can_switch);
+}
+
+/*******************************************************************//**
+Checks if a table is referenced by a foreign key. The MySQL manual states that
+a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a
+delete is then allowed internally to resolve a duplicate key conflict in
+REPLACE, not an update.
+@return > 0 if referenced by a FOREIGN KEY */
+UNIV_INTERN
+uint
+ha_innobase::referenced_by_foreign_key(void)
+/*========================================*/
+{
+ if (dict_table_is_referenced_by_foreign_key(prebuilt->table)) {
+
+ return(1);
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Frees the foreign key create info for a table stored in InnoDB, if it is
+non-NULL. */
+UNIV_INTERN
+void
+ha_innobase::free_foreign_key_create_info(
+/*======================================*/
+ char* str) /*!< in, own: create info string to free */
+{
+ if (str) {
+ my_free(str, MYF(0));
+ }
+}
+
+/*******************************************************************//**
+Tells the handler something additional about how to do things.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::extra(
+/*===============*/
+ enum ha_extra_function operation)
+ /*!< in: HA_EXTRA_FLUSH or some other flag */
+{
+ /* Warning: since it is not sure that MySQL calls external_lock
+ before calling this function, the trx field in prebuilt can be
+ obsolete! */
+
+ switch (operation) {
+ case HA_EXTRA_FLUSH:
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+ break;
+ case HA_EXTRA_RESET_STATE:
+ reset_template(prebuilt);
+ /* Reset index condition pushdown state */
+ pushed_idx_cond= FALSE;
+ pushed_idx_cond_keyno= MAX_KEY;
+ prebuilt->idx_cond_func= NULL;
+ break;
+ case HA_EXTRA_NO_KEYREAD:
+ prebuilt->read_just_key = 0;
+ break;
+ case HA_EXTRA_KEYREAD:
+ prebuilt->read_just_key = 1;
+ break;
+ case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ prebuilt->keep_other_fields_on_keyread = 1;
+ break;
+
+ /* IMPORTANT: prebuilt->trx can be obsolete in
+ this method, because it is not sure that MySQL
+ calls external_lock before this method with the
+ parameters below. We must not invoke update_thd()
+ either, because the calling threads may change.
+ CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR! */
+ case HA_EXTRA_IGNORE_DUP_KEY:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+ break;
+ case HA_EXTRA_WRITE_CAN_REPLACE:
+ thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+ break;
+ case HA_EXTRA_WRITE_CANNOT_REPLACE:
+ thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+ break;
+ case HA_EXTRA_NO_IGNORE_DUP_KEY:
+ thd_to_trx(ha_thd())->duplicates &=
+ ~(TRX_DUP_IGNORE | TRX_DUP_REPLACE);
+ break;
+ default:/* Do nothing */
+ ;
+ }
+
+ return(0);
+}
+
+UNIV_INTERN
+int
+ha_innobase::reset()
+{
+ if (prebuilt->blob_heap) {
+ row_mysql_prebuilt_free_blob_heap(prebuilt);
+ }
+
+ reset_template(prebuilt);
+
+ /* Reset index condition pushdown state */
+ pushed_idx_cond_keyno= MAX_KEY;
+ pushed_idx_cond= NULL;
+ ds_mrr.dsmrr_close();
+ prebuilt->idx_cond_func= NULL;
+
+ /* TODO: This should really be reset in reset_template() but for now
+ it's safer to do it explicitly here. */
+
+ /* This is a statement level counter. */
+ prebuilt->autoinc_last_value = 0;
+
+ return(0);
+}
+
+/******************************************************************//**
+MySQL calls this function at the start of each SQL statement inside LOCK
+TABLES. Inside LOCK TABLES the ::external_lock method does not work to
+mark SQL statement borders. Note also a special case: if a temporary table
+is created inside LOCK TABLES, MySQL has not called external_lock() at all
+on that table.
+MySQL-5.0 also calls this before each statement in an execution of a stored
+procedure. To make the execution more deterministic for binlogging, MySQL-5.0
+locks all tables involved in a stored procedure with full explicit table
+locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
+procedure.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::start_stmt(
+/*====================*/
+ THD* thd, /*!< in: handle to the user thread */
+ thr_lock_type lock_type)
+{
+ trx_t* trx;
+
+ update_thd(thd);
+
+ trx = prebuilt->trx;
+
+ /* Here we release the search latch and the InnoDB thread FIFO ticket
+ if they were reserved. They should have been released already at the
+ end of the previous statement, but because inside LOCK TABLES the
+ lock count method does not work to mark the end of a SELECT statement,
+ that may not be the case. We MUST release the search latch before an
+ INSERT, for example. */
+
+ innobase_release_stat_resources(trx);
+
+ /* Reset the AUTOINC statement level counter for multi-row INSERTs. */
+ trx->n_autoinc_rows = 0;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+ reset_template(prebuilt);
+
+ if (!prebuilt->mysql_has_locked) {
+ /* This handle is for a temporary table created inside
+ this same LOCK TABLES; since MySQL does NOT call external_lock
+ in this case, we must use x-row locks inside InnoDB to be
+ prepared for an update of a row */
+
+ prebuilt->select_lock_type = LOCK_X;
+ } else {
+ if (trx->isolation_level != TRX_ISO_SERIALIZABLE
+ && thd_sql_command(thd) == SQLCOM_SELECT
+ && lock_type == TL_READ) {
+
+ /* For other than temporary tables, we obtain
+ no lock for consistent read (plain SELECT). */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ } else {
+ /* Not a consistent read: restore the
+ select_lock_type value. The value of
+ stored_select_lock_type was decided in:
+ 1) ::store_lock(),
+ 2) ::external_lock(),
+ 3) ::init_table_handle_for_HANDLER(), and
+ 4) ::transactional_table_lock(). */
+
+ prebuilt->select_lock_type =
+ prebuilt->stored_select_lock_type;
+ }
+ }
+
+ trx->detailed_error[0] = '\0';
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+ if (trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(ht, thd);
+ trx->active_trans = 1;
+ } else {
+ innobase_register_stmt(ht, thd);
+ }
+
+ return(0);
+}
+
+/******************************************************************//**
+Maps a MySQL trx isolation level code to the InnoDB isolation level code
+@return InnoDB isolation level */
+static inline
+ulint
+innobase_map_isolation_level(
+/*=========================*/
+ enum_tx_isolation iso) /*!< in: MySQL isolation level code */
+{
+ switch(iso) {
+ case ISO_REPEATABLE_READ: return(TRX_ISO_REPEATABLE_READ);
+ case ISO_READ_COMMITTED: return(TRX_ISO_READ_COMMITTED);
+ case ISO_SERIALIZABLE: return(TRX_ISO_SERIALIZABLE);
+ case ISO_READ_UNCOMMITTED: return(TRX_ISO_READ_UNCOMMITTED);
+ default: ut_a(0); return(0);
+ }
+}
+
+/******************************************************************//**
+As MySQL will execute an external lock for every new table it uses when it
+starts to process an SQL statement (an exception is when MySQL calls
+start_stmt for the handle) we can use this function to store the pointer to
+the THD in the handle. We will also use this function to communicate
+to InnoDB that a new SQL statement has started and that we must store a
+savepoint to our transaction handle, so that we are able to roll back
+the SQL statement in case of an error.
+@return 0 */
+UNIV_INTERN
+int
+ha_innobase::external_lock(
+/*=======================*/
+ THD* thd, /*!< in: handle to the user thread */
+ int lock_type) /*!< in: lock type */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("ha_innobase::external_lock");
+ DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+ update_thd(thd);
+
+ /* Statement based binlogging does not work in isolation level
+ READ UNCOMMITTED and READ COMMITTED since the necessary
+ locks cannot be taken. In this case, we print an
+ informative error message and return with an error. */
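+	/* (For example, with READ COMMITTED isolation and binlog_format =
+	STATEMENT the message below reads: "Transaction level
+	'READ-COMMITTED' in InnoDB is not safe for binlog mode
+	'STATEMENT'".) */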
+ if (lock_type == F_WRLCK)
+ {
+ ulong const binlog_format= thd_binlog_format(thd);
+ ulong const tx_isolation = thd_tx_isolation(ha_thd());
+ if (tx_isolation <= ISO_READ_COMMITTED
+ && binlog_format == BINLOG_FORMAT_STMT
+#if MYSQL_VERSION_ID > 50140
+ && thd_binlog_filter_ok(thd)
+#endif /* MYSQL_VERSION_ID > 50140 */
+ )
+ {
+ char buf[256];
+ my_snprintf(buf, sizeof(buf),
+ "Transaction level '%s' in"
+ " InnoDB is not safe for binlog mode '%s'",
+ tx_isolation_names[tx_isolation],
+ binlog_format_names[binlog_format]);
+ my_error(ER_BINLOG_LOGGING_IMPOSSIBLE, MYF(0), buf);
+ DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
+ }
+ }
+
+
+ trx = prebuilt->trx;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+ reset_template(prebuilt);
+
+ if (lock_type == F_WRLCK) {
+
+ /* If this is a SELECT, then it is in UPDATE TABLE ...
+ or SELECT ... FOR UPDATE */
+ prebuilt->select_lock_type = LOCK_X;
+ prebuilt->stored_select_lock_type = LOCK_X;
+ }
+
+ if (lock_type != F_UNLCK) {
+ /* MySQL is setting a new table lock */
+
+ trx->detailed_error[0] = '\0';
+
+ /* Set the MySQL flag to mark that there is an active
+ transaction */
+ if (trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(ht, thd);
+ trx->active_trans = 1;
+ } else if (trx->n_mysql_tables_in_use == 0) {
+ innobase_register_stmt(ht, thd);
+ }
+
+ if (trx->isolation_level == TRX_ISO_SERIALIZABLE
+ && prebuilt->select_lock_type == LOCK_NONE
+ && thd_test_options(thd,
+ OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ /* To get serializable execution, we let InnoDB
+ conceptually add 'LOCK IN SHARE MODE' to all SELECTs
+ which otherwise would have been consistent reads. An
+ exception is consistent reads in the AUTOCOMMIT=1 mode:
+ we know that they are read-only transactions, and they
+ can be serialized also if performed as consistent
+ reads. */
+
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ }
+
+ /* Starting from 4.1.9, no InnoDB table lock is taken in LOCK
+ TABLES if AUTOCOMMIT=1. It does not make much sense to acquire
+ an InnoDB table lock if it is released immediately at the end
+		of LOCK TABLES, and InnoDB's table locks in that case very
+		easily cause deadlocks.
+
+ We do not set InnoDB table locks if user has not explicitly
+ requested a table lock. Note that thd_in_lock_tables(thd)
+ can hold in some cases, e.g., at the start of a stored
+ procedure call (SQLCOM_CALL). */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+
+ if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES
+ && THDVAR(thd, table_locks)
+ && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT)
+ && thd_in_lock_tables(thd)) {
+
+ ulint error = row_lock_table_for_mysql(
+ prebuilt, NULL, 0);
+
+ if (error != DB_SUCCESS) {
+ error = convert_error_code_to_mysql(
+ (int) error, 0, thd);
+ DBUG_RETURN((int) error);
+ }
+ }
+
+ trx->mysql_n_tables_locked++;
+ }
+
+ trx->n_mysql_tables_in_use++;
+ prebuilt->mysql_has_locked = TRUE;
+
+ DBUG_RETURN(0);
+ }
+
+ /* MySQL is releasing a table lock */
+
+ trx->n_mysql_tables_in_use--;
+ prebuilt->mysql_has_locked = FALSE;
+
+ /* Release a possible FIFO ticket and search latch. Since we
+ may reserve the kernel mutex, we have to release the search
+ system latch first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ /* If the MySQL lock count drops to zero we know that the current SQL
+ statement has ended */
+
+ if (trx->n_mysql_tables_in_use == 0) {
+#ifdef EXTENDED_SLOWLOG
+ increment_thd_innodb_stats(thd,
+ (unsigned long long) ut_conv_dulint_to_longlong(trx->id),
+ trx->io_reads,
+ trx->io_read,
+ trx->io_reads_wait_timer,
+ trx->lock_que_wait_timer,
+ trx->innodb_que_wait_timer,
+ trx->distinct_page_access);
+
+ trx->io_reads = 0;
+ trx->io_read = 0;
+ trx->io_reads_wait_timer = 0;
+ trx->lock_que_wait_timer = 0;
+ trx->innodb_que_wait_timer = 0;
+ trx->distinct_page_access = 0;
+ if (trx->distinct_page_access_hash)
+ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+#endif
+
+ trx->mysql_n_tables_locked = 0;
+ prebuilt->used_in_HANDLER = FALSE;
+
+ if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+ if (trx->active_trans != 0) {
+ innobase_commit(ht, thd, TRUE);
+ }
+ } else {
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && trx->global_read_view) {
+
+ /* At low transaction isolation levels we let
+ each consistent read set its own snapshot */
+
+ read_view_close_for_mysql(trx);
+ }
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+With this function MySQL requests a transactional lock on a table when the
+user has issued the query LOCK TABLES .. WHERE ENGINE = InnoDB.
+@return error code */
+UNIV_INTERN
+int
+ha_innobase::transactional_table_lock(
+/*==================================*/
+ THD* thd, /*!< in: handle to the user thread */
+ int lock_type) /*!< in: lock type */
+{
+ trx_t* trx;
+
+ DBUG_ENTER("ha_innobase::transactional_table_lock");
+ DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+ /* We do not know if MySQL can call this function before calling
+ external_lock(). To be safe, update the thd of the current table
+ handle. */
+
+ update_thd(thd);
+
+ if (share->ib_table->is_corrupt) {
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ if (prebuilt->table->ibd_file_missing && !thd_tablespace_op(thd)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+			"InnoDB: the MySQL datadir?\n"
+			"InnoDB: See " REFMAN
+			"innodb-troubleshooting.html\n"
+			"InnoDB: for how you can resolve the problem.\n",
+ prebuilt->table->name);
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ trx = prebuilt->trx;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+ reset_template(prebuilt);
+
+ if (lock_type == F_WRLCK) {
+ prebuilt->select_lock_type = LOCK_X;
+ prebuilt->stored_select_lock_type = LOCK_X;
+ } else if (lock_type == F_RDLCK) {
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB error:\n"
+"MySQL is trying to set transactional table lock with corrupted lock type\n"
+"to table %s, lock type %d does not exist.\n",
+ prebuilt->table->name, lock_type);
+ DBUG_RETURN(HA_ERR_CRASHED);
+ }
+
+ /* MySQL is setting a new transactional table lock */
+
+ /* Set the MySQL flag to mark that there is an active transaction */
+ if (trx->active_trans == 0) {
+
+ innobase_register_trx_and_stmt(ht, thd);
+ trx->active_trans = 1;
+ }
+
+ if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) {
+ ulint error = DB_SUCCESS;
+
+ error = row_lock_table_for_mysql(prebuilt, NULL, 0);
+
+ if (error != DB_SUCCESS) {
+ error = convert_error_code_to_mysql(
+ (int) error, prebuilt->table->flags, thd);
+ DBUG_RETURN((int) error);
+ }
+
+ if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+ /* Store the current undo_no of the transaction
+ so that we know where to roll back if we have
+ to roll back the next SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
+/************************************************************************//**
+Here we export InnoDB status variables to MySQL. */
+static
+void
+innodb_export_status(void)
+/*======================*/
+{
+ if (innodb_inited) {
+ srv_export_innodb_status();
+ }
+}
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB STATUS command. Sends the output of the
+InnoDB Monitor to the client.
+static
+bool
+innodb_show_status(
+/*===============*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of the caller */
+ stat_print_fn *stat_print)
+{
+ trx_t* trx;
+ static const char truncated_msg[] = "... truncated...\n";
+ const long MAX_STATUS_SIZE = 64000;
+ ulint trx_list_start = ULINT_UNDEFINED;
+ ulint trx_list_end = ULINT_UNDEFINED;
+
+ DBUG_ENTER("innodb_show_status");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = check_trx_exists(thd);
+
+ innobase_release_stat_resources(trx);
+
+	/* We let the InnoDB Monitor output at most MAX_STATUS_SIZE
+	bytes of text. */
+
+ long flen, usable_len;
+ char* str;
+
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+ srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+ &trx_list_start, &trx_list_end);
+ flen = ftell(srv_monitor_file);
+ os_file_set_eof(srv_monitor_file);
+
+ if (flen < 0) {
+ flen = 0;
+ }
+
+ if (flen > MAX_STATUS_SIZE) {
+ usable_len = MAX_STATUS_SIZE;
+ } else {
+ usable_len = flen;
+ }
+
+ /* allocate buffer for the string, and
+ read the contents of the temporary file */
+
+ if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
+ mutex_exit(&srv_monitor_file_mutex);
+ DBUG_RETURN(TRUE);
+ }
+
+ rewind(srv_monitor_file);
+ if (flen < MAX_STATUS_SIZE) {
+ /* Display the entire output. */
+ flen = (long) fread(str, 1, flen, srv_monitor_file);
+ } else if (trx_list_end < (ulint) flen
+ && trx_list_start < trx_list_end
+ && trx_list_start + (flen - trx_list_end)
+ < MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+ /* Omit the beginning of the list of active transactions. */
+ long len = (long) fread(str, 1, trx_list_start, srv_monitor_file);
+ memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
+ len += sizeof truncated_msg - 1;
+ usable_len = (MAX_STATUS_SIZE - 1) - len;
+ fseek(srv_monitor_file, flen - usable_len, SEEK_SET);
+ len += (long) fread(str + len, 1, usable_len, srv_monitor_file);
+ flen = len;
+ } else {
+ /* Omit the end of the output. */
+ flen = (long) fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+ }
+
+ mutex_exit(&srv_monitor_file_mutex);
+
+ bool result = FALSE;
+
+ if (stat_print(thd, innobase_hton_name, (uint) strlen(innobase_hton_name),
+ STRING_WITH_LEN(""), str, flen)) {
+ result= TRUE;
+ }
+ my_free(str, MYF(0));
+
+	DBUG_RETURN(result);
+}
+
+/************************************************************************//**
+Implements the SHOW ENGINE INNODB MUTEX command.
+@return TRUE on failure, FALSE on success. */
+static
+bool
+innodb_mutex_show_status(
+/*=====================*/
+ handlerton* hton, /*!< in: the innodb handlerton */
+ THD* thd, /*!< in: the MySQL query thread of the
+ caller */
+ stat_print_fn* stat_print) /*!< in: function for printing
+ statistics */
+{
+ char buf1[IO_SIZE], buf2[IO_SIZE];
+ mutex_t* mutex;
+ rw_lock_t* lock;
+ ulint block_mutex_oswait_count = 0;
+ ulint block_lock_oswait_count = 0;
+ mutex_t* block_mutex = NULL;
+ rw_lock_t* block_lock = NULL;
+#ifdef UNIV_DEBUG
+ ulint rw_lock_count= 0;
+ ulint rw_lock_count_spin_loop= 0;
+ ulint rw_lock_count_spin_rounds= 0;
+ ulint rw_lock_count_os_wait= 0;
+ ulint rw_lock_count_os_yield= 0;
+ ulonglong rw_lock_wait_time= 0;
+#endif /* UNIV_DEBUG */
+ uint hton_name_len= (uint) strlen(innobase_hton_name), buf1len, buf2len;
+ DBUG_ENTER("innodb_mutex_show_status");
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ mutex_enter(&mutex_list_mutex);
+
+ for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL;
+ mutex = UT_LIST_GET_NEXT(list, mutex)) {
+ if (mutex->count_os_wait == 0) {
+ continue;
+ }
+
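+		/* Buffer pool block mutexes are far too numerous to report
+		one row each; their os_wait counts are summed and printed as
+		a single "combined" row after this loop. */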
+ if (buf_pool_is_block_mutex(mutex)) {
+ block_mutex = mutex;
+ block_mutex_oswait_count += mutex->count_os_wait;
+ continue;
+ }
+#ifdef UNIV_DEBUG
+ if (mutex->mutex_type != 1) {
+ if (mutex->count_using > 0) {
+ buf1len= my_snprintf(buf1, sizeof(buf1),
+ "%s:%s",
+ mutex->cmutex_name, mutex->cfile_name);
+ buf2len= my_snprintf(buf2, sizeof(buf2),
+ "count=%lu, spin_waits=%lu,"
+ " spin_rounds=%lu, "
+ "os_waits=%lu, os_yields=%lu,"
+ " os_wait_times=%lu",
+ mutex->count_using,
+ mutex->count_spin_loop,
+ mutex->count_spin_rounds,
+ mutex->count_os_wait,
+ mutex->count_os_yield,
+ (ulong) (mutex->lspent_time/1000));
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+ } else {
+ rw_lock_count += mutex->count_using;
+ rw_lock_count_spin_loop += mutex->count_spin_loop;
+ rw_lock_count_spin_rounds += mutex->count_spin_rounds;
+ rw_lock_count_os_wait += mutex->count_os_wait;
+ rw_lock_count_os_yield += mutex->count_os_yield;
+ rw_lock_wait_time += mutex->lspent_time;
+ }
+#else /* UNIV_DEBUG */
+ buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s",
+ mutex->cmutex_name);
+ buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu",
+ (ulong) mutex->count_os_wait);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ if (block_mutex) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s",
+ block_mutex->cmutex_name);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_mutex_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&mutex_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ mutex_exit(&mutex_list_mutex);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
+ lock = UT_LIST_GET_NEXT(list, lock)) {
+ if (lock->count_os_wait == 0) {
+ continue;
+ }
+
+ if (buf_pool_is_block_lock(lock)) {
+ block_lock = lock;
+ block_lock_oswait_count += lock->count_os_wait;
+ continue;
+ }
+
+ buf1len = my_snprintf(buf1, sizeof buf1, "%s",
+ lock->lock_name);
+ buf2len = my_snprintf(buf2, sizeof buf2, "os_waits=%lu",
+ (ulong) lock->count_os_wait);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ if (block_lock) {
+ buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+ "combined %s",
+ block_lock->lock_name);
+ buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+ "os_waits=%lu",
+ (ulong) block_lock_oswait_count);
+
+ if (stat_print(thd, innobase_hton_name,
+ hton_name_len, buf1, buf1len,
+ buf2, buf2len)) {
+ mutex_exit(&rw_lock_list_mutex);
+ DBUG_RETURN(1);
+ }
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+
+#ifdef UNIV_DEBUG
+ buf2len = my_snprintf(buf2, sizeof buf2,
+ "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
+ "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
+ (ulong) rw_lock_count,
+ (ulong) rw_lock_count_spin_loop,
+ (ulong) rw_lock_count_spin_rounds,
+ (ulong) rw_lock_count_os_wait,
+ (ulong) rw_lock_count_os_yield,
+ (ulong) (rw_lock_wait_time / 1000));
+
+ if (stat_print(thd, innobase_hton_name, hton_name_len,
+ STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) {
+ DBUG_RETURN(1);
+ }
+#endif /* UNIV_DEBUG */
+
+ DBUG_RETURN(FALSE);
+}
+
+static
+bool innobase_show_status(handlerton *hton, THD* thd,
+ stat_print_fn* stat_print,
+ enum ha_stat_type stat_type)
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ switch (stat_type) {
+ case HA_ENGINE_STATUS:
+ return innodb_show_status(hton, thd, stat_print);
+ case HA_ENGINE_MUTEX:
+ return innodb_mutex_show_status(hton, thd, stat_print);
+ default:
+ return(FALSE);
+ }
+}
+
+/************************************************************************//**
+ Handling the shared INNOBASE_SHARE structure that is needed to provide table
+ locking.
+****************************************************************************/
+
+static INNOBASE_SHARE* get_share(const char* table_name)
+{
+ INNOBASE_SHARE *share;
+ pthread_mutex_lock(&innobase_share_mutex);
+
+ ulint fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(table_name_hash, innobase_open_tables, fold,
+ INNOBASE_SHARE*, share,
+ ut_ad(share->use_count > 0),
+ !strcmp(share->table_name, table_name));
+
+ if (!share) {
+
+ uint length = (uint) strlen(table_name);
+
+ /* TODO: invoke HASH_MIGRATE if innobase_open_tables
+ grows too big */
+
+ share = (INNOBASE_SHARE *) my_malloc(sizeof(*share)+length+1,
+ MYF(MY_FAE | MY_ZEROFILL));
+
+ share->table_name = (char*) memcpy(share + 1,
+ table_name, length + 1);
+
+ HASH_INSERT(INNOBASE_SHARE, table_name_hash,
+ innobase_open_tables, fold, share);
+
+ thr_lock_init(&share->lock);
+
+ /* Index translation table initialization */
+ share->idx_trans_tbl.index_mapping = NULL;
+ share->idx_trans_tbl.index_count = 0;
+ share->idx_trans_tbl.array_size = 0;
+ }
+
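+	/* Reference-count the share; free_share() decrements the count and
+	destroys the share when it reaches zero. */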
+ share->use_count++;
+ pthread_mutex_unlock(&innobase_share_mutex);
+
+ return(share);
+}
+
+static void free_share(INNOBASE_SHARE* share)
+{
+ pthread_mutex_lock(&innobase_share_mutex);
+
+#ifdef UNIV_DEBUG
+ INNOBASE_SHARE* share2;
+ ulint fold = ut_fold_string(share->table_name);
+
+ HASH_SEARCH(table_name_hash, innobase_open_tables, fold,
+ INNOBASE_SHARE*, share2,
+ ut_ad(share->use_count > 0),
+ !strcmp(share->table_name, share2->table_name));
+
+ ut_a(share2 == share);
+#endif /* UNIV_DEBUG */
+
+ if (!--share->use_count) {
+ ulint fold = ut_fold_string(share->table_name);
+
+ HASH_DELETE(INNOBASE_SHARE, table_name_hash,
+ innobase_open_tables, fold, share);
+ thr_lock_delete(&share->lock);
+
+ /* Free any memory from index translation table */
+ my_free(share->idx_trans_tbl.index_mapping,
+ MYF(MY_ALLOW_ZERO_PTR));
+
+ my_free(share, MYF(0));
+
+ /* TODO: invoke HASH_MIGRATE if innobase_open_tables
+ shrinks too much */
+ }
+
+ pthread_mutex_unlock(&innobase_share_mutex);
+}
+
+/*****************************************************************//**
+Converts a MySQL table lock stored in the 'lock' field of the handle to
+a proper type before storing pointer to the lock into an array of pointers.
+MySQL also calls this if it wants to reset some table locks to a not-locked
+state during the processing of an SQL query. An example is that during a
+SELECT the read lock is released early on the 'const' tables where we only
+fetch one row. MySQL does not call this when it releases all locks at the
+end of an SQL statement.
+@return pointer to the next element in the 'to' array */
+UNIV_INTERN
+THR_LOCK_DATA**
+ha_innobase::store_lock(
+/*====================*/
+ THD* thd, /*!< in: user thread handle */
+ THR_LOCK_DATA** to, /*!< in: pointer to an array
+ of pointers to lock structs;
+ pointer to the 'lock' field
+ of current handle is stored
+ next to this array */
+ enum thr_lock_type lock_type) /*!< in: lock type to store in
+ 'lock'; this may also be
+ TL_IGNORE */
+{
+ trx_t* trx;
+
+ /* Note that trx in this function is NOT necessarily prebuilt->trx
+ because we call update_thd() later, in ::external_lock()! Failure to
+ understand this caused a serious memory corruption bug in 5.1.11. */
+
+ trx = check_trx_exists(thd);
+
+ /* NOTE: MySQL can call this function with lock 'type' TL_IGNORE!
+ Be careful to ignore TL_IGNORE if we are going to do something with
+ only 'real' locks! */
+
+ /* If no MySQL table is in use, we need to set the isolation level
+ of the transaction. */
+
+ if (lock_type != TL_IGNORE
+ && trx->n_mysql_tables_in_use == 0) {
+ trx->isolation_level = innobase_map_isolation_level(
+ (enum_tx_isolation) thd_tx_isolation(thd));
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && trx->global_read_view) {
+
+ /* At low transaction isolation levels we let
+ each consistent read set its own snapshot */
+
+ read_view_close_for_mysql(trx);
+ }
+ }
+
+ DBUG_ASSERT(EQ_CURRENT_THD(thd));
+ const bool in_lock_tables = thd_in_lock_tables(thd);
+ const uint sql_command = thd_sql_command(thd);
+
+ if (sql_command == SQLCOM_DROP_TABLE) {
+
+ /* MySQL calls this function in DROP TABLE though this table
+ handle may belong to another thd that is running a query. Let
+ us in that case skip any changes to the prebuilt struct. */
+
+ } else if ((lock_type == TL_READ && in_lock_tables)
+ || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables)
+ || lock_type == TL_READ_WITH_SHARED_LOCKS
+ || lock_type == TL_READ_NO_INSERT
+ || (lock_type != TL_IGNORE
+ && sql_command != SQLCOM_SELECT)) {
+
+ /* The OR cases above are in this order:
+ 1) MySQL is doing LOCK TABLES ... READ LOCAL, or we
+ are processing a stored procedure or function, or
+ 2) (we do not know when TL_READ_HIGH_PRIORITY is used), or
+ 3) this is a SELECT ... IN SHARE MODE, or
+ 4) we are doing a complex SQL statement like
+ INSERT INTO ... SELECT ... and the logical logging (MySQL
+ binlog) requires the use of a locking read, or
+ MySQL is doing LOCK TABLES ... READ.
+ 5) we let InnoDB do locking reads for all SQL statements that
+ are not simple SELECTs; note that select_lock_type in this
+ case may get strengthened in ::external_lock() to LOCK_X.
+ Note that we MUST use a locking read in all data modifying
+ SQL statements, because otherwise the execution would not be
+ serializable, and also the results from the update could be
+ unexpected if an obsolete consistent read view would be
+ used. */
+
+ ulint isolation_level;
+
+ isolation_level = trx->isolation_level;
+
+ if ((srv_locks_unsafe_for_binlog
+ || isolation_level <= TRX_ISO_READ_COMMITTED)
+ && isolation_level != TRX_ISO_SERIALIZABLE
+ && (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT)
+ && (sql_command == SQLCOM_INSERT_SELECT
+ || sql_command == SQLCOM_REPLACE_SELECT
+ || sql_command == SQLCOM_UPDATE
+ || sql_command == SQLCOM_CREATE_TABLE
+ || sql_command == SQLCOM_SET_OPTION)) {
+
+ /* If we either have innobase_locks_unsafe_for_binlog
+ option set or this session is using READ COMMITTED
+ isolation level and isolation level of the transaction
+ is not set to serializable and MySQL is doing
+ INSERT INTO...SELECT or REPLACE INTO...SELECT
+ or UPDATE ... = (SELECT ...) or CREATE ...
+ SELECT... or SET ... = (SELECT ...) without
+ FOR UPDATE or IN SHARE MODE in select,
+ then we use consistent read for select. */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+ } else if (sql_command == SQLCOM_CHECKSUM) {
+ /* Use consistent read for checksum table */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+ } else {
+ prebuilt->select_lock_type = LOCK_S;
+ prebuilt->stored_select_lock_type = LOCK_S;
+ }
+
+ } else if (lock_type != TL_IGNORE) {
+
+ /* We set possible LOCK_X value in external_lock, not yet
+ here even if this would be SELECT ... FOR UPDATE */
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = LOCK_NONE;
+ }
+
+ if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
+
+ /* Starting from 5.0.7, we weaken also the table locks
+ set at the start of a MySQL stored procedure call, just like
+ we weaken the locks set at the start of an SQL statement.
+ MySQL does set in_lock_tables TRUE there, but in reality
+ we do not need table locks to make the execution of a
+ single transaction stored procedure call deterministic
+ (if it does not use a consistent read). */
+
+ if (lock_type == TL_READ
+ && sql_command == SQLCOM_LOCK_TABLES) {
+ /* We come here if MySQL is processing LOCK TABLES
+ ... READ LOCAL. MyISAM under that table lock type
+ reads the table as it was at the time the lock was
+ granted (new inserts are allowed, but not seen by the
+ reader). To get a similar effect on an InnoDB table,
+ we must use LOCK TABLES ... READ. We convert the lock
+ type here, so that for InnoDB, READ LOCAL is
+ equivalent to READ. This will change the InnoDB
+ behavior in mysqldump, so that dumps of InnoDB tables
+ are consistent with dumps of MyISAM tables. */
+
+ lock_type = TL_READ_NO_INSERT;
+ }
+
+ /* If we are not doing a LOCK TABLE, DISCARD/IMPORT
+ TABLESPACE or TRUNCATE TABLE then allow multiple
+ writers. Note that ALTER TABLE uses a TL_WRITE_ALLOW_READ
+ < TL_WRITE_CONCURRENT_INSERT.
+
+ We especially allow multiple writers if MySQL is at the
+ start of a stored procedure call (SQLCOM_CALL) or a
+ stored function call (MySQL does have in_lock_tables
+ TRUE there). */
+
+ if ((lock_type >= TL_WRITE_CONCURRENT_INSERT
+ && lock_type <= TL_WRITE)
+ && !(in_lock_tables
+ && sql_command == SQLCOM_LOCK_TABLES)
+ && !thd_tablespace_op(thd)
+ && sql_command != SQLCOM_TRUNCATE
+ && sql_command != SQLCOM_OPTIMIZE
+ && sql_command != SQLCOM_CREATE_TABLE) {
+
+ lock_type = TL_WRITE_ALLOW_WRITE;
+ }
+
+ /* In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
+ MySQL would use the lock TL_READ_NO_INSERT on t2, and that
+ would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
+ to t2. Convert the lock to a normal read lock to allow
+ concurrent inserts to t2.
+
+ We especially allow concurrent inserts if MySQL is at the
+ start of a stored procedure call (SQLCOM_CALL)
+ (MySQL does have thd_in_lock_tables() TRUE there). */
+
+ if (lock_type == TL_READ_NO_INSERT
+ && sql_command != SQLCOM_LOCK_TABLES) {
+
+ lock_type = TL_READ;
+ }
+
+ lock.type = lock_type;
+ }
+
+ *to++= &lock;
+
+ return(to);
+}
+
+/*********************************************************************//**
+Read the next autoinc value. Acquire the relevant locks before reading
+the AUTOINC value. If SUCCESS then the table AUTOINC mutex will be locked
+on return and all relevant locks acquired.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_get_autoinc(
+/*==============================*/
+ ulonglong* value) /*!< out: autoinc value */
+{
+ *value = 0;
+
+ prebuilt->autoinc_error = innobase_lock_autoinc();
+
+ if (prebuilt->autoinc_error == DB_SUCCESS) {
+
+ /* Determine the first value of the interval */
+ *value = dict_table_autoinc_read(prebuilt->table);
+
+ /* It should have been initialized during open. */
+ if (*value == 0) {
+ prebuilt->autoinc_error = DB_UNSUPPORTED;
+ dict_table_autoinc_unlock(prebuilt->table);
+ }
+ }
+
+ return(prebuilt->autoinc_error);
+}
+
+/*******************************************************************//**
+This function reads the global auto-inc counter. It doesn't use the
+AUTOINC lock even if the lock mode is set to TRADITIONAL.
+@return the autoinc value */
+UNIV_INTERN
+ulonglong
+ha_innobase::innobase_peek_autoinc(void)
+/*====================================*/
+{
+ ulonglong auto_inc;
+ dict_table_t* innodb_table;
+
+ ut_a(prebuilt != NULL);
+ ut_a(prebuilt->table != NULL);
+
+ innodb_table = prebuilt->table;
+
+ dict_table_autoinc_lock(innodb_table);
+
+ auto_inc = dict_table_autoinc_read(innodb_table);
+
+ ut_a(auto_inc > 0);
+
+ dict_table_autoinc_unlock(innodb_table);
+
+ return(auto_inc);
+}
+
+/*********************************************************************//**
+This function initializes the auto-inc counter if it has not been
+initialized yet. This function does not change the value of the auto-inc
+counter if it already has been initialized. Returns the first value of the
+reserved interval in *first_value and the number of reserved values in
+*nb_reserved_values; offset, increment and nb_desired_values are used to
+compute the reserved interval.
+*first_value is set to -1 on error (deadlock or lock wait timeout). */
+UNIV_INTERN
+void
+ha_innobase::get_auto_increment(
+/*============================*/
+ ulonglong offset, /*!< in: table autoinc offset */
+ ulonglong increment, /*!< in: table autoinc increment */
+ ulonglong nb_desired_values, /*!< in: number of values reqd */
+ ulonglong *first_value, /*!< out: the autoinc value */
+ ulonglong *nb_reserved_values) /*!< out: count of reserved values */
+{
+ trx_t* trx;
+ ulint error;
+ ulonglong autoinc = 0;
+
+ /* Prepare prebuilt->trx in the table handle */
+ update_thd(ha_thd());
+
+ error = innobase_get_autoinc(&autoinc);
+
+ if (error != DB_SUCCESS) {
+ *first_value = (~(ulonglong) 0);
+ return;
+ }
+
+ /* This is a hack, since nb_desired_values seems to be accurate only
+ for the first call to get_auto_increment() for multi-row INSERT and
+	meaningless for other statements, e.g. LOAD. Subsequent calls to
+	this method for the same statement result in different values which
+ don't make sense. Therefore we store the value the first time we are
+ called and count down from that as rows are written (see write_row()).
+ */
+
+ trx = prebuilt->trx;
+
+ /* Note: We can't rely on *first_value since some MySQL engines,
+ in particular the partition engine, don't initialize it to 0 when
+ invoking this method. So we are not sure if it's guaranteed to
+ be 0 or not. */
+
+ /* We need the upper limit of the col type to check for
+ whether we update the table autoinc counter or not. */
+ ulonglong col_max_value = innobase_get_int_col_max_value(
+ table->next_number_field);
+
+ /* Called for the first time ? */
+ if (trx->n_autoinc_rows == 0) {
+
+ trx->n_autoinc_rows = (ulint) nb_desired_values;
+
+ /* It's possible for nb_desired_values to be 0:
+ e.g., INSERT INTO T1(C) SELECT C FROM T2; */
+ if (nb_desired_values == 0) {
+
+ trx->n_autoinc_rows = 1;
+ }
+
+ set_if_bigger(*first_value, autoinc);
+	/* Not in the middle of a multi-row INSERT. */
+ } else if (prebuilt->autoinc_last_value == 0) {
+ set_if_bigger(*first_value, autoinc);
+	/* Check for negative values. */
+ } else if (*first_value > col_max_value && trx->n_autoinc_rows > 0) {
+ /* Set to next logical value. */
+ ut_a(autoinc > trx->n_autoinc_rows);
+ *first_value = (autoinc - trx->n_autoinc_rows) - 1;
+ }
+
+ *nb_reserved_values = trx->n_autoinc_rows;
+
+ /* With old style AUTOINC locking we only update the table's
+ AUTOINC counter after attempting to insert the row. */
+ if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) {
+ ulonglong need;
+ ulonglong current;
+ ulonglong next_value;
+
+ current = *first_value > col_max_value ? autoinc : *first_value;
+ need = *nb_reserved_values * increment;
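+		/* (E.g. reserving 3 values with increment 5 gives need = 15;
+		innobase_next_autoinc() below advances the counter by that
+		amount, subject to offset alignment and clamping to
+		col_max_value.) */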
+
+ /* Compute the last value in the interval */
+ next_value = innobase_next_autoinc(
+ current, need, offset, col_max_value);
+
+ prebuilt->autoinc_last_value = next_value;
+
+ if (prebuilt->autoinc_last_value < *first_value) {
+ *first_value = (~(ulonglong) 0);
+ } else {
+ /* Update the table autoinc variable */
+ dict_table_autoinc_update_if_greater(
+ prebuilt->table, prebuilt->autoinc_last_value);
+ }
+ } else {
+ /* This will force write_row() into attempting an update
+ of the table's AUTOINC counter. */
+ prebuilt->autoinc_last_value = 0;
+ }
+
+ /* The increment to be used to increase the AUTOINC value, we use
+ this in write_row() and update_row() to increase the autoinc counter
+ for columns that are filled by the user. We need the offset and
+ the increment. */
+ prebuilt->autoinc_offset = offset;
+ prebuilt->autoinc_increment = increment;
+
+ dict_table_autoinc_unlock(prebuilt->table);
+}
+
+/*******************************************************************//**
+Reset the auto-increment counter to the given value, i.e. the next row
+inserted will get the given value. This is called e.g. after TRUNCATE
+is emulated by doing a 'DELETE FROM t'. HA_ERR_WRONG_COMMAND is
+returned by storage engines that don't support this operation.
+@return 0 or error code */
+UNIV_INTERN
+int
+ha_innobase::reset_auto_increment(
+/*==============================*/
+ ulonglong value) /*!< in: new value for table autoinc */
+{
+ DBUG_ENTER("ha_innobase::reset_auto_increment");
+
+ int error;
+
+ update_thd(ha_thd());
+
+ error = row_lock_table_autoinc_for_mysql(prebuilt);
+
+ if (error != DB_SUCCESS) {
+ error = convert_error_code_to_mysql(error,
+ prebuilt->table->flags,
+ user_thd);
+
+ DBUG_RETURN(error);
+ }
+
+ /* The next value can never be 0. */
+ if (value == 0) {
+ value = 1;
+ }
+
+ innobase_reset_autoinc(value);
+
+ DBUG_RETURN(0);
+}
+
+/* See comment in handler.cc */
+UNIV_INTERN
+bool
+ha_innobase::get_error_message(int error, String *buf)
+{
+ trx_t* trx = check_trx_exists(ha_thd());
+
+ buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error),
+ system_charset_info);
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Compares two 'refs'. A 'ref' is the (internal) primary key value of the row.
+If there is no explicitly declared non-null unique key or a primary key, then
+InnoDB internally uses the row id as the primary key.
+@return < 0 if ref1 < ref2, 0 if equal, else > 0 */
+UNIV_INTERN
+int
+ha_innobase::cmp_ref(
+/*=================*/
+ const uchar* ref1, /*!< in: an (internal) primary key value in the
+ MySQL key value format */
+ const uchar* ref2) /*!< in: an (internal) primary key value in the
+ MySQL key value format */
+{
+ enum_field_types mysql_type;
+ Field* field;
+ KEY_PART_INFO* key_part;
+ KEY_PART_INFO* key_part_end;
+ uint len1;
+ uint len2;
+ int result;
+
+ if (prebuilt->clust_index_was_generated) {
+ /* The 'ref' is an InnoDB row id */
+
+ return(memcmp(ref1, ref2, DATA_ROW_ID_LEN));
+ }
+
+ /* Do a type-aware comparison of primary key fields. PK fields
+ are always NOT NULL, so no checks for NULL are performed. */
+
+ key_part = table->key_info[table->s->primary_key].key_part;
+
+ key_part_end = key_part
+ + table->key_info[table->s->primary_key].key_parts;
+
+ for (; key_part != key_part_end; ++key_part) {
+ field = key_part->field;
+ mysql_type = field->type();
+
+ if (mysql_type == MYSQL_TYPE_TINY_BLOB
+ || mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+ || mysql_type == MYSQL_TYPE_BLOB
+ || mysql_type == MYSQL_TYPE_LONG_BLOB) {
+
+ /* In the MySQL key value format, a column prefix of
+ a BLOB is preceded by a 2-byte length field */
+
+ len1 = innobase_read_from_2_little_endian(ref1);
+ len2 = innobase_read_from_2_little_endian(ref2);
+
+ ref1 += 2;
+ ref2 += 2;
+ result = ((Field_blob*)field)->cmp( ref1, len1,
+ ref2, len2);
+ } else {
+ result = field->key_cmp(ref1, ref2);
+ }
+
+ if (result) {
+
+ return(result);
+ }
+
+ ref1 += key_part->store_length;
+ ref2 += key_part->store_length;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Ask InnoDB if a query to a table can be cached.
+@return TRUE if query caching of the table is permitted */
+UNIV_INTERN
+my_bool
+ha_innobase::register_query_cache_table(
+/*====================================*/
+ THD* thd, /*!< in: user thread handle */
+ char* table_key, /*!< in: concatenation of database name,
+ the null character NUL,
+ and the table name */
+ uint key_length, /*!< in: length of the full name, i.e.
+ len(dbname) + len(tablename) + 1 */
+ qc_engine_callback*
+ call_back, /*!< out: pointer to function for
+ checking if query caching
+ is permitted */
+ ulonglong *engine_data) /*!< in/out: data to call_back */
+{
+ *call_back = innobase_query_caching_of_table_permitted;
+ *engine_data = 0;
+ return(innobase_query_caching_of_table_permitted(thd, table_key,
+ key_length,
+ engine_data));
+}
+
+UNIV_INTERN
+char*
+ha_innobase::get_mysql_bin_log_name()
+{
+ return(trx_sys_mysql_bin_log_name);
+}
+
+UNIV_INTERN
+ulonglong
+ha_innobase::get_mysql_bin_log_pos()
+{
+ /* trx... is ib_int64_t, which is a typedef for a 64-bit integer
+ (__int64 or longlong) so it's ok to cast it to ulonglong. */
+
+ return(trx_sys_mysql_bin_log_pos);
+}
+
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+extern "C" UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str) /*!< in: character string */
+{
+ ulint char_length; /*!< character length in bytes */
+ ulint n_chars; /*!< number of characters in prefix */
+ CHARSET_INFO* charset; /*!< charset used in the field */
+
+ charset = get_charset((uint) charset_id, MYF(MY_WME));
+
+ ut_ad(charset);
+ ut_ad(charset->mbmaxlen);
+
+ /* Calculate how many characters at most the prefix index contains */
+
+ n_chars = prefix_len / charset->mbmaxlen;
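+	/* (E.g. a 30-byte prefix on a UTF-8 column with mbmaxlen = 3
+	gives n_chars = 10.) */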
+
+ /* If the charset is multi-byte, then we must find the length of the
+ first at most n chars in the string. If the string contains less
+ characters than n, then we return the length to the end of the last
+ character. */
+
+ if (charset->mbmaxlen > 1) {
+ /* my_charpos() returns the byte length of the first n_chars
+ characters, or a value bigger than the length of str, if
+ there were not enough full characters in str.
+
+ Why does the code below work:
+ Suppose that we are looking for n UTF-8 characters.
+
+ 1) If the string is long enough, then the prefix contains at
+ least n complete UTF-8 characters + maybe some extra
+ characters + an incomplete UTF-8 character. No problem in
+ this case. The function returns the pointer to the
+ end of the nth character.
+
+ 2) If the string is not long enough, then the string contains
+ the complete value of a column, that is, only complete UTF-8
+ characters, and we can store in the column prefix index the
+ whole string. */
+
+ char_length = my_charpos(charset, str,
+ str + data_len, (int) n_chars);
+ if (char_length > data_len) {
+ char_length = data_len;
+ }
+ } else {
+ if (data_len < prefix_len) {
+ char_length = data_len;
+ } else {
+ char_length = prefix_len;
+ }
+ }
+
+ return(char_length);
+}
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return 0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ THD* thd, /*!< in: handle to the MySQL thread of
+ the user whose XA transaction should
+ be prepared */
+ bool all) /*!< in: TRUE - commit transaction
+ FALSE - the current SQL statement
+ ended */
+{
+ int error = 0;
+ trx_t* trx = check_trx_exists(thd);
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ /* we use support_xa value as it was seen at transaction start
+ time, not the current session variable value. Any possible changes
+ to the session variable take effect only in the next transaction */
+ if (!trx->support_xa) {
+
+ return(0);
+ }
+
+ thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
+
+ /* Release a possible FIFO ticket and search latch. Since we will
+ reserve the kernel mutex, we have to release the search system latch
+ first to obey the latching order. */
+
+ innobase_release_stat_resources(trx);
+
+ if (trx->active_trans == 0 && trx->conc_state != TRX_NOT_STARTED) {
+
+ sql_print_error("trx->active_trans == 0, but trx->conc_state != "
+ "TRX_NOT_STARTED");
+ }
+
+ if (all
+ || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+ /* We were instructed to prepare the whole transaction, or
+ this is an SQL statement end and autocommit is on */
+
+ ut_ad(trx->active_trans);
+
+ error = (int) trx_prepare_for_mysql(trx);
+ } else {
+ /* We just mark the SQL statement ended and do not do a
+ transaction prepare */
+
+ /* If we had reserved the auto-inc lock for some
+ table in this SQL statement we release it now */
+
+ row_unlock_table_autoinc_for_mysql(trx);
+
+ /* Store the current undo_no of the transaction so that we
+ know where to roll back if we have to roll back the next
+ SQL statement */
+
+ trx_mark_sql_stat_end(trx);
+ }
+
+ /* Tell the InnoDB server that there might be work for utility
+ threads: */
+
+ srv_active_wake_master_thread();
+
+ if (thd_sql_command(thd) != SQLCOM_XA_PREPARE &&
+ (all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+ {
+ if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) {
+ /* choose group commit rather than binlog order */
+ return(error);
+ }
+
+ /* For ibbackup to work the order of transactions in binlog
+ and InnoDB must be the same. Consider the situation
+
+ thread1> prepare; write to binlog; ...
+ <context switch>
+ thread2> prepare; write to binlog; commit
+ thread1> ... commit
+
+ To ensure this will not happen we're taking the mutex on
+ prepare, and releasing it on commit.
+
+ Note: only do it for normal commits, done via ha_commit_trans.
+ If 2pc protocol is executed by external transaction
+ coordinator, it will be just a regular MySQL client
+ executing XA PREPARE and XA COMMIT commands.
+ In this case we cannot know how many minutes or hours
+ will be between XA PREPARE and XA COMMIT, and we don't want
+ to block for undefined period of time. */
+ pthread_mutex_lock(&prepare_commit_mutex);
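+		/* Setting active_trans == 2 tells the commit path that
+		prepare_commit_mutex is held and must be released there. */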
+ trx->active_trans = 2;
+ }
+
+ return(error);
+}
+
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid_list,/*!< in/out: prepared transactions */
+ uint len) /*!< in: number of slots in xid_list */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ if (len == 0 || xid_list == NULL) {
+
+ return(0);
+ }
+
+ return(trx_recover_for_mysql(xid_list, len));
+}
+
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state.
+@return XA_OK or XAER_NOTA */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+ handlerton *hton,
+ XID* xid) /*!< in: X/Open XA transaction identification */
+{
+ trx_t* trx;
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = trx_get_trx_by_xid(xid);
+
+ if (trx) {
+ innobase_commit_low(trx);
+
+ return(XA_OK);
+ } else {
+ return(XAER_NOTA);
+ }
+}
+
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state.
+@return 0, XAER_NOTA or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+ handlerton* hton, /*!< in: InnoDB handlerton */
+ XID* xid) /*!< in: X/Open XA transaction
+ identification */
+{
+ trx_t* trx;
+
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ trx = trx_get_trx_by_xid(xid);
+
+ if (trx) {
+ return(innobase_rollback_trx(trx));
+ } else {
+ return(XAER_NOTA);
+ }
+}
+
+/*******************************************************************//**
+Create a consistent view for a cursor based on the current transaction,
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+ handlerton *hton, /*!< in: innobase hton */
+ THD* thd) /*!< in: user thread handle */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ return(read_cursor_view_create_for_mysql(check_trx_exists(thd)));
+}
+
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore
+global read view to a transaction read view. Transaction is created if the
+corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+ handlerton *hton,
+ THD* thd, /*!< in: user thread handle */
+ void* curview)/*!< in: Consistent read view to be closed */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ read_cursor_view_close_for_mysql(check_trx_exists(thd),
+ (cursor_view_t*) curview);
+}
+
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL, the global read view of the transaction is
+restored as the transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+ handlerton *hton,
+ THD* thd, /*!< in: user thread handle */
+ void* curview)/*!< in: Consistent cursor view to be set */
+{
+ DBUG_ASSERT(hton == innodb_hton_ptr);
+
+ read_cursor_set_for_mysql(check_trx_exists(thd),
+ (cursor_view_t*) curview);
+}
+
+/*******************************************************************//**
+If col_name is not NULL, check whether the named column is being
+renamed in the table. If col_name is NULL, check whether any of the
+columns in the table is being renamed.
+@return true if the column is being renamed */
+static
+bool
+check_column_being_renamed(
+/*=======================*/
+ const TABLE* table, /*!< in: MySQL table */
+ const char* col_name) /*!< in: name of the column */
+{
+ uint k;
+ Field* field;
+
+ for (k = 0; k < table->s->fields; k++) {
+ field = table->field[k];
+
+ if (field->flags & FIELD_IS_RENAMED) {
+
+ /* If col_name is not provided, return
+ if the field is marked as being renamed. */
+ if (!col_name) {
+ return(true);
+ }
+
+ /* If col_name is provided, return only
+ if names match */
+ if (innobase_strcasecmp(field->field_name,
+ col_name) == 0) {
+ return(true);
+ }
+ }
+ }
+
+ return(false);
+}
+
+/*******************************************************************//**
+Check whether any of the given columns is being renamed in the table.
+@return true if any of col_names is being renamed in table */
+static
+bool
+column_is_being_renamed(
+/*====================*/
+ TABLE* table, /*!< in: MySQL table */
+ uint n_cols, /*!< in: number of columns */
+ const char** col_names) /*!< in: names of the columns */
+{
+ uint j;
+
+ for (j = 0; j < n_cols; j++) {
+ if (check_column_being_renamed(table, col_names[j])) {
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
+/***********************************************************************
+Check whether a column in table "table" that is being renamed is part
+of a foreign key constraint: either a constraint defined on another
+table referencing this table, or a constraint defined on this table
+referencing another table. */
+static
+bool
+foreign_key_column_is_being_renamed(
+/*================================*/
+ /* out: true if a column that
+ participates in a foreign key definition
+ is being renamed */
+ row_prebuilt_t* prebuilt, /* in: InnoDB prebuilt struct */
+ TABLE* table) /* in: MySQL table */
+{
+ dict_foreign_t* foreign;
+
+ /* check whether there are foreign keys at all */
+ if (UT_LIST_GET_LEN(prebuilt->table->foreign_list) == 0
+ && UT_LIST_GET_LEN(prebuilt->table->referenced_list) == 0) {
+ /* no foreign keys involved with prebuilt->table */
+
+ return(false);
+ }
+
+ row_mysql_lock_data_dictionary(prebuilt->trx);
+
+ /* Check whether any column in the foreign key constraints which refer
+ to this table is being renamed. */
+ for (foreign = UT_LIST_GET_FIRST(prebuilt->table->referenced_list);
+ foreign != NULL;
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign)) {
+
+ if (column_is_being_renamed(table, foreign->n_fields,
+ foreign->referenced_col_names)) {
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ return(true);
+ }
+ }
+
+ /* Check whether any column in the foreign key constraints in the
+ table is being renamed. */
+ for (foreign = UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
+ foreign != NULL;
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign)) {
+
+ if (column_is_being_renamed(table, foreign->n_fields,
+ foreign->foreign_col_names)) {
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+ return(true);
+ }
+ }
+
+ row_mysql_unlock_data_dictionary(prebuilt->trx);
+
+ return(false);
+}
+
+UNIV_INTERN
+bool
+ha_innobase::check_if_incompatible_data(
+ HA_CREATE_INFO* info,
+ uint table_changes)
+{
+ enum row_type row_type, info_row_type;
+ DBUG_ENTER("ha_innobase::check_if_incompatible_data");
+
+ if (table_changes != IS_EQUAL_YES) {
+
+ DBUG_PRINT("info", ("table_changes != IS_EQUAL_YES "
+ "-> COMPATIBLE_DATA_NO"));
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check that auto_increment value was not changed */
+ if ((info->used_fields & HA_CREATE_USED_AUTO) &&
+ info->auto_increment_value != 0) {
+
+ DBUG_PRINT("info", ("auto_increment_value changed -> "
+ "COMPATIBLE_DATA_NO"));
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ }
+
+ /* For column rename operation, MySQL does not supply enough
+ information (new column name etc.) for InnoDB to make appropriate
+ system metadata change. To avoid system metadata inconsistency,
+ currently we can just request a table rebuild/copy by returning
+ COMPATIBLE_DATA_NO */
+ if (check_column_being_renamed(table, NULL)) {
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check if a column participating in a foreign key is being renamed.
+ There is no mechanism for updating InnoDB foreign key definitions. */
+ if (foreign_key_column_is_being_renamed(prebuilt, table)) {
+
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ }
+
+ /* Check that the row format didn't change.
+ ROW_TYPE_DEFAULT is treated as ROW_TYPE_COMPACT. */
+ row_type = get_row_type();
+ info_row_type = info->row_type;
+ if (info_row_type == ROW_TYPE_DEFAULT) {
+ info_row_type = ROW_TYPE_COMPACT;
+ }
+
+ if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT)
+ && row_type != info_row_type) {
+
+ DBUG_PRINT("info", ("get_row_type()=%d != info->row_type=%d -> "
+ "COMPATIBLE_DATA_NO",
+ row_type, info->row_type));
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ }
+
+ /* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */
+ if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) {
+ DBUG_PRINT("info", ("HA_CREATE_USED_KEY_BLOCK_SIZE -> "
+ "COMPATIBLE_DATA_NO"));
+ DBUG_RETURN(COMPATIBLE_DATA_NO);
+ }
+
+ DBUG_PRINT("info", (" -> COMPATIBLE_DATA_YES"));
+ DBUG_RETURN(COMPATIBLE_DATA_YES);
+}
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+ const char* format_name) /*!< in: pointer to file format name */
+{
+ char* endp;
+ uint format_id;
+
+ ut_a(format_name != NULL);
+
+ /* The format name can contain the format id itself instead of
+ the name and we check for that. */
+ format_id = (uint) strtoul(format_name, &endp, 10);
+
+ /* Check for valid parse. */
+ if (*endp == '\0' && *format_name != '\0') {
+
+ if (format_id <= DICT_TF_FORMAT_MAX) {
+
+ return(format_id);
+ }
+ } else {
+
+ for (format_id = 0; format_id <= DICT_TF_FORMAT_MAX;
+ format_id++) {
+ const char* name;
+
+ name = trx_sys_file_format_id_to_name(format_id);
+
+ if (!innobase_strcasecmp(format_name, name)) {
+
+ return(format_id);
+ }
+ }
+ }
+
+ return(DICT_TF_FORMAT_MAX + 1);
+}
+
+/************************************************************//**
+Validate the file format check value: it must be one of "on" or "off".
+As a side effect this sets the srv_check_file_format_at_startup variable.
+@return true if the config value is one of "on" or "off" */
+static
+bool
+innobase_file_format_check_on_off(
+/*==============================*/
+ const char* format_check) /*!< in: parameter value */
+{
+ bool ret = true;
+
+ if (!innobase_strcasecmp(format_check, "off")) {
+
+ /* Set the value to disable checking. */
+ srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX + 1;
+
+ } else if (!innobase_strcasecmp(format_check, "on")) {
+
+ /* Set the value to the lowest supported format. */
+ srv_check_file_format_at_startup = DICT_TF_FORMAT_51;
+ } else {
+ ret = false;
+ }
+
+ return(ret);
+}
+
+/************************************************************//**
+Validate the file format check config parameter and, as a side effect,
+set the srv_check_file_format_at_startup variable.
+@return the format_id if the config value is valid, otherwise -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*================================*/
+ const char* format_check) /*!< in: parameter value */
+{
+ uint format_id;
+
+ format_id = innobase_file_format_name_lookup(format_check);
+
+ if (format_id < DICT_TF_FORMAT_MAX + 1) {
+ srv_check_file_format_at_startup = format_id;
+
+ return((int) format_id);
+ } else {
+ return(-1);
+ }
+}
+
+/*************************************************************//**
+Check if it is a valid file format. This function is registered as
+a callback with MySQL.
+@return 0 for valid file format */
+static
+int
+innodb_file_format_name_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* file_format_input;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ file_format_input = value->val_str(value, buff, &len);
+
+ if (file_format_input != NULL) {
+ uint format_id;
+
+ format_id = innobase_file_format_name_lookup(
+ file_format_input);
+
+ if (format_id <= DICT_TF_FORMAT_MAX) {
+
+ /* Save a pointer to the name in the
+ 'file_format_name_map' constant array. */
+ *static_cast<const char**>(save) =
+ trx_sys_file_format_id_to_name(format_id);
+
+ return(0);
+ }
+ }
+
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+}
+
+/****************************************************************//**
+Update the system variable innodb_file_format using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_file_format_name_update(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr, /*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ const char* format_name;
+
+ ut_a(var_ptr != NULL);
+ ut_a(save != NULL);
+
+ format_name = *static_cast<const char*const*>(save);
+
+ if (format_name) {
+ uint format_id;
+
+ format_id = innobase_file_format_name_lookup(format_name);
+
+ if (format_id <= DICT_TF_FORMAT_MAX) {
+ srv_file_format = format_id;
+ }
+ }
+
+ *static_cast<const char**>(var_ptr)
+ = trx_sys_file_format_id_to_name(srv_file_format);
+}
+
+/*************************************************************//**
+Check if valid argument to innodb_file_format_check. This
+function is registered as a callback with MySQL.
+@return 0 for valid file format */
+static
+int
+innodb_file_format_check_validate(
+/*==============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* file_format_input;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+ int format_id;
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ file_format_input = value->val_str(value, buff, &len);
+
+ if (file_format_input != NULL) {
+
+ /* Check if the user set on/off; we want to print a suitable
+ message if they did so. */
+
+ if (innobase_file_format_check_on_off(file_format_input)) {
+ push_warning_printf(thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: invalid innodb_file_format_check "
+ "value; on/off can only be set at startup or "
+ "in the configuration file");
+ } else {
+ format_id = innobase_file_format_validate_and_set(
+ file_format_input);
+
+ if (format_id >= 0) {
+ /* Save a pointer to the name in the
+ 'file_format_name_map' constant array. */
+ *static_cast<const char**>(save) =
+ trx_sys_file_format_id_to_name(
+ (uint)format_id);
+
+ return(0);
+
+ } else {
+ push_warning_printf(thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "InnoDB: invalid innodb_file_format_check "
+ "value; can be any format up to %s "
+ "or its equivalent numeric id",
+ trx_sys_file_format_id_to_name(
+ DICT_TF_FORMAT_MAX));
+ }
+ }
+ }
+
+ *static_cast<const char**>(save) = NULL;
+ return(1);
+}
+
+/****************************************************************//**
+Update the system variable innodb_file_format_check using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_file_format_check_update(
+/*============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr, /*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ const char* format_name_in;
+ const char** format_name_out;
+ uint format_id;
+
+ ut_a(save != NULL);
+ ut_a(var_ptr != NULL);
+
+ format_name_in = *static_cast<const char*const*>(save);
+
+ if (!format_name_in) {
+
+ return;
+ }
+
+ format_id = innobase_file_format_name_lookup(format_name_in);
+
+ if (format_id > DICT_TF_FORMAT_MAX) {
+ /* DEFAULT is "on", which is invalid at runtime. */
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_WRONG_ARGUMENTS,
+ "Ignoring SET innodb_file_format=%s",
+ format_name_in);
+ return;
+ }
+
+ format_name_out = static_cast<const char**>(var_ptr);
+
+ /* Update the max format id in the system tablespace. */
+ if (trx_sys_file_format_max_set(format_id, format_name_out)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " [Info] InnoDB: the file format in the system "
+ "tablespace is now set to %s.\n", *format_name_out);
+ }
+}
+
+/****************************************************************//**
+Update the system variable innodb_adaptive_hash_index using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_adaptive_hash_index_update(
+/*==============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr, /*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ if (*(my_bool*) save) {
+ btr_search_enable();
+ } else {
+ btr_search_disable();
+ }
+}
+
+/****************************************************************//**
+Update the system variable innodb_old_blocks_pct using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_old_blocks_pct_update(
+/*=========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+ *static_cast<const uint*>(save), TRUE);
+}
+
+/*************************************************************//**
+Find the ibuf_use_t value that indexes into the
+innobase_change_buffering_values[] array for the given
+change buffering option name.
+@return the corresponding IBUF_USE_* value for the input variable
+name, or IBUF_USE_COUNT if no match is found */
+static
+ibuf_use_t
+innodb_find_change_buffering_value(
+/*===============================*/
+ const char* input_name) /*!< in: input change buffering
+ option name */
+{
+ ulint use;
+
+ for (use = 0; use < UT_ARR_SIZE(innobase_change_buffering_values);
+ use++) {
+ /* found a match */
+ if (!innobase_strcasecmp(
+ input_name, innobase_change_buffering_values[use])) {
+ return((ibuf_use_t)use);
+ }
+ }
+
+ /* Did not find any match */
+ return(IBUF_USE_COUNT);
+}
+
+/*************************************************************//**
+Check if it is a valid value of innodb_change_buffering. This function is
+registered as a callback with MySQL.
+@return 0 for valid innodb_change_buffering */
+static
+int
+innodb_change_buffering_validate(
+/*=============================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to system
+ variable */
+ void* save, /*!< out: immediate result
+ for update function */
+ struct st_mysql_value* value) /*!< in: incoming string */
+{
+ const char* change_buffering_input;
+ char buff[STRING_BUFFER_USUAL_SIZE];
+ int len = sizeof(buff);
+
+ ut_a(save != NULL);
+ ut_a(value != NULL);
+
+ change_buffering_input = value->val_str(value, buff, &len);
+
+ if (change_buffering_input != NULL) {
+ ibuf_use_t use;
+
+ use = innodb_find_change_buffering_value(
+ change_buffering_input);
+
+ if (use != IBUF_USE_COUNT) {
+ /* Found a matching change_buffering option value. */
+ *static_cast<const char**>(save) =
+ innobase_change_buffering_values[use];
+
+ return(0);
+ }
+ }
+
+ /* No corresponding change buffering option for user supplied
+ "change_buffering_input" */
+ return(1);
+}
+
+/****************************************************************//**
+Update the system variable innodb_change_buffering using the "saved"
+value. This function is registered as a callback with MySQL. */
+static
+void
+innodb_change_buffering_update(
+/*===========================*/
+ THD* thd, /*!< in: thread handle */
+ struct st_mysql_sys_var* var, /*!< in: pointer to
+ system variable */
+ void* var_ptr,/*!< out: where the
+ formal string goes */
+ const void* save) /*!< in: immediate result
+ from check function */
+{
+ ibuf_use_t use;
+
+ ut_a(var_ptr != NULL);
+ ut_a(save != NULL);
+
+ use = innodb_find_change_buffering_value(
+ *static_cast<const char*const*>(save));
+
+ ut_a(use < IBUF_USE_COUNT);
+
+ ibuf_use = use;
+ *static_cast<const char**>(var_ptr) =
+ *static_cast<const char*const*>(save);
+}
+
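+/*******************************************************************//**
+Callback registered with MySQL for SHOW STATUS: refreshes the exported
+InnoDB status values and points the SHOW_VAR entry at the
+innodb_status_variables array.
+@return 0 */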
+static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff)
+{
+ innodb_export_status();
+ var->type= SHOW_ARRAY;
+ var->value= (char *) &innodb_status_variables;
+ return 0;
+}
+
+/***********************************************************************
+This function checks each index name for a table against the reserved
+name of the system default primary index, 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client and
+returns true. */
+extern "C" UNIV_INTERN
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ /* out: true if an index name
+ matches the reserved name */
+ const trx_t* trx, /* in: InnoDB transaction handle */
+ const KEY* key_info, /* in: Indexes to be created */
+ ulint num_of_keys) /* in: Number of indexes to
+ be created. */
+{
+ const KEY* key;
+ uint key_num; /* index number */
+
+ for (key_num = 0; key_num < num_of_keys; key_num++) {
+ key = &key_info[key_num];
+
+ if (innobase_strcasecmp(key->name,
+ innobase_index_reserve_name) == 0) {
+ /* Push warning to mysql */
+ push_warning_printf((THD*) trx->mysql_thd,
+ MYSQL_ERROR::WARN_LEVEL_WARN,
+ ER_WRONG_NAME_FOR_INDEX,
+ "Cannot Create Index with name "
+ "'%s'. The name is reserved "
+ "for the system default primary "
+ "index.",
+ innobase_index_reserve_name);
+
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ innobase_index_reserve_name);
+
+ return(true);
+ }
+ }
+
+ return(false);
+}
+
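+/** Hooks show_innodb_vars() into the server's status variable handling. */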
+static SHOW_VAR innodb_status_variables_export[]= {
+ {"Innodb", (char*) &show_innodb_vars, SHOW_FUNC},
+ {NullS, NullS, SHOW_LONG}
+};
+
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/* plugin options */
+static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable InnoDB checksums validation (enabled by default). "
+ "Disable with --skip-innodb-checksums.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(fast_checksum, innobase_fast_checksum,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Change the algorithm of checksum for the whole of datapage to 4-bytes word based. "
+ "The original checksum is checked after the new one. It may be slow for reading page"
+ " which has orginal checksum. Overwrite the page or recreate the InnoDB database, "
+ "if you want the entire benefit for performance at once. "
+ "#### Attention: The checksum is not compatible for normal or disabled version! ####",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(page_size, innobase_page_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "###EXPERIMENTAL###: The universal page size of the database. Changing for created database is not supported. Use on your own risk!",
+ NULL, NULL, (1 << 14), (1 << 12), (1 << UNIV_PAGE_SIZE_SHIFT_MAX), 0);
+
+static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
+ PLUGIN_VAR_READONLY,
+ "The common part for InnoDB table spaces.",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(extra_undoslots, innobase_extra_undoslots,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable to use about 4000 undo slots instead of default 1024. "
+ "#### Attention: Once you enable this parameter, "
+ "don't use the datafile for normal mysqld or ibbackup! ####",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(fast_recovery, innobase_fast_recovery,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "obsolete option. affects nothing.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(recovery_stats, innobase_recovery_stats,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Output statistics of recovery process after it.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULINT(use_purge_thread, srv_use_purge_thread,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of purge devoted threads. #### over 1 is EXPERIMENTAL ####",
+ NULL, NULL, 1, 0, 64, 0);
+
+static MYSQL_SYSVAR_BOOL(overwrite_relay_log_info, innobase_overwrite_relay_log_info,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "During InnoDB crash recovery on slave overwrite relay-log.info "
+ "to align master log file position if information in InnoDB and relay-log.info is different.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable InnoDB doublewrite buffer (enabled by default). "
+ "Disable with --skip-innodb-doublewrite.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of IOPs the server can do. Tunes the background IO rate",
+ NULL, NULL, 200, 100, ~0L, 0);
+
+static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
+ PLUGIN_VAR_OPCMDARG,
+ "Speeds up the shutdown process of the InnoDB storage engine. Possible "
+ "values are 0, 1 (faster)"
+ /*
+ NetWare can't close unclosed files, can't automatically kill remaining
+ threads, etc, so on this OS we disable the crash-like InnoDB shutdown.
+ */
+ IF_NETWARE("", " or 2 (fastest - crash-like)")
+ ".",
+ NULL, NULL, 1, 0, IF_NETWARE(1,2), 0);
+
+static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table,
+ PLUGIN_VAR_NOCMDARG,
+ "Stores each InnoDB table to an .ibd file in the database dir.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name,
+ PLUGIN_VAR_RQCMDARG,
+ "File format to use for new tables in .ibd files.",
+ innodb_file_format_name_validate,
+ innodb_file_format_name_update, "Antelope");
+
+/* If a new file format is introduced, the file format
+name needs to be updated accordingly. Please refer to
+file_format_name_map[] defined in trx0sys.c for the next
+file format name. */
+static MYSQL_SYSVAR_STR(file_format_check, innobase_file_format_check,
+ PLUGIN_VAR_OPCMDARG,
+ "The highest file format in the tablespace.",
+ innodb_file_format_check_validate,
+ innodb_file_format_check_update, "Barracuda");
+
+static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
+ PLUGIN_VAR_OPCMDARG,
+ "Set to 0 (write and flush once per second),"
+ " 1 (write and flush at each commit)"
+ " or 2 (write at commit, flush once per second).",
+ NULL, NULL, 1, 0, 2, 0);
+
+static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "With which method to flush data.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Force InnoDB to not use next-key locking, to use only row-level locking.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(show_verbose_locks, srv_show_verbose_locks,
+ PLUGIN_VAR_OPCMDARG,
+ "Whether to show records locked in SHOW INNODB STATUS.",
+ NULL, NULL, 0, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(show_locks_held, srv_show_locks_held,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of locks held to print for each InnoDB transaction in SHOW INNODB STATUS.",
+ NULL, NULL, 10, 0, 1000, 0);
+
+#ifdef UNIV_LOG_ARCHIVE
+static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Where full logs should be archived.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Set to 1 if you want to have logs archived.", NULL, NULL, FALSE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+static MYSQL_SYSVAR_STR(log_group_home_dir, innobase_log_group_home_dir,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to InnoDB log files.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of dirty pages allowed in bufferpool.",
+ NULL, NULL, 75, 0, 99, 0);
+
+static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
+ PLUGIN_VAR_NOCMDARG,
+ "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
+ PLUGIN_VAR_RQCMDARG,
+ "Desired maximum length of the purge queue (0 = no limit)",
+ NULL, NULL, 0, 0, ~0L, 0);
+
+static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+ "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file,
+ PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR,
+ "Enable SHOW INNODB STATUS output in the innodb_status.<pid> file",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_sample_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "The number of index pages to sample when calculating statistics (default 8)",
+ NULL, NULL, 8, 1, ~0ULL, 0);
+
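+/** Allowed values of the system variable innodb_stats_method. */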
+const char *innobase_stats_method_names[]=
+{
+ "nulls_equal",
+ "nulls_unequal",
+ "nulls_ignored",
+ NullS
+};
+TYPELIB innobase_stats_method_typelib=
+{
+ array_elements(innobase_stats_method_names) - 1, "innobase_stats_method_typelib",
+ innobase_stats_method_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(stats_method, srv_stats_method,
+ PLUGIN_VAR_RQCMDARG,
+ "Specifies how InnoDB index statistics collection code should threat NULLs. "
+ "Possible values of name are same to for 'myisam_stats_method'. "
+ "This is startup parameter.",
+ NULL, NULL, 0, &innobase_stats_method_typelib);
+
+static MYSQL_SYSVAR_ULONG(stats_auto_update, srv_stats_auto_update,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable InnoDB's auto update statistics of indexes. "
+ "(except for ANALYZE TABLE command) 0:disable 1:enable",
+ NULL, NULL, 1, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULINT(stats_update_need_lock, srv_stats_update_need_lock,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable InnoDB's update statistics which needs to lock dictionary. "
+ "e.g. Data_free.",
+ NULL, NULL, 1, 0, 1, 0);
+
+static MYSQL_SYSVAR_BOOL(use_sys_stats_table, innobase_use_sys_stats_table,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable to use SYS_STATS system table to store statistics statically, "
+ "And avoids to calculate statistics at every first open of the tables. "
+ "This option may make the opportunities of update statistics less. "
+ "So you should use ANALYZE TABLE command intentionally.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
+ PLUGIN_VAR_OPCMDARG,
+ "Enable InnoDB adaptive hash index (enabled by default). "
+ "Disable with --skip-innodb-adaptive-hash-index.",
+ NULL, innodb_adaptive_hash_index_update, TRUE);
+
+static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "Replication thread delay (ms) on the slave server if "
+ "innodb_thread_concurrency is reached (0 by default)",
+ NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.",
+ NULL, NULL, 8*1024*1024L, 512*1024L, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment,
+ PLUGIN_VAR_RQCMDARG,
+ "Data file autoextend increment in megabytes",
+ NULL, NULL, 8L, 1L, 1000L, 0);
+
+static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
+ NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
+
+static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.",
+ NULL, NULL, 0, 0, INT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Enable buffer_pool_shm checksum validation (enabled by default).",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
+ PLUGIN_VAR_RQCMDARG,
+ "Helps in performance tuning in heavily concurrent environments.",
+ innobase_commit_concurrency_validate, NULL, 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
+ NULL, NULL, 500L, 1L, ~0L, 0);
+
+static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
+ "Number of file I/O threads in InnoDB.",
+ NULL, NULL, 4, 4, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of background read I/O threads in InnoDB.",
+ NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of background write I/O threads in InnoDB.",
+ NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_LONG(force_recovery, innobase_force_recovery,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Helps to save your data in case the disk image of the database becomes corrupt.",
+ NULL, NULL, 0, 0, 6, 0);
+
+static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The size of the buffer which InnoDB uses to write log to the log files on disk.",
+ NULL, NULL, 8*1024*1024L, 256*1024L, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Size of each log file in a log group.",
+ NULL, NULL, 5*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L);
+
+static MYSQL_SYSVAR_LONG(log_files_in_group, innobase_log_files_in_group,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of log files in the log group. InnoDB writes to the files in a circular fashion. Value 3 is recommended here.",
+ NULL, NULL, 2, 2, 100, 0);
+
+static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.",
+ NULL, NULL, 1, 1, 10, 0);
+
+static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
+ PLUGIN_VAR_RQCMDARG,
+ "Percentage of the buffer pool to reserve for 'old' blocks.",
+ NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0);
+
+static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms,
+ PLUGIN_VAR_RQCMDARG,
+ "Move blocks to the 'new' end of the buffer pool if the first access"
+ " was at least this many milliseconds ago."
+ " The timeout is disabled if 0 (the default).",
+ NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_LONG(open_files, innobase_open_files,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "How many files at the maximum InnoDB keeps open at the same time.",
+ NULL, NULL, 300L, 10L, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds,
+ PLUGIN_VAR_RQCMDARG,
+ "Count of spin-loop rounds in InnoDB mutexes (30 by default)",
+ NULL, NULL, 30L, 0L, ~0L, 0);
+
+static MYSQL_SYSVAR_ULONG(spin_wait_delay, srv_spin_wait_delay,
+ PLUGIN_VAR_OPCMDARG,
+ "Maximum delay between polling for a spin lock (6 by default)",
+ NULL, NULL, 6L, 0L, ~0L, 0);
+
+static MYSQL_SYSVAR_BOOL(thread_concurrency_timer_based,
+ innobase_thread_concurrency_timer_based,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use InnoDB timer based concurrency throttling. ",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency,
+ PLUGIN_VAR_RQCMDARG,
+ "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.",
+ NULL, NULL, 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
+ PLUGIN_VAR_RQCMDARG,
+ "Time of innodb thread sleeping before joining InnoDB queue (usec). Value 0 disable a sleep",
+ NULL, NULL, 10000L, 0L, ~0L, 0);
+
+static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to individual files and their sizes.",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(doublewrite_file, innobase_doublewrite_file,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Path to special datafile for doublewrite buffer. (default is "": not used) ### ONLY FOR EXPERTS!!! ###",
+ NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The AUTOINC lock modes supported by InnoDB: "
+ "0 => Old style AUTOINC locking (for backward"
+ " compatibility) "
+ "1 => New style AUTOINC locking "
+ "2 => No AUTOINC locking (unsafe for SBR)",
+ NULL, NULL,
+ AUTOINC_NEW_STYLE_LOCKING, /* Default setting */
+ AUTOINC_OLD_STYLE_LOCKING, /* Minimum value */
+ AUTOINC_NO_LOCKING, 0); /* Maximum value */
+
+static MYSQL_SYSVAR_STR(version, innodb_version_str,
+ PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
+ "Percona-InnoDB-plugin version", NULL, NULL, INNODB_VERSION_STR);
+
+static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use OS memory allocator instead of InnoDB's internal memory allocator",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
+ PLUGIN_VAR_RQCMDARG,
+ "Buffer changes to reduce random access: "
+ "OFF, ON, none, inserts.",
+ innodb_change_buffering_validate,
+ innodb_change_buffering_update, "inserts");
+
+static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
+ PLUGIN_VAR_RQCMDARG,
+ "Number of pages that must be accessed sequentially for InnoDB to "
+ "trigger a readahead.",
+ NULL, NULL, 56, 0, 64, 0);
+
+static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "The maximum size of the insert buffer. (in bytes)",
+ NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
+ NULL, NULL, 1, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
+ PLUGIN_VAR_RQCMDARG,
+ "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
+ NULL, NULL, 100, 100, 999999999, 0);
+
+static MYSQL_SYSVAR_ULINT(checkpoint_age_target, srv_checkpoint_age_target,
+ PLUGIN_VAR_RQCMDARG,
+ "Control soft limit of checkpoint age. (0 : not control)",
+ NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
+ NULL, NULL, 1, 0, 1, 0);
+
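+/****************************************************************//**
+Update the system variable innodb_read_ahead using the "saved" value,
+mapping the numeric compatibility aliases 4-7 back onto 0-3.
+This function is registered as a callback with MySQL. */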
+static
+void
+innodb_read_ahead_update(
+ THD* thd,
+ struct st_mysql_sys_var* var,
+ void* var_ptr,
+ const void* save)
+{
+ *(long *)var_ptr= (*(long *)save) & 3;
+}
+const char *read_ahead_names[]=
+{
+ "none", /* 0 */
+ "random",
+ "linear",
+ "both", /* 3 */
+ /* For compatibility of the older patch */
+ "0", /* 4 ("none" + 4) */
+ "1",
+ "2",
+ "3", /* 7 ("both" + 4) */
+ NullS
+};
+TYPELIB read_ahead_typelib=
+{
+ array_elements(read_ahead_names) - 1, "read_ahead_typelib",
+ read_ahead_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
+ PLUGIN_VAR_RQCMDARG,
+ "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
+ NULL, innodb_read_ahead_update, 2, &read_ahead_typelib);
+
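+/****************************************************************//**
+Update the system variable innodb_adaptive_checkpoint using the "saved"
+value, mapping the numeric compatibility aliases 3-5 back onto 0-2.
+This function is registered as a callback with MySQL. */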
+static
+void
+innodb_adaptive_checkpoint_update(
+ THD* thd,
+ struct st_mysql_sys_var* var,
+ void* var_ptr,
+ const void* save)
+{
+ *(long *)var_ptr= (*(long *)save) % 3;
+}
+const char *adaptive_checkpoint_names[]=
+{
+ "none", /* 0 */
+ "reflex", /* 1 */
+ "estimate", /* 2 */
+ /* For compatibility of the older patch */
+ "0", /* 3 ("none" + 3) */
+ "1", /* 4 ("reflex" + 3) */
+ "2", /* 5 ("estimate" + 3) */
+ NullS
+};
+TYPELIB adaptive_checkpoint_typelib=
+{
+ array_elements(adaptive_checkpoint_names) - 1, "adaptive_checkpoint_typelib",
+ adaptive_checkpoint_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(adaptive_checkpoint, srv_adaptive_checkpoint,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable flushing along modified age. (none, reflex, [estimate])",
+ NULL, innodb_adaptive_checkpoint_update, 2, &adaptive_checkpoint_typelib);
+
+static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
+ NULL, NULL, 0, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import,
+ PLUGIN_VAR_RQCMDARG,
+ "Enable/Disable converting automatically *.ibd files when import tablespace.",
+ NULL, NULL, 0, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(extra_rsegments, srv_extra_rsegments,
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+ "Number of extra user rollback segments when create new database.",
+ NULL, NULL, 0, 0, 126, 0);
+
+static MYSQL_SYSVAR_ULONG(dict_size_limit, srv_dict_size_limit,
+ PLUGIN_VAR_RQCMDARG,
+ "Limit the allocated memory for dictionary cache. (0: unlimited)",
+ NULL, NULL, 0, 0, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_UINT(auto_lru_dump, srv_auto_lru_dump,
+ PLUGIN_VAR_RQCMDARG,
+ "Time in seconds between automatic buffer pool dumps. "
+ "0 (the default) disables automatic dumps.",
+ NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_ULINT(pass_corrupt_table, srv_pass_corrupt_table,
+ PLUGIN_VAR_RQCMDARG,
+ "Pass corruptions of user tables as 'corrupt table' instead of not crashing itself, "
+ "when used with file_per_table. "
+ "All file io for the datafile after detected as corrupt are disabled, "
+ "except for the deletion.",
+ NULL, NULL, 0, 0, 1, 0);
+
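+/** NULL-terminated array of all InnoDB system variables exported to MySQL. */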
+static struct st_mysql_sys_var* innobase_system_variables[]= {
+ MYSQL_SYSVAR(page_size),
+ MYSQL_SYSVAR(additional_mem_pool_size),
+ MYSQL_SYSVAR(autoextend_increment),
+ MYSQL_SYSVAR(buffer_pool_size),
+ MYSQL_SYSVAR(buffer_pool_shm_key),
+ MYSQL_SYSVAR(buffer_pool_shm_checksum),
+ MYSQL_SYSVAR(checksums),
+ MYSQL_SYSVAR(fast_checksum),
+ MYSQL_SYSVAR(commit_concurrency),
+ MYSQL_SYSVAR(concurrency_tickets),
+ MYSQL_SYSVAR(data_file_path),
+ MYSQL_SYSVAR(doublewrite_file),
+ MYSQL_SYSVAR(data_home_dir),
+ MYSQL_SYSVAR(doublewrite),
+ MYSQL_SYSVAR(extra_undoslots),
+ MYSQL_SYSVAR(fast_recovery),
+ MYSQL_SYSVAR(recovery_stats),
+ MYSQL_SYSVAR(fast_shutdown),
+ MYSQL_SYSVAR(file_io_threads),
+ MYSQL_SYSVAR(read_io_threads),
+ MYSQL_SYSVAR(write_io_threads),
+ MYSQL_SYSVAR(file_per_table),
+ MYSQL_SYSVAR(file_format),
+ MYSQL_SYSVAR(file_format_check),
+ MYSQL_SYSVAR(flush_log_at_trx_commit),
+ MYSQL_SYSVAR(flush_method),
+ MYSQL_SYSVAR(force_recovery),
+ MYSQL_SYSVAR(locks_unsafe_for_binlog),
+ MYSQL_SYSVAR(lock_wait_timeout),
+#ifdef UNIV_LOG_ARCHIVE
+ MYSQL_SYSVAR(log_arch_dir),
+ MYSQL_SYSVAR(log_archive),
+#endif /* UNIV_LOG_ARCHIVE */
+ MYSQL_SYSVAR(log_buffer_size),
+ MYSQL_SYSVAR(log_file_size),
+ MYSQL_SYSVAR(log_files_in_group),
+ MYSQL_SYSVAR(log_group_home_dir),
+ MYSQL_SYSVAR(max_dirty_pages_pct),
+ MYSQL_SYSVAR(adaptive_flushing),
+ MYSQL_SYSVAR(max_purge_lag),
+ MYSQL_SYSVAR(mirrored_log_groups),
+ MYSQL_SYSVAR(old_blocks_pct),
+ MYSQL_SYSVAR(old_blocks_time),
+ MYSQL_SYSVAR(open_files),
+ MYSQL_SYSVAR(overwrite_relay_log_info),
+ MYSQL_SYSVAR(rollback_on_timeout),
+ MYSQL_SYSVAR(stats_on_metadata),
+ MYSQL_SYSVAR(stats_method),
+ MYSQL_SYSVAR(stats_auto_update),
+ MYSQL_SYSVAR(stats_update_need_lock),
+ MYSQL_SYSVAR(use_sys_stats_table),
+ MYSQL_SYSVAR(stats_sample_pages),
+ MYSQL_SYSVAR(adaptive_hash_index),
+ MYSQL_SYSVAR(replication_delay),
+ MYSQL_SYSVAR(status_file),
+ MYSQL_SYSVAR(strict_mode),
+ MYSQL_SYSVAR(support_xa),
+ MYSQL_SYSVAR(sync_spin_loops),
+ MYSQL_SYSVAR(spin_wait_delay),
+ MYSQL_SYSVAR(table_locks),
+ MYSQL_SYSVAR(thread_concurrency),
+ MYSQL_SYSVAR(thread_concurrency_timer_based),
+ MYSQL_SYSVAR(thread_sleep_delay),
+ MYSQL_SYSVAR(autoinc_lock_mode),
+ MYSQL_SYSVAR(show_verbose_locks),
+ MYSQL_SYSVAR(show_locks_held),
+ MYSQL_SYSVAR(version),
+ MYSQL_SYSVAR(ibuf_max_size),
+ MYSQL_SYSVAR(ibuf_active_contract),
+ MYSQL_SYSVAR(ibuf_accel_rate),
+ MYSQL_SYSVAR(checkpoint_age_target),
+ MYSQL_SYSVAR(flush_neighbor_pages),
+ MYSQL_SYSVAR(read_ahead),
+ MYSQL_SYSVAR(adaptive_checkpoint),
+ MYSQL_SYSVAR(flush_log_at_trx_commit_session),
+ MYSQL_SYSVAR(enable_unsafe_group_commit),
+ MYSQL_SYSVAR(expand_import),
+ MYSQL_SYSVAR(extra_rsegments),
+ MYSQL_SYSVAR(dict_size_limit),
+ MYSQL_SYSVAR(use_sys_malloc),
+ MYSQL_SYSVAR(change_buffering),
+ MYSQL_SYSVAR(read_ahead_threshold),
+ MYSQL_SYSVAR(io_capacity),
+ MYSQL_SYSVAR(auto_lru_dump),
+ MYSQL_SYSVAR(use_purge_thread),
+ MYSQL_SYSVAR(pass_corrupt_table),
+ NULL
+};
+
+mysql_declare_plugin(xtradb)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &innobase_storage_engine,
+ innobase_hton_name,
+ "Percona",
+ "Percona-XtraDB, Supports transactions, row-level locking, and foreign keys",
+ PLUGIN_LICENSE_GPL,
+ innobase_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ INNODB_VERSION_SHORT,
+ innodb_status_variables_export,/* status variables */
+ innobase_system_variables, /* system variables */
+ NULL /* reserved */
+},
+i_s_innodb_rseg,
+i_s_innodb_buffer_pool_pages,
+i_s_innodb_buffer_pool_pages_index,
+i_s_innodb_buffer_pool_pages_blob,
+i_s_innodb_trx,
+i_s_innodb_locks,
+i_s_innodb_lock_waits,
+i_s_innodb_cmp,
+i_s_innodb_cmp_reset,
+i_s_innodb_cmpmem,
+i_s_innodb_cmpmem_reset,
+i_s_innodb_table_stats,
+i_s_innodb_index_stats,
+i_s_innodb_admin_command,
+i_s_innodb_sys_tables,
+i_s_innodb_sys_indexes,
+i_s_innodb_sys_stats,
+i_s_innodb_patches
+mysql_declare_plugin_end;
+maria_declare_plugin(xtradb)
+{ /* InnoDB */
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &innobase_storage_engine,
+ innobase_hton_name,
+ "Percona",
+ "XtraDB engine based on InnoDB plugin. Supports transactions, row-level locking, and foreign keys",
+ PLUGIN_LICENSE_GPL,
+ innobase_init, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ INNODB_VERSION_SHORT,
+ innodb_status_variables_export,/* status variables */
+ innobase_system_variables, /* system variables */
+ INNODB_VERSION_STR, /* string version */
+ MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+i_s_innodb_rseg_maria,
+i_s_innodb_buffer_pool_pages_maria,
+i_s_innodb_buffer_pool_pages_index_maria,
+i_s_innodb_buffer_pool_pages_blob_maria,
+i_s_innodb_trx_maria,
+i_s_innodb_locks_maria,
+i_s_innodb_lock_waits_maria,
+i_s_innodb_cmp_maria,
+i_s_innodb_cmp_reset_maria,
+i_s_innodb_cmpmem_maria,
+i_s_innodb_cmpmem_reset_maria,
+i_s_innodb_table_stats_maria,
+i_s_innodb_index_stats_maria,
+i_s_innodb_admin_command_maria,
+i_s_innodb_sys_tables_maria,
+i_s_innodb_sys_indexes_maria,
+i_s_innodb_sys_stats_maria,
+i_s_innodb_patches_maria
+maria_declare_plugin_end;
+
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default(void)
+/*==========================================*/
+{
+ MYSQL_SYSVAR_NAME(commit_concurrency).def_val
+ = innobase_commit_concurrency;
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+typedef struct innobase_convert_name_test_struct {
+ char* buf;
+ ulint buflen;
+ const char* id;
+ ulint idlen;
+ void* thd;
+ ibool file_id;
+
+ const char* expected;
+} innobase_convert_name_test_t;
+
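+/*******************************************************************//**
+Self-test for innobase_convert_name(): runs a set of identifiers through
+the conversion with various output buffer lengths and compares the
+results against the expected quoted (and possibly truncated) output. */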
+void
+test_innobase_convert_name()
+{
+ char buf[1024];
+ ulint i;
+
+ innobase_convert_name_test_t test_input[] = {
+ {buf, sizeof(buf), "abcd", 4, NULL, TRUE, "\"abcd\""},
+ {buf, 7, "abcd", 4, NULL, TRUE, "\"abcd\""},
+ {buf, 6, "abcd", 4, NULL, TRUE, "\"abcd\""},
+ {buf, 5, "abcd", 4, NULL, TRUE, "\"abc\""},
+ {buf, 4, "abcd", 4, NULL, TRUE, "\"ab\""},
+
+ {buf, sizeof(buf), "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 9, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 8, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 7, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+ {buf, 6, "ab@0060cd", 9, NULL, TRUE, "\"ab`c\""},
+ {buf, 5, "ab@0060cd", 9, NULL, TRUE, "\"ab`\""},
+ {buf, 4, "ab@0060cd", 9, NULL, TRUE, "\"ab\""},
+
+ {buf, sizeof(buf), "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"cd\""},
+ {buf, 17, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"cd\""},
+ {buf, 16, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"c\""},
+ {buf, 15, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\"\"\""},
+ {buf, 14, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\""},
+ {buf, 13, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#ab\""},
+ {buf, 12, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#a\""},
+ {buf, 11, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50#\""},
+ {buf, 10, "ab\"cd", 5, NULL, TRUE,
+ "\"#mysql50\""},
+
+ {buf, sizeof(buf), "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""},
+ {buf, 9, "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""},
+ {buf, 8, "ab/cd", 5, NULL, TRUE, "\"ab\".\"c\""},
+ {buf, 7, "ab/cd", 5, NULL, TRUE, "\"ab\".\"\""},
+ {buf, 6, "ab/cd", 5, NULL, TRUE, "\"ab\"."},
+ {buf, 5, "ab/cd", 5, NULL, TRUE, "\"ab\"."},
+ {buf, 4, "ab/cd", 5, NULL, TRUE, "\"ab\""},
+ {buf, 3, "ab/cd", 5, NULL, TRUE, "\"a\""},
+ {buf, 2, "ab/cd", 5, NULL, TRUE, "\"\""},
+ /* XXX probably "" is a better result in this case
+ {buf, 1, "ab/cd", 5, NULL, TRUE, "."},
+ */
+ {buf, 0, "ab/cd", 5, NULL, TRUE, ""},
+ };
+
+ for (i = 0; i < sizeof(test_input) / sizeof(test_input[0]); i++) {
+
+ char* end;
+ ibool ok = TRUE;
+ size_t res_len;
+
+ fprintf(stderr, "TESTING %lu, %s, %lu, %s\n",
+ test_input[i].buflen,
+ test_input[i].id,
+ test_input[i].idlen,
+ test_input[i].expected);
+
+ end = innobase_convert_name(
+ test_input[i].buf,
+ test_input[i].buflen,
+ test_input[i].id,
+ test_input[i].idlen,
+ test_input[i].thd,
+ test_input[i].file_id);
+
+ res_len = (size_t) (end - test_input[i].buf);
+
+ if (res_len != strlen(test_input[i].expected)) {
+
+ fprintf(stderr, "unexpected len of the result: %u, "
+ "expected: %u\n", (unsigned) res_len,
+ (unsigned) strlen(test_input[i].expected));
+ ok = FALSE;
+ }
+
+ if (memcmp(test_input[i].buf,
+ test_input[i].expected,
+ strlen(test_input[i].expected)) != 0
+ || !ok) {
+
+ fprintf(stderr, "unexpected result: %.*s, "
+ "expected: %s\n", (int) res_len,
+ test_input[i].buf,
+ test_input[i].expected);
+ ok = FALSE;
+ }
+
+ if (ok) {
+ fprintf(stderr, "OK: res: %.*s\n\n", (int) res_len,
+ buf);
+ } else {
+ fprintf(stderr, "FAILED\n\n");
+ return;
+ }
+ }
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+
+/****************************************************************************
+ * DS-MRR implementation
+ ***************************************************************************/
+
+/**
+ * Multi Range Read interface, DS-MRR calls
+ */
+
+int ha_innobase::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+ uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+{
+ return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+}
+
+int ha_innobase::multi_range_read_next(char **range_info)
+{
+ return ds_mrr.dsmrr_next(range_info);
+}
+
+ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+ void *seq_init_param,
+ uint n_ranges, uint *bufsz,
+ uint *flags,
+ COST_VECT *cost)
+{
+ /* See comments in ha_myisam::multi_range_read_info_const */
+ ds_mrr.init(this, table);
+
+ if (prebuilt->select_lock_type != LOCK_NONE)
+ *flags |= HA_MRR_USE_DEFAULT_IMPL;
+
+ ha_rows res= ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges,
+ bufsz, flags, cost);
+ return res;
+}
+
+ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges,
+ uint keys, uint *bufsz,
+ uint *flags, COST_VECT *cost)
+{
+ ds_mrr.init(this, table);
+ ha_rows res= ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+ return res;
+}
+
+
+
+/**
+ * Index Condition Pushdown interface implementation
+ */
+
+C_MODE_START
+
+/*
+ Index condition check function to be called from within Innobase.
+ See note on ICP_RESULT for return values description.
+*/
+
+static int index_cond_func_innodb(void *arg)
+{
+ ha_innobase *h= (ha_innobase*)arg;
+ if (h->end_range)
+ {
+ if (h->compare_key2(h->end_range) > 0)
+ return ICP_OUT_OF_RANGE; /* caller should return HA_ERR_END_OF_FILE already */
+ }
+ return h->pushed_idx_cond->val_int()? ICP_MATCH : ICP_NO_MATCH;
+}
+
+C_MODE_END
+
+
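+/*
+  Push an index condition to the handler. The condition is accepted only
+  for reads on a secondary index that take no row locks; otherwise it is
+  returned unchanged so that the server evaluates it itself.
+*/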
+Item *ha_innobase::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
+{
+ if ((keyno_arg != primary_key) && (prebuilt->select_lock_type == LOCK_NONE))
+ {
+ pushed_idx_cond_keyno= keyno_arg;
+ pushed_idx_cond= idx_cond_arg;
+ in_range_check_pushed_down= TRUE;
+ return NULL; /* Table handler will check the entire condition */
+ }
+ return idx_cond_arg; /* Table handler will not make any checks */
+}
+
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
new file mode 100644
index 00000000000..50a43aaebed
--- /dev/null
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -0,0 +1,349 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*
+ This file is based on ha_berkeley.h of MySQL distribution
+
+ This file defines the Innodb handler: the interface between MySQL and
+ Innodb
+*/
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface /* gcc class implementation */
+#endif
+
+/* This structure defines the translation table between a MySQL index
+and the corresponding InnoDB index structures */
+typedef struct innodb_idx_translate_struct {
+ ulint index_count; /*!< number of valid index entries
+ in the index_mapping array */
+ ulint array_size; /*!< array size of index_mapping */
+ dict_index_t** index_mapping; /*!< index pointer array directly
+ maps to index in Innodb from MySQL
+ array index */
+} innodb_idx_translate_t;
+
+
+/** InnoDB table share */
+typedef struct st_innobase_share {
+ THR_LOCK lock; /*!< MySQL lock protecting
+ this structure */
+ const char* table_name; /*!< InnoDB table name */
+ uint use_count; /*!< reference count,
+ incremented in get_share()
+ and decremented in
+ free_share() */
+ void* table_name_hash;/*!< hash table chain node */
+ innodb_idx_translate_t idx_trans_tbl; /*!< index translation
+ table between MySQL and
+ Innodb */
+ dict_table_t* ib_table;
+} INNOBASE_SHARE;
+
+
+/** InnoDB B-tree index */
+struct dict_index_struct;
+/** Prebuilt structures in an Innobase table handle used within MySQL */
+struct row_prebuilt_struct;
+
+/** InnoDB B-tree index */
+typedef struct dict_index_struct dict_index_t;
+/** Prebuilt structures in an Innobase table handle used within MySQL */
+typedef struct row_prebuilt_struct row_prebuilt_t;
+
+/** The class defining a handle to an Innodb table */
+class ha_innobase: public handler
+{
+ row_prebuilt_t* prebuilt; /*!< prebuilt struct in InnoDB, used
+ to save CPU time with prebuilt data
+ structures*/
+ THD* user_thd; /*!< the thread handle of the user
+ currently using the handle; this is
+ set in external_lock function */
+ THR_LOCK_DATA lock;
+ INNOBASE_SHARE* share; /*!< information for MySQL
+ table locking */
+
+ uchar* upd_buff; /*!< buffer used in updates */
+ uchar* key_val_buff; /*!< buffer used in converting
+ search key values from MySQL format
+ to Innodb format */
+ ulong upd_and_key_val_buff_len;
+ /* the length of each of the previous
+ two buffers */
+ Table_flags int_table_flags;
+ uint primary_key;
+ ulong start_of_scan; /*!< this is set to 1 when we are
+ starting a table scan but have not
+ yet fetched any row, else 0 */
+ uint last_match_mode;/* match mode of the latest search:
+ ROW_SEL_EXACT, ROW_SEL_EXACT_PREFIX,
+ or undefined */
+ uint num_write_row; /*!< number of write_row() calls */
+
+ uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
+ const uchar* record);
+ inline void update_thd(THD* thd);
+ void update_thd();
+ int change_active_index(uint keynr);
+ int general_fetch(uchar* buf, uint direction, uint match_mode);
+ ulint innobase_lock_autoinc();
+ ulonglong innobase_peek_autoinc();
+ ulint innobase_set_max_autoinc(ulonglong auto_inc);
+ ulint innobase_reset_autoinc(ulonglong auto_inc);
+ ulint innobase_get_autoinc(ulonglong* value);
+ ulint innobase_update_autoinc(ulonglong auto_inc);
+ void innobase_initialize_autoinc();
+ dict_index_t* innobase_get_index(uint keynr);
+
+ /* Init values for the class: */
+ public:
+ ha_innobase(handlerton *hton, TABLE_SHARE *table_arg);
+ ~ha_innobase();
+ /*
+ Get the row type from the storage engine. If this method returns
+ ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used.
+ */
+ enum row_type get_row_type() const;
+
+ const char* table_type() const;
+ const char* index_type(uint key_number);
+ const char** bas_ext() const;
+ Table_flags table_flags() const;
+ ulong index_flags(uint idx, uint part, bool all_parts) const;
+ uint max_supported_keys() const;
+ uint max_supported_key_length() const;
+ uint max_supported_key_part_length() const;
+ const key_map* keys_to_use_for_scanning();
+
+ int open(const char *name, int mode, uint test_if_locked);
+ int close(void);
+ double scan_time();
+ double read_time(uint index, uint ranges, ha_rows rows);
+ bool is_corrupt() const;
+
+ int write_row(uchar * buf);
+ int update_row(const uchar * old_data, uchar * new_data);
+ int delete_row(const uchar * buf);
+ bool was_semi_consistent_read();
+ void try_semi_consistent_read(bool yes);
+ void unlock_row();
+
+ int index_init(uint index, bool sorted);
+ int index_end();
+ int index_read(uchar * buf, const uchar * key,
+ uint key_len, enum ha_rkey_function find_flag);
+ int index_read_idx(uchar * buf, uint index, const uchar * key,
+ uint key_len, enum ha_rkey_function find_flag);
+ int index_read_last(uchar * buf, const uchar * key, uint key_len);
+ int index_next(uchar * buf);
+ int index_next_same(uchar * buf, const uchar *key, uint keylen);
+ int index_prev(uchar * buf);
+ int index_first(uchar * buf);
+ int index_last(uchar * buf);
+
+ int rnd_init(bool scan);
+ int rnd_end();
+ int rnd_next(uchar *buf);
+ int rnd_pos(uchar * buf, uchar *pos);
+
+ void position(const uchar *record);
+ int info(uint);
+ int analyze(THD* thd,HA_CHECK_OPT* check_opt);
+ int optimize(THD* thd,HA_CHECK_OPT* check_opt);
+ int discard_or_import_tablespace(my_bool discard);
+ int extra(enum ha_extra_function operation);
+ int reset();
+ int external_lock(THD *thd, int lock_type);
+ int transactional_table_lock(THD *thd, int lock_type);
+ int start_stmt(THD *thd, thr_lock_type lock_type);
+ void position(uchar *record);
+ ha_rows records_in_range(uint inx, key_range *min_key, key_range
+ *max_key);
+ ha_rows estimate_rows_upper_bound();
+
+ void update_create_info(HA_CREATE_INFO* create_info);
+ int create(const char *name, register TABLE *form,
+ HA_CREATE_INFO *create_info);
+ int delete_all_rows();
+ int delete_table(const char *name);
+ int rename_table(const char* from, const char* to);
+ int check(THD* thd, HA_CHECK_OPT* check_opt);
+ char* update_table_comment(const char* comment);
+ char* get_foreign_key_create_info();
+ int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list);
+ bool can_switch_engines();
+ uint referenced_by_foreign_key();
+ void free_foreign_key_create_info(char* str);
+ THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
+ enum thr_lock_type lock_type);
+ void init_table_handle_for_HANDLER();
+ virtual void get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values);
+ int reset_auto_increment(ulonglong value);
+
+ virtual bool get_error_message(int error, String *buf);
+
+ uint8 table_cache_type();
+ /*
+ ask handler about permission to cache table during query registration
+ */
+ my_bool register_query_cache_table(THD *thd, char *table_key,
+ uint key_length,
+ qc_engine_callback *call_back,
+ ulonglong *engine_data);
+ static char *get_mysql_bin_log_name();
+ static ulonglong get_mysql_bin_log_pos();
+ bool primary_key_is_clustered();
+ int cmp_ref(const uchar *ref1, const uchar *ref2);
+ /** Fast index creation (smart ALTER TABLE) @see handler0alter.cc @{ */
+ int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys);
+ int prepare_drop_index(TABLE *table_arg, uint *key_num,
+ uint num_of_keys);
+ int final_drop_index(TABLE *table_arg);
+ /** @} */
+ bool check_if_incompatible_data(HA_CREATE_INFO *info,
+ uint table_changes);
+ bool check_if_supported_virtual_columns(void) { return TRUE; }
+public:
+ /**
+ * Multi Range Read interface
+ */
+ int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+ uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+ int multi_range_read_next(char **range_info);
+ ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+ void *seq_init_param,
+ uint n_ranges, uint *bufsz,
+ uint *flags, COST_VECT *cost);
+ ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+ uint *bufsz, uint *flags, COST_VECT *cost);
+ DsMrr_impl ds_mrr;
+
+ Item *idx_cond_push(uint keyno, Item* idx_cond);
+};
+
+/* Some accessor functions which the InnoDB plugin needs, but which
+can not be added to mysql/plugin.h as part of the public interface;
+the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+struct charset_info_st *thd_charset(MYSQL_THD thd);
+#if MYSQL_VERSION_ID >= 50142
+LEX_STRING *thd_query_string(MYSQL_THD thd);
+#else
+char **thd_query(MYSQL_THD thd);
+#endif
+
+/** Get the file name of the MySQL binlog.
+ * @return the name of the binlog file
+ */
+const char* mysql_bin_log_file_name(void);
+
+/** Get the current position of the MySQL binlog.
+ * @return byte offset from the beginning of the binlog
+ */
+ulonglong mysql_bin_log_file_pos(void);
+
+/**
+ Check if a user thread is a replication slave thread
+ @param thd user thread
+ @retval 0 the user thread is not a replication slave thread
+ @retval 1 the user thread is a replication slave thread
+*/
+int thd_slave_thread(const MYSQL_THD thd);
+
+/**
+ Check if a user thread is running a non-transactional update
+ @param thd user thread
+ @retval 0 the user thread is not running a non-transactional update
+ @retval 1 the user thread is running a non-transactional update
+*/
+int thd_non_transactional_update(const MYSQL_THD thd);
+
+/**
+ Get the user thread's binary logging format
+ @param thd user thread
+ @return Value to be used as index into the binlog_format_names array
+*/
+int thd_binlog_format(const MYSQL_THD thd);
+
+/**
+ Mark transaction to rollback and mark error as fatal to a sub-statement.
+ @param thd Thread handle
+ @param all TRUE <=> rollback main transaction.
+*/
+void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
+
+#if MYSQL_VERSION_ID > 50140
+/**
+ Check if binary logging is filtered for thread's current db.
+ @param thd Thread handle
+ @retval 1 the query is not filtered, 0 otherwise.
+*/
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+#endif /* MYSQL_VERSION_ID > 50140 */
+}
+
+typedef struct trx_struct trx_t;
+/********************************************************************//**
+@file handler/ha_innodb.h
+Converts an InnoDB error code to a MySQL error code and also tells to MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return MySQL error code */
+extern "C"
+int
+convert_error_code_to_mysql(
+/*========================*/
+ int error, /*!< in: InnoDB error code */
+ ulint flags, /*!< in: InnoDB table flags, or 0 */
+ MYSQL_THD thd); /*!< in: user thread handle or NULL */
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return InnoDB transaction handle */
+extern "C"
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+ MYSQL_THD thd); /*!< in: user thread handle */
+
+
+/*********************************************************************//**
+This function checks each index name for a table against the reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client
+and returns true. */
+extern "C"
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+ /* out: true if the index name
+ matches the reserved name */
+ const trx_t* trx, /* in: InnoDB transaction handle */
+ const KEY* key_info, /* in: Indexes to be created */
+ ulint num_of_keys); /* in: Number of indexes to
+ be created. */
+
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
new file mode 100644
index 00000000000..3a32ed9cf36
--- /dev/null
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -0,0 +1,1243 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/handler0alter.cc
+Smart ALTER TABLE
+*******************************************************/
+
+#include <mysql_priv.h>
+#include <mysqld_error.h>
+
+extern "C" {
+#include "log0log.h"
+#include "row0merge.h"
+#include "srv0srv.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "ha_prototypes.h"
+#include "handler0alter.h"
+}
+
+#include "ha_innodb.h"
+
+/*************************************************************//**
+Copies an InnoDB column to a MySQL field. This function is
+adapted from row_sel_field_store_in_mysql_format(). */
+static
+void
+innobase_col_to_mysql(
+/*==================*/
+ const dict_col_t* col, /*!< in: InnoDB column */
+ const uchar* data, /*!< in: InnoDB column data */
+ ulint len, /*!< in: length of data, in bytes */
+ Field* field) /*!< in/out: MySQL field */
+{
+ uchar* ptr;
+ uchar* dest = field->ptr;
+ ulint flen = field->pack_length();
+
+ switch (col->mtype) {
+ case DATA_INT:
+ ut_ad(len == flen);
+
+		/* Convert integer data from InnoDB's big-endian format to
+		little-endian, with the sign bit restored to normal */
+
+ for (ptr = dest + len; ptr != dest; ) {
+ *--ptr = *data++;
+ }
+
+ if (!(field->flags & UNSIGNED_FLAG)) {
+ ((byte*) dest)[len - 1] ^= 0x80;
+ }
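+
+		/* Example: the signed 32-bit value 1 is stored by InnoDB
+		as the big-endian bytes 80 00 00 01; the reversed copy
+		above yields 01 00 00 80, and the sign-bit flip restores
+		01 00 00 00, i.e. 1 in little-endian format. */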
+
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field->reset();
+
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, flen - field->key_length());
+ }
+
+ /* Copy the actual data */
+ memcpy(dest, data, len);
+ break;
+
+ case DATA_BLOB:
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, flen, data, len);
+ break;
+
+#ifdef UNIV_DEBUG
+ case DATA_MYSQL:
+ ut_ad(flen >= len);
+ ut_ad(col->mbmaxlen >= col->mbminlen);
+ ut_ad(col->mbmaxlen > col->mbminlen || flen == len);
+ memcpy(dest, data, len);
+ break;
+
+ default:
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Above are the valid column types for MySQL data. */
+ ut_ad(flen == len);
+#else /* UNIV_DEBUG */
+ default:
+#endif /* UNIV_DEBUG */
+ memcpy(dest, data, len);
+ }
+}
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+extern "C" UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+ TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets) /*!< in: rec_get_offsets(
+ rec, index, ...) */
+{
+ uint n_fields = table->s->fields;
+ uint i;
+
+ ut_ad(n_fields == dict_table_get_n_user_cols(index->table));
+
+ for (i = 0; i < n_fields; i++) {
+ Field* field = table->field[i];
+ ulint ipos;
+ ulint ilen;
+ const uchar* ifield;
+
+ field->reset();
+
+ ipos = dict_index_get_nth_col_pos(index, i);
+
+ if (UNIV_UNLIKELY(ipos == ULINT_UNDEFINED)) {
+null_field:
+ field->set_null();
+ continue;
+ }
+
+ ifield = rec_get_nth_field(rec, offsets, ipos, &ilen);
+
+ /* Assign the NULL flag */
+ if (ilen == UNIV_SQL_NULL) {
+ ut_ad(field->real_maybe_null());
+ goto null_field;
+ }
+
+ field->set_notnull();
+
+ innobase_col_to_mysql(
+ dict_field_get_col(
+ dict_index_get_nth_field(index, ipos)),
+ ifield, ilen, field);
+ }
+}
+
+/*************************************************************//**
+Resets table->record[0]. */
+extern "C" UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+ TABLE* table) /*!< in/out: MySQL table */
+{
+ uint n_fields = table->s->fields;
+ uint i;
+
+ for (i = 0; i < n_fields; i++) {
+ table->field[i]->set_default();
+ }
+}
+
+/******************************************************************//**
+Removes the filename encoding of a database and table name. */
+static
+void
+innobase_convert_tablename(
+/*=======================*/
+ char* s) /*!< in: identifier; out: decoded identifier */
+{
+ uint errors;
+
+ char* slash = strchr(s, '/');
+
+ if (slash) {
+ char* t;
+ /* Temporarily replace the '/' with NUL. */
+ *slash = 0;
+ /* Convert the database name. */
+ strconvert(&my_charset_filename, s, system_charset_info,
+ s, slash - s + 1, &errors);
+
+ t = s + strlen(s);
+ ut_ad(slash >= t);
+ /* Append a '.' after the database name. */
+ *t++ = '.';
+ slash++;
+ /* Convert the table name. */
+ strconvert(&my_charset_filename, slash, system_charset_info,
+ t, slash - t + strlen(slash), &errors);
+ } else {
+ strconvert(&my_charset_filename, s,
+ system_charset_info, s, strlen(s), &errors);
+ }
+}
+
+/*******************************************************************//**
+This function checks that index keys are sensible.
+@return 0 or error number */
+static
+int
+innobase_check_index_keys(
+/*======================*/
+ const KEY* key_info, /*!< in: Indexes to be
+ created */
+ ulint num_of_keys, /*!< in: Number of
+ indexes to be created */
+ const dict_table_t* table) /*!< in: Existing indexes */
+{
+ ulint key_num;
+
+ ut_ad(key_info);
+ ut_ad(num_of_keys);
+
+ for (key_num = 0; key_num < num_of_keys; key_num++) {
+ const KEY& key = key_info[key_num];
+
+ /* Check that the same index name does not appear
+ twice in indexes to be created. */
+
+ for (ulint i = 0; i < key_num; i++) {
+ const KEY& key2 = key_info[i];
+
+ if (0 == strcmp(key.name, key2.name)) {
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+ }
+
+ /* Check that the same index name does not already exist. */
+
+ for (const dict_index_t* index
+ = dict_table_get_first_index(table);
+ index; index = dict_table_get_next_index(index)) {
+
+ if (0 == strcmp(key.name, index->name)) {
+ my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+ key.name);
+
+ return(ER_WRONG_NAME_FOR_INDEX);
+ }
+ }
+
+ /* Check that MySQL does not try to create a column
+ prefix index field on an inappropriate data type and
+ that the same column does not appear twice in the index. */
+
+ for (ulint i = 0; i < key.key_parts; i++) {
+ const KEY_PART_INFO& key_part1
+ = key.key_part[i];
+ const Field* field
+ = key_part1.field;
+ ibool is_unsigned;
+
+ switch (get_innobase_type_from_mysql_type(
+ &is_unsigned, field)) {
+ default:
+ break;
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ if (field->type() == MYSQL_TYPE_VARCHAR) {
+ if (key_part1.length
+ >= field->pack_length()
+ - ((Field_varstring*) field)
+ ->length_bytes) {
+ break;
+ }
+ } else {
+ if (key_part1.length
+ >= field->pack_length()) {
+ break;
+ }
+ }
+
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ field->field_name);
+ return(ER_WRONG_KEY_COLUMN);
+ }
+
+ for (ulint j = 0; j < i; j++) {
+ const KEY_PART_INFO& key_part2
+ = key.key_part[j];
+
+ if (strcmp(key_part1.field->field_name,
+ key_part2.field->field_name)) {
+ continue;
+ }
+
+ my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+ key_part1.field->field_name);
+ return(ER_WRONG_KEY_COLUMN);
+ }
+ }
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Create index field definition for key part */
+static
+void
+innobase_create_index_field_def(
+/*============================*/
+ KEY_PART_INFO* key_part, /*!< in: MySQL key definition */
+ mem_heap_t* heap, /*!< in: memory heap */
+ merge_index_field_t* index_field) /*!< out: index field
+ definition for key_part */
+{
+ Field* field;
+ ibool is_unsigned;
+ ulint col_type;
+
+ DBUG_ENTER("innobase_create_index_field_def");
+
+ ut_ad(key_part);
+ ut_ad(index_field);
+
+ field = key_part->field;
+ ut_a(field);
+
+ col_type = get_innobase_type_from_mysql_type(&is_unsigned, field);
+
+ if (DATA_BLOB == col_type
+ || (key_part->length < field->pack_length()
+ && field->type() != MYSQL_TYPE_VARCHAR)
+ || (field->type() == MYSQL_TYPE_VARCHAR
+ && key_part->length < field->pack_length()
+ - ((Field_varstring*)field)->length_bytes)) {
+
+ index_field->prefix_len = key_part->length;
+ } else {
+ index_field->prefix_len = 0;
+ }
+
+ index_field->field_name = mem_heap_strdup(heap, field->field_name);
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Create index definition for key */
+static
+void
+innobase_create_index_def(
+/*======================*/
+ KEY* key, /*!< in: key definition */
+ bool new_primary, /*!< in: TRUE=generating
+ a new primary key
+ on the table */
+ bool key_primary, /*!< in: TRUE if this key
+ is a primary key */
+ merge_index_def_t* index, /*!< out: index definition */
+ mem_heap_t* heap) /*!< in: heap where memory
+ is allocated */
+{
+ ulint i;
+ ulint len;
+ ulint n_fields = key->key_parts;
+ char* index_name;
+
+ DBUG_ENTER("innobase_create_index_def");
+
+ index->fields = (merge_index_field_t*) mem_heap_alloc(
+ heap, n_fields * sizeof *index->fields);
+
+ index->ind_type = 0;
+ index->n_fields = n_fields;
+ len = strlen(key->name) + 1;
+ index->name = index_name = (char*) mem_heap_alloc(heap,
+ len + !new_primary);
+
+ if (UNIV_LIKELY(!new_primary)) {
+ *index_name++ = TEMP_INDEX_PREFIX;
+ }
+
+ memcpy(index_name, key->name, len);
+
+ if (key->flags & HA_NOSAME) {
+ index->ind_type |= DICT_UNIQUE;
+ }
+
+ if (key_primary) {
+ index->ind_type |= DICT_CLUSTERED;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ innobase_create_index_field_def(&key->key_part[i], heap,
+ &index->fields[i]);
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Copy index field definition */
+static
+void
+innobase_copy_index_field_def(
+/*==========================*/
+ const dict_field_t* field, /*!< in: definition to copy */
+ merge_index_field_t* index_field) /*!< out: copied definition */
+{
+ DBUG_ENTER("innobase_copy_index_field_def");
+ DBUG_ASSERT(field != NULL);
+ DBUG_ASSERT(index_field != NULL);
+
+ index_field->field_name = field->name;
+ index_field->prefix_len = field->prefix_len;
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Copy index definition for the index */
+static
+void
+innobase_copy_index_def(
+/*====================*/
+ const dict_index_t* index, /*!< in: index definition to copy */
+ merge_index_def_t* new_index,/*!< out: Index definition */
+ mem_heap_t* heap) /*!< in: heap where allocated */
+{
+ ulint n_fields;
+ ulint i;
+
+ DBUG_ENTER("innobase_copy_index_def");
+
+	/* Note that we take only those fields that the user defined to be
+	in the index. In the internal representation more columns were
+	added; those columns are not copied. */
+
+ n_fields = index->n_user_defined_cols;
+
+ new_index->fields = (merge_index_field_t*) mem_heap_alloc(
+ heap, n_fields * sizeof *new_index->fields);
+
+ /* When adding a PRIMARY KEY, we may convert a previous
+ clustered index to a secondary index (UNIQUE NOT NULL). */
+ new_index->ind_type = index->type & ~DICT_CLUSTERED;
+ new_index->n_fields = n_fields;
+ new_index->name = index->name;
+
+ for (i = 0; i < n_fields; i++) {
+ innobase_copy_index_field_def(&index->fields[i],
+ &new_index->fields[i]);
+ }
+
+ DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Create an index table where indexes are ordered as follows:
+
+IF a new primary key is defined for the table THEN
+
+ 1) New primary key
+ 2) Original secondary indexes
+ 3) New secondary indexes
+
+ELSE
+
+ 1) All new indexes in the order they arrive from MySQL
+
+ENDIF
+
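+For example, ALTER TABLE t ADD PRIMARY KEY(a), ADD INDEX(b) on a table
+that already has INDEX(c) produces the order: PRIMARY(a), c, b.
+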
+
+@return key definitions or NULL */
+static
+merge_index_def_t*
+innobase_create_key_def(
+/*====================*/
+ trx_t* trx, /*!< in: trx */
+ const dict_table_t*table, /*!< in: table definition */
+ mem_heap_t* heap, /*!< in: heap where space for key
+ definitions are allocated */
+ KEY* key_info, /*!< in: Indexes to be created */
+ ulint& n_keys) /*!< in/out: Number of indexes to
+ be created */
+{
+ ulint i = 0;
+ merge_index_def_t* indexdef;
+ merge_index_def_t* indexdefs;
+ bool new_primary;
+
+ DBUG_ENTER("innobase_create_key_def");
+
+ indexdef = indexdefs = (merge_index_def_t*)
+ mem_heap_alloc(heap, sizeof *indexdef
+ * (n_keys + UT_LIST_GET_LEN(table->indexes)));
+
+ /* If there is a primary key, it is always the first index
+ defined for the table. */
+
+ new_primary = !my_strcasecmp(system_charset_info,
+ key_info->name, "PRIMARY");
+
+ /* If there is a UNIQUE INDEX consisting entirely of NOT NULL
+ columns and if the index does not contain column prefix(es)
+ (only prefix/part of the column is indexed), MySQL will treat the
+ index as a PRIMARY KEY unless the table already has one. */
+
+ if (!new_primary && (key_info->flags & HA_NOSAME)
+ && (!(key_info->flags & HA_KEY_HAS_PART_KEY_SEG))
+ && row_table_got_default_clust_index(table)) {
+ uint key_part = key_info->key_parts;
+
+ new_primary = TRUE;
+
+ while (key_part--) {
+ if (key_info->key_part[key_part].key_type
+ & FIELDFLAG_MAYBE_NULL) {
+ new_primary = FALSE;
+ break;
+ }
+ }
+ }
+
+ if (new_primary) {
+ const dict_index_t* index;
+
+ /* Create the PRIMARY key index definition */
+ innobase_create_index_def(&key_info[i++], TRUE, TRUE,
+ indexdef++, heap);
+
+ row_mysql_lock_data_dictionary(trx);
+
+ index = dict_table_get_first_index(table);
+
+ /* Copy the index definitions of the old table. Skip
+ the old clustered index if it is a generated clustered
+ index or a PRIMARY KEY. If the clustered index is a
+ UNIQUE INDEX, it must be converted to a secondary index. */
+
+ if (dict_index_get_nth_col(index, 0)->mtype == DATA_SYS
+ || !my_strcasecmp(system_charset_info,
+ index->name, "PRIMARY")) {
+ index = dict_table_get_next_index(index);
+ }
+
+ while (index) {
+ innobase_copy_index_def(index, indexdef++, heap);
+ index = dict_table_get_next_index(index);
+ }
+
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ /* Create definitions for added secondary indexes. */
+
+ while (i < n_keys) {
+ innobase_create_index_def(&key_info[i++], new_primary, FALSE,
+ indexdef++, heap);
+ }
+
+ n_keys = indexdef - indexdefs;
+
+ DBUG_RETURN(indexdefs);
+}
+
+/*******************************************************************//**
+Create a temporary tablename by appending an identifier character to the table name.
+@return temporary tablename */
+static
+char*
+innobase_create_temporary_tablename(
+/*================================*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ char id, /*!< in: identifier [0-9a-zA-Z] */
+ const char* table_name) /*!< in: table name */
+{
+ char* name;
+ ulint len;
+ static const char suffix[] = "@0023 "; /* "# " */
+
+ len = strlen(table_name);
+
+ name = (char*) mem_heap_alloc(heap, len + sizeof suffix);
+ memcpy(name, table_name, len);
+ memcpy(name + len, suffix, sizeof suffix);
+ name[len + (sizeof suffix - 2)] = id;
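+
+	/* For example, table_name "test/t1" with id '1' yields
+	"test/t1@00231", i.e. "test/t1#1" after filename decoding. */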
+
+ return(name);
+}
+
+/*******************************************************************//**
+Create indexes.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::add_index(
+/*===================*/
+ TABLE* table, /*!< in: Table where indexes are created */
+ KEY* key_info, /*!< in: Indexes to be created */
+ uint num_of_keys) /*!< in: Number of indexes to be created */
+{
+ dict_index_t** index; /*!< Index to be created */
+ dict_table_t* innodb_table; /*!< InnoDB table in dictionary */
+ dict_table_t* indexed_table; /*!< Table where indexes are created */
+ merge_index_def_t* index_defs; /*!< Index definitions */
+ mem_heap_t* heap; /*!< Heap for index definitions */
+ trx_t* trx; /*!< Transaction */
+ ulint num_of_idx;
+ ulint num_created = 0;
+ ibool dict_locked = FALSE;
+ ulint new_primary;
+ int error;
+
+ DBUG_ENTER("ha_innobase::add_index");
+ ut_a(table);
+ ut_a(key_info);
+ ut_a(num_of_keys);
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
+
+ update_thd();
+
+ heap = mem_heap_create(1024);
+
+ /* In case MySQL calls this in the middle of a SELECT query, release
+ possible adaptive hash latch to avoid deadlocks of threads. */
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+ trx_start_if_not_started(prebuilt->trx);
+
+ /* Create a background transaction for the operations on
+ the data dictionary tables. */
+ trx = innobase_trx_allocate(user_thd);
+ trx_start_if_not_started(trx);
+
+ innodb_table = indexed_table
+ = dict_table_get(prebuilt->table->name, FALSE);
+
+ if (UNIV_UNLIKELY(!innodb_table)) {
+ error = HA_ERR_NO_SUCH_TABLE;
+ goto err_exit;
+ }
+
+ /* Check if the index name is reserved. */
+ if (innobase_index_name_is_reserved(trx, key_info, num_of_keys)) {
+ error = ER_WRONG_NAME_FOR_INDEX;
+ } else {
+ /* Check that index keys are sensible */
+ error = innobase_check_index_keys(key_info, num_of_keys,
+ innodb_table);
+ }
+
+ if (UNIV_UNLIKELY(error)) {
+err_exit:
+ mem_heap_free(heap);
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx_free_for_mysql(trx);
+ trx_commit_for_mysql(prebuilt->trx);
+ DBUG_RETURN(error);
+ }
+
+	/* Build the full, ordered set of index definitions to be created
+	by this ALTER TABLE ... ADD INDEX, so that the indexes end up in
+	the correct order in the table. */
+
+ num_of_idx = num_of_keys;
+
+ index_defs = innobase_create_key_def(
+ trx, innodb_table, heap, key_info, num_of_idx);
+
+ new_primary = DICT_CLUSTERED & index_defs[0].ind_type;
+
+ /* Allocate memory for dictionary index definitions */
+
+ index = (dict_index_t**) mem_heap_alloc(
+ heap, num_of_idx * sizeof *index);
+
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ /* Acquire a lock on the table before creating any indexes. */
+ error = row_merge_lock_table(prebuilt->trx, innodb_table,
+ new_primary ? LOCK_X : LOCK_S);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+ goto error_handling;
+ }
+
+ /* Latch the InnoDB data dictionary exclusively so that no deadlocks
+ or lock waits can happen in it during an index create operation. */
+
+ row_mysql_lock_data_dictionary(trx);
+ dict_locked = TRUE;
+
+ ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+
+ /* If a new primary key is defined for the table we need
+ to drop the original table and rebuild all indexes. */
+
+ if (UNIV_UNLIKELY(new_primary)) {
+ /* This transaction should be the only one
+ operating on the table. */
+ ut_a(innodb_table->n_mysql_handles_opened == 1);
+
+ char* new_table_name = innobase_create_temporary_tablename(
+ heap, '1', innodb_table->name);
+
+ /* Clone the table. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ indexed_table = row_merge_create_temporary_table(
+ new_table_name, index_defs, innodb_table, trx);
+
+ if (!indexed_table) {
+
+ switch (trx->error_state) {
+ case DB_TABLESPACE_ALREADY_EXISTS:
+ case DB_DUPLICATE_KEY:
+ innobase_convert_tablename(new_table_name);
+ my_error(HA_ERR_TABLE_EXIST, MYF(0),
+ new_table_name);
+ error = HA_ERR_TABLE_EXIST;
+ break;
+ default:
+ error = convert_error_code_to_mysql(
+ trx->error_state, innodb_table->flags,
+ user_thd);
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(innodb_table,
+ FALSE));
+ row_mysql_unlock_data_dictionary(trx);
+ goto err_exit;
+ }
+
+ trx->table_id = indexed_table->id;
+ }
+
+ /* Create the indexes in SYS_INDEXES and load into dictionary. */
+
+ for (ulint i = 0; i < num_of_idx; i++) {
+
+ index[i] = row_merge_create_index(trx, indexed_table,
+ &index_defs[i]);
+
+ if (!index[i]) {
+ error = trx->error_state;
+ goto error_handling;
+ }
+
+ num_created++;
+ }
+
+ ut_ad(error == DB_SUCCESS);
+
+	/* We will need to rebuild the index translation table. Set the
+	valid index entry count in the translation table to zero. */
+ share->idx_trans_tbl.index_count = 0;
+
+ /* Commit the data dictionary transaction in order to release
+ the table locks on the system tables. This means that if
+ MySQL crashes while creating a new primary key inside
+ row_merge_build_indexes(), indexed_table will not be dropped
+ by trx_rollback_active(). It will have to be recovered or
+ dropped by the database administrator. */
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+ dict_locked = FALSE;
+
+ ut_a(trx->n_active_thrs == 0);
+ ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+
+ if (UNIV_UNLIKELY(new_primary)) {
+ /* A primary key is to be built. Acquire an exclusive
+ table lock also on the table that is being created. */
+ ut_ad(indexed_table != innodb_table);
+
+ error = row_merge_lock_table(prebuilt->trx, indexed_table,
+ LOCK_X);
+
+ if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+ goto error_handling;
+ }
+ }
+
+ /* Read the clustered index of the table and build indexes
+ based on this information using temporary files and merge sort. */
+ error = row_merge_build_indexes(prebuilt->trx,
+ innodb_table, indexed_table,
+ index, num_of_idx, table);
+
+error_handling:
+ /* After an error, remove all those index definitions from the
+ dictionary which were defined. */
+
+ switch (error) {
+ const char* old_name;
+ char* tmp_name;
+ case DB_SUCCESS:
+ ut_a(!dict_locked);
+ row_mysql_lock_data_dictionary(trx);
+ dict_locked = TRUE;
+
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+
+ if (!new_primary) {
+ error = row_merge_rename_indexes(trx, indexed_table);
+
+ if (error != DB_SUCCESS) {
+ row_merge_drop_indexes(trx, indexed_table,
+ index, num_created);
+ }
+
+ goto convert_error;
+ }
+
+ /* If a new primary key was defined for the table and
+ there was no error at this point, we can now rename
+ the old table as a temporary table, rename the new
+ temporary table as the old table and drop the old table. */
+ old_name = innodb_table->name;
+ tmp_name = innobase_create_temporary_tablename(heap, '2',
+ old_name);
+
+ error = row_merge_rename_tables(innodb_table, indexed_table,
+ tmp_name, trx);
+
+ if (error != DB_SUCCESS) {
+
+ row_merge_drop_table(trx, indexed_table);
+
+ switch (error) {
+ case DB_TABLESPACE_ALREADY_EXISTS:
+ case DB_DUPLICATE_KEY:
+ innobase_convert_tablename(tmp_name);
+ my_error(HA_ERR_TABLE_EXIST, MYF(0), tmp_name);
+ error = HA_ERR_TABLE_EXIST;
+ break;
+ default:
+ goto convert_error;
+ }
+ break;
+ }
+
+ trx_commit_for_mysql(prebuilt->trx);
+ row_prebuilt_free(prebuilt, TRUE);
+ prebuilt = row_create_prebuilt(indexed_table);
+
+ indexed_table->n_mysql_handles_opened++;
+
+ error = row_merge_drop_table(trx, innodb_table);
+ innodb_table = indexed_table;
+ goto convert_error;
+
+ case DB_TOO_BIG_RECORD:
+ my_error(HA_ERR_TO_BIG_ROW, MYF(0));
+ goto error;
+ case DB_PRIMARY_KEY_IS_NULL:
+ my_error(ER_PRIMARY_CANT_HAVE_NULL, MYF(0));
+ /* fall through */
+ case DB_DUPLICATE_KEY:
+error:
+ prebuilt->trx->error_info = NULL;
+ /* fall through */
+ default:
+ trx->error_state = DB_SUCCESS;
+
+ if (new_primary) {
+ if (indexed_table != innodb_table) {
+ row_merge_drop_table(trx, indexed_table);
+ }
+ } else {
+ if (!dict_locked) {
+ row_mysql_lock_data_dictionary(trx);
+ dict_locked = TRUE;
+ }
+
+ row_merge_drop_indexes(trx, indexed_table,
+ index, num_created);
+ }
+
+convert_error:
+ error = convert_error_code_to_mysql(error,
+ innodb_table->flags,
+ user_thd);
+ }
+
+ mem_heap_free(heap);
+ trx_commit_for_mysql(trx);
+ if (prebuilt->trx) {
+ trx_commit_for_mysql(prebuilt->trx);
+ }
+
+ if (dict_locked) {
+ ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ trx_free_for_mysql(trx);
+
+	/* There might be work for utility threads. */
+ srv_active_wake_master_thread();
+
+ DBUG_RETURN(error);
+}
+
+/*******************************************************************//**
+Prepare to drop some indexes of a table.
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::prepare_drop_index(
+/*============================*/
+ TABLE* table, /*!< in: Table where indexes are dropped */
+ uint* key_num, /*!< in: Key nums to be dropped */
+ uint num_of_keys) /*!< in: Number of keys to be dropped */
+{
+ trx_t* trx;
+ int err = 0;
+ uint n_key;
+
+ DBUG_ENTER("ha_innobase::prepare_drop_index");
+ ut_ad(table);
+ ut_ad(key_num);
+ ut_ad(num_of_keys);
+ if (srv_created_new_raw || srv_force_recovery) {
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
+
+ update_thd();
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+ trx = prebuilt->trx;
+
+ /* Test and mark all the indexes to be dropped */
+
+ row_mysql_lock_data_dictionary(trx);
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+
+ /* Check that none of the indexes have previously been flagged
+ for deletion. */
+ {
+ const dict_index_t* index
+ = dict_table_get_first_index(prebuilt->table);
+ do {
+ ut_a(!index->to_be_dropped);
+ index = dict_table_get_next_index(index);
+ } while (index);
+ }
+
+ for (n_key = 0; n_key < num_of_keys; n_key++) {
+ const KEY* key;
+ dict_index_t* index;
+
+ key = table->key_info + key_num[n_key];
+ index = dict_table_get_index_on_name_and_min_id(
+ prebuilt->table, key->name);
+
+ if (!index) {
+ sql_print_error("InnoDB could not find key n:o %u "
+ "with name %s for table %s",
+ key_num[n_key],
+ key ? key->name : "NULL",
+ prebuilt->table->name);
+
+ err = HA_ERR_KEY_NOT_FOUND;
+ goto func_exit;
+ }
+
+ /* Refuse to drop the clustered index. It would be
+ better to automatically generate a clustered index,
+ but mysql_alter_table() will call this method only
+ after ha_innobase::add_index(). */
+
+ if (dict_index_is_clust(index)) {
+ my_error(ER_REQUIRES_PRIMARY_KEY, MYF(0));
+ err = -1;
+ goto func_exit;
+ }
+
+ index->to_be_dropped = TRUE;
+ }
+
+	/* If FOREIGN_KEY_CHECKS = 1 you may not drop an index defined
+	for a foreign key constraint because InnoDB requires that both
+	tables contain indexes for the constraint. Note that CREATE
+	INDEX id ON table performs a CREATE INDEX followed by a DROP
+	INDEX, so here we can ignore foreign keys because a new index
+	usable by the foreign key has already been created.
+
+ We check for the foreign key constraints after marking the
+ candidate indexes for deletion, because when we check for an
+ equivalent foreign index we don't want to select an index that
+ is later deleted. */
+
+ if (trx->check_foreigns
+ && thd_sql_command(user_thd) != SQLCOM_CREATE_INDEX) {
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ dict_foreign_t* foreign;
+
+ if (!index->to_be_dropped) {
+
+ continue;
+ }
+
+ /* Check if the index is referenced. */
+ foreign = dict_table_get_referenced_constraint(
+ prebuilt->table, index);
+
+ if (foreign) {
+index_needed:
+ trx_set_detailed_error(
+ trx,
+ "Index needed in foreign key "
+ "constraint");
+
+ trx->error_info = index;
+
+ err = HA_ERR_DROP_INDEX_FK;
+ break;
+ } else {
+ /* Check if this index references some
+ other table */
+ foreign = dict_table_get_foreign_constraint(
+ prebuilt->table, index);
+
+ if (foreign) {
+ ut_a(foreign->foreign_index == index);
+
+ /* Search for an equivalent index that
+ the foreign key constraint could use
+ if this index were to be deleted. */
+ if (!dict_foreign_find_equiv_index(
+ foreign)) {
+
+ goto index_needed;
+ }
+ }
+ }
+ }
+ } else if (thd_sql_command(user_thd) == SQLCOM_CREATE_INDEX) {
+ /* This is a drop of a foreign key constraint index that
+ was created by MySQL when the constraint was added. MySQL
+ does this when the user creates an index explicitly which
+ can be used in place of the automatically generated index. */
+
+ dict_index_t* index;
+
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index;
+ index = dict_table_get_next_index(index)) {
+ dict_foreign_t* foreign;
+
+ if (!index->to_be_dropped) {
+
+ continue;
+ }
+
+ /* Check if this index references some other table */
+ foreign = dict_table_get_foreign_constraint(
+ prebuilt->table, index);
+
+ if (foreign == NULL) {
+
+ continue;
+ }
+
+ ut_a(foreign->foreign_index == index);
+
+ /* Search for an equivalent index that the
+ foreign key constraint could use if this index
+ were to be deleted. */
+
+ if (!dict_foreign_find_equiv_index(foreign)) {
+ trx_set_detailed_error(
+ trx,
+ "Index needed in foreign key "
+ "constraint");
+
+ trx->error_info = foreign->foreign_index;
+
+ err = HA_ERR_DROP_INDEX_FK;
+ break;
+ }
+ }
+ }
+
+func_exit:
+ if (err) {
+ /* Undo our changes since there was some sort of error. */
+ dict_index_t* index
+ = dict_table_get_first_index(prebuilt->table);
+
+ do {
+ index->to_be_dropped = FALSE;
+ index = dict_table_get_next_index(index);
+ } while (index);
+ }
+
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+ row_mysql_unlock_data_dictionary(trx);
+
+ DBUG_RETURN(err);
+}
+
+/*******************************************************************//**
+Drop the indexes that were passed to a successful prepare_drop_index().
+@return 0 or error number */
+UNIV_INTERN
+int
+ha_innobase::final_drop_index(
+/*==========================*/
+ TABLE* table) /*!< in: Table where indexes are dropped */
+{
+ dict_index_t* index; /*!< Index to be dropped */
+ trx_t* trx; /*!< Transaction */
+ int err;
+
+ DBUG_ENTER("ha_innobase::final_drop_index");
+ ut_ad(table);
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+ }
+
+ update_thd();
+
+ trx_search_latch_release_if_reserved(prebuilt->trx);
+ trx_start_if_not_started(prebuilt->trx);
+
+ /* Create a background transaction for the operations on
+ the data dictionary tables. */
+ trx = innobase_trx_allocate(user_thd);
+ trx_start_if_not_started(trx);
+
+ /* Flag this transaction as a dictionary operation, so that
+ the data dictionary will be locked in crash recovery. */
+ trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+ /* Lock the table exclusively, to ensure that no active
+ transaction depends on an index that is being dropped. */
+ err = convert_error_code_to_mysql(
+ row_merge_lock_table(prebuilt->trx, prebuilt->table, LOCK_X),
+ prebuilt->table->flags, user_thd);
+
+ row_mysql_lock_data_dictionary(trx);
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+
+ if (UNIV_UNLIKELY(err)) {
+
+ /* Unmark the indexes to be dropped. */
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index; index = dict_table_get_next_index(index)) {
+
+ index->to_be_dropped = FALSE;
+ }
+
+ goto func_exit;
+ }
+
+ /* Drop indexes marked to be dropped */
+
+ index = dict_table_get_first_index(prebuilt->table);
+
+ while (index) {
+ dict_index_t* next_index;
+
+ next_index = dict_table_get_next_index(index);
+
+ if (index->to_be_dropped) {
+
+ row_merge_drop_index(index, prebuilt->table, trx);
+ }
+
+ index = next_index;
+ }
+
+ /* Check that all flagged indexes were dropped. */
+ for (index = dict_table_get_first_index(prebuilt->table);
+ index; index = dict_table_get_next_index(index)) {
+ ut_a(!index->to_be_dropped);
+ }
+
+	/* We will need to rebuild the index translation table. Set the
+	valid index entry count in the translation table to zero. */
+ share->idx_trans_tbl.index_count = 0;
+
+func_exit:
+ ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+ trx_commit_for_mysql(trx);
+ trx_commit_for_mysql(prebuilt->trx);
+ row_mysql_unlock_data_dictionary(trx);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ trx_free_for_mysql(trx);
+
+ /* Tell the InnoDB server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ DBUG_RETURN(err);
+}
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
new file mode 100644
index 00000000000..0f656528315
--- /dev/null
+++ b/storage/xtradb/handler/i_s.cc
@@ -0,0 +1,4516 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.cc
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+*******************************************************/
+
+#include <mysql_priv.h>
+#include <mysqld_error.h>
+
+#include <m_ctype.h>
+#include <hash.h>
+#include <myisampack.h>
+#include <mysys_err.h>
+#include <my_sys.h>
+#include "i_s.h"
+#include "innodb_patch_info.h"
+#include <mysql/plugin.h>
+
+extern "C" {
+#include "trx0i_s.h"
+#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
+#include "buf0buddy.h" /* for i_s_cmpmem */
+#include "buf0buf.h" /* for buf_pool and PAGE_ZIP_MIN_SIZE */
+#include "ha_prototypes.h" /* for innobase_convert_name() */
+#include "srv0start.h" /* for srv_was_started */
+#include "btr0btr.h" /* for btr_page_get_index_id */
+#include "trx0rseg.h" /* for trx_rseg_struct */
+#include "trx0sys.h" /* for trx_sys */
+#include "dict0dict.h" /* for dict_sys */
+#include "btr0pcur.h"
+#include "buf0lru.h" /* for XTRA_LRU_[DUMP/RESTORE] */
+}
+
+static const char plugin_author[] = "Innobase Oy";
+
+#define OK(expr) \
+ if ((expr) != 0) { \
+ DBUG_RETURN(1); \
+ }
+
+#define RETURN_IF_INNODB_NOT_STARTED(plugin_name) \
+do { \
+ if (!srv_was_started) { \
+ push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN, \
+ ER_CANT_FIND_SYSTEM_REC, \
+ "InnoDB: SELECTing from " \
+ "INFORMATION_SCHEMA.%s but " \
+ "the InnoDB storage engine " \
+ "is not installed", plugin_name); \
+ DBUG_RETURN(0); \
+ } \
+} while (0)
+
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER
+#define STRUCT_FLD(name, value) name: value
+#else
+#define STRUCT_FLD(name, value) value
+#endif
+
+/* Don't use a static const variable here, as some C++ compilers (notably
+HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
+#define END_OF_ST_FIELD_INFO \
+ {STRUCT_FLD(field_name, NULL), \
+ STRUCT_FLD(field_length, 0), \
+ STRUCT_FLD(field_type, MYSQL_TYPE_NULL), \
+ STRUCT_FLD(value, 0), \
+ STRUCT_FLD(field_flags, 0), \
+ STRUCT_FLD(old_name, ""), \
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)}
+
+/*
+Use the following types mapping:
+
+C type ST_FIELD_INFO::field_type
+---------------------------------
+long MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS)
+
+long unsigned MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+char* MYSQL_TYPE_STRING
+(field_length=n)
+
+float MYSQL_TYPE_FLOAT
+(field_length=0 is ignored)
+
+void* MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+boolean (if else) MYSQL_TYPE_LONG
+(field_length=1)
+
+time_t MYSQL_TYPE_DATETIME
+(field_length=0 ignored)
+---------------------------------
+*/
+
+/* XXX these are defined in mysql_priv.h inside #ifdef MYSQL_SERVER */
+bool schema_table_store_record(THD *thd, TABLE *table);
+void localtime_to_TIME(MYSQL_TIME *to, struct tm *from);
+bool check_global_access(THD *thd, ulong want_access);
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond); /*!< in: condition (not used) */
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return 0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+ void* p); /*!< in/out: table schema object */
+
+/*******************************************************************//**
+Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME
+field.
+@return 0 on success */
+static
+int
+field_store_time_t(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ time_t time) /*!< in: value to store */
+{
+ MYSQL_TIME my_time;
+ struct tm tm_time;
+
+#if 0
+ /* use this if you are sure that `variables' and `time_zone'
+ are always initialized */
+ thd->variables.time_zone->gmt_sec_to_TIME(
+ &my_time, (my_time_t) time);
+#else
+ localtime_r(&time, &tm_time);
+ localtime_to_TIME(&my_time, &tm_time);
+ my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+#endif
+
+ return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME));
+}
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+@return 0 on success */
+static
+int
+field_store_string(
+/*===============*/
+ Field* field, /*!< in/out: target field for storage */
+ const char* str) /*!< in: NUL-terminated utf-8 string,
+ or NULL */
+{
+ int ret;
+
+ if (str != NULL) {
+
+ ret = field->store(str, strlen(str),
+ system_charset_info);
+ field->set_notnull();
+ } else {
+
+ ret = 0; /* success */
+ field->set_null();
+ }
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
+If the value is ULINT_UNDEFINED then the field is set to NULL.
+@return 0 on success */
+static
+int
+field_store_ulint(
+/*==============*/
+ Field* field, /*!< in/out: target field for storage */
+ ulint n) /*!< in: value to store */
+{
+ int ret;
+
+ if (n != ULINT_UNDEFINED) {
+
+ ret = field->store(n);
+ field->set_notnull();
+ } else {
+
+ ret = 0; /* success */
+ field->set_null();
+ }
+
+ return(ret);
+}
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_patches */
+static ST_FIELD_INFO innodb_patches_fields_info[] =
+{
+#define IDX_PATCH_NAME 0
+ {STRUCT_FLD(field_name, "name"),
+ STRUCT_FLD(field_length, 255),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_PATCH_DESCR 1
+ {STRUCT_FLD(field_name, "description"),
+ STRUCT_FLD(field_length, 255),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_PATCH_COMMENT 2
+ {STRUCT_FLD(field_name, "comment"),
+ STRUCT_FLD(field_length, 100),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_PATCH_LINK 3
+ {STRUCT_FLD(field_name, "link"),
+ STRUCT_FLD(field_length, 255),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static struct st_mysql_information_schema i_s_info =
+{
+ MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
+};
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_patches */
+static
+int
+innodb_patches_fill(
+/*=============*/
+ /* out: 0 on success, 1 on failure */
+ THD* thd, /* in: thread */
+ TABLE_LIST* tables, /* in/out: tables to fill */
+ COND* cond) /* in: condition (ignored) */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+ int i;
+ Field** fields;
+
+
+ DBUG_ENTER("innodb_patches_fill");
+ fields = table->field;
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ for (i = 0; innodb_enhancements[i].file; i++) {
+
+ field_store_string(fields[0],innodb_enhancements[i].file);
+ field_store_string(fields[1],innodb_enhancements[i].name);
+ field_store_string(fields[2],innodb_enhancements[i].comment);
+ field_store_string(fields[3],innodb_enhancements[i].link);
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+
+ }
+
+
+ DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Bind the dynamic table information_schema.innodb_patches. */
+static
+int
+innodb_patches_init(
+/*=========*/
+ /* out: 0 on success */
+ void* p) /* in/out: table schema object */
+{
+ DBUG_ENTER("innodb_patches_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_patches_fields_info;
+ schema->fill_table = innodb_patches_fill;
+
+ DBUG_RETURN(0);
+}
+
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_patches =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "XTRADB_ENHANCEMENTS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Percona"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Enhancements applied to InnoDB plugin"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_patches_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_patches_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "XTRADB_ENHANCEMENTS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Percona"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Enhancements applied to InnoDB plugin"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_patches_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+
+static ST_FIELD_INFO i_s_innodb_buffer_pool_pages_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "page_type"),
+ STRUCT_FLD(field_length, 64),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "space_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "page_no"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "lru_position"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "fix_count"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "flush_type"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static ST_FIELD_INFO i_s_innodb_buffer_pool_pages_index_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "index_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "space_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "page_no"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "n_recs"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "data_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "hashed"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "access_time"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "modified"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "dirty"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "old"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "lru_position"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "fix_count"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "flush_type"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static ST_FIELD_INFO i_s_innodb_buffer_pool_pages_blob_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "space_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "page_no"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compressed"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "part_len"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "next_page_no"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "lru_position"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "fix_count"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "flush_type"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_buffer_pool_pages. */
+static
+int
+i_s_innodb_buffer_pool_pages_fill(
+/*================*/
+ /* out: 0 on success, 1 on failure */
+ THD* thd, /* in: thread */
+ TABLE_LIST* tables, /* in/out: tables to fill */
+ COND* cond) /* in: condition (ignored) */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+
+ ulint n_chunks, n_blocks;
+
+ buf_chunk_t* chunk;
+
+ DBUG_ENTER("i_s_innodb_buffer_pool_pages_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ buf_pool_mutex_enter();
+
+ chunk = buf_pool->chunks;
+
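+	/* walk every chunk of the buffer pool and every block
+	descriptor in each chunk; the buffer pool mutex is held,
+	so the set of blocks cannot change while we scan */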
+ for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+
+ for (n_blocks = chunk->size; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ char page_type[64];
+
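+			/* map the FIL_PAGE_* type of the frame to a readable name */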
+			switch (fil_page_get_type(frame)) {
+ case FIL_PAGE_INDEX:
+ strcpy(page_type, "index");
+ break;
+ case FIL_PAGE_UNDO_LOG:
+ strcpy(page_type, "undo_log");
+ break;
+ case FIL_PAGE_INODE:
+ strcpy(page_type, "inode");
+ break;
+ case FIL_PAGE_IBUF_FREE_LIST:
+ strcpy(page_type, "ibuf_free_list");
+ break;
+ case FIL_PAGE_TYPE_ALLOCATED:
+ strcpy(page_type, "allocated");
+ break;
+ case FIL_PAGE_IBUF_BITMAP:
+ strcpy(page_type, "bitmap");
+ break;
+ case FIL_PAGE_TYPE_SYS:
+ strcpy(page_type, "sys");
+ break;
+ case FIL_PAGE_TYPE_TRX_SYS:
+ strcpy(page_type, "trx_sys");
+ break;
+ case FIL_PAGE_TYPE_FSP_HDR:
+ strcpy(page_type, "fsp_hdr");
+ break;
+ case FIL_PAGE_TYPE_XDES:
+ strcpy(page_type, "xdes");
+ break;
+ case FIL_PAGE_TYPE_BLOB:
+ strcpy(page_type, "blob");
+ break;
+ case FIL_PAGE_TYPE_ZBLOB:
+ strcpy(page_type, "zblob");
+ break;
+ case FIL_PAGE_TYPE_ZBLOB2:
+ strcpy(page_type, "zblob2");
+ break;
+ default:
+				sprintf(page_type, "unknown (type=%lu)", (ulong) fil_page_get_type(frame));
+ }
+
+ field_store_string(table->field[0], page_type);
+ table->field[1]->store(block->page.space);
+ table->field[2]->store(block->page.offset);
+ table->field[3]->store(0);
+ table->field[4]->store(block->page.buf_fix_count);
+ table->field[5]->store(block->page.flush_type);
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+
+ }
+ }
+
+ buf_pool_mutex_exit();
+
+ DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_buffer_pool_pages_index. */
+static
+int
+i_s_innodb_buffer_pool_pages_index_fill(
+/*================*/
+ /* out: 0 on success, 1 on failure */
+ THD* thd, /* in: thread */
+ TABLE_LIST* tables, /* in/out: tables to fill */
+ COND* cond) /* in: condition (ignored) */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+
+ ulint n_chunks, n_blocks;
+ dulint index_id;
+
+ buf_chunk_t* chunk;
+
+ DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ buf_pool_mutex_enter();
+
+ chunk = buf_pool->chunks;
+
+ for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+
+ for (n_blocks = chunk->size; n_blocks--; block++) {
+ const buf_frame_t* frame = block->frame;
+
+ if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
+ index_id = btr_page_get_index_id(frame);
+ table->field[0]->store(ut_conv_dulint_to_longlong(index_id));
+ table->field[1]->store(block->page.space);
+ table->field[2]->store(block->page.offset);
+ table->field[3]->store(page_get_n_recs(frame));
+ table->field[4]->store(page_get_data_size(frame));
+ table->field[5]->store(block->is_hashed);
+ table->field[6]->store(block->page.access_time);
+ table->field[7]->store(block->page.newest_modification != 0);
+ table->field[8]->store(block->page.oldest_modification != 0);
+ table->field[9]->store(block->page.old);
+ table->field[10]->store(0);
+ table->field[11]->store(block->page.buf_fix_count);
+ table->field[12]->store(block->page.flush_type);
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ buf_pool_mutex_exit();
+
+ DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_buffer_pool_pages_blob. */
+static
+int
+i_s_innodb_buffer_pool_pages_blob_fill(
+/*================*/
+ /* out: 0 on success, 1 on failure */
+ THD* thd, /* in: thread */
+ TABLE_LIST* tables, /* in/out: tables to fill */
+ COND* cond) /* in: condition (ignored) */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+
+ ulint n_chunks, n_blocks;
+ buf_chunk_t* chunk;
+ page_zip_des_t* block_page_zip;
+
+ ulint part_len;
+ ulint next_page_no;
+
+ DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ buf_pool_mutex_enter();
+
+ chunk = buf_pool->chunks;
+
+ for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
+ buf_block_t* block = chunk->blocks;
+
+		for (n_blocks = chunk->size; n_blocks--; block++) {
+			const buf_frame_t* frame = block->frame;
+			/* fetch the compressed page descriptor of this
+			block; it is NULL if the page is not compressed */
+			block_page_zip = buf_block_get_page_zip(block);
+
+ if (fil_page_get_type(frame) == FIL_PAGE_TYPE_BLOB) {
+
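+				/* for uncompressed BLOB pages the part length and the
+				next page number are read from the BLOB header that
+				follows FIL_PAGE_DATA; for compressed pages only the
+				next page number (FIL_PAGE_NEXT) is available */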
+ if (UNIV_LIKELY_NULL(block_page_zip)) {
+					part_len = 0; /* part length is not available here for compressed pages */
+
+ next_page_no = mach_read_from_4(
+ buf_block_get_frame(block)
+ + FIL_PAGE_NEXT);
+ } else {
+ part_len = mach_read_from_4(
+ buf_block_get_frame(block)
+ + FIL_PAGE_DATA
+ + 0 /*BTR_BLOB_HDR_PART_LEN*/);
+
+ next_page_no = mach_read_from_4(
+ buf_block_get_frame(block)
+ + FIL_PAGE_DATA
+ + 4 /*BTR_BLOB_HDR_NEXT_PAGE_NO*/);
+ }
+
+ table->field[0]->store(block->page.space);
+ table->field[1]->store(block->page.offset);
+ table->field[2]->store(block_page_zip != NULL);
+ table->field[3]->store(part_len);
+
+				if (next_page_no == FIL_NULL) {
+					table->field[4]->store(0);
+				} else {
+					table->field[4]->store(next_page_no);
+				}
+
+ table->field[5]->store(0);
+ table->field[6]->store(block->page.buf_fix_count);
+ table->field[7]->store(block->page.flush_type);
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+
+ }
+ }
+ }
+
+ buf_pool_mutex_exit();
+
+ DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Bind the dynamic table information_schema.innodb_buffer_pool_pages. */
+static
+int
+i_s_innodb_buffer_pool_pages_init(
+/*=========*/
+ /* out: 0 on success */
+ void* p) /* in/out: table schema object */
+{
+ DBUG_ENTER("i_s_innodb_buffer_pool_pages_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_buffer_pool_pages_fields_info;
+ schema->fill_table = i_s_innodb_buffer_pool_pages_fill;
+
+ DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Bind the dynamic table information_schema.innodb_buffer_pool_pages_index. */
+static
+int
+i_s_innodb_buffer_pool_pages_index_init(
+/*=========*/
+ /* out: 0 on success */
+ void* p) /* in/out: table schema object */
+{
+ DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_buffer_pool_pages_index_fields_info;
+ schema->fill_table = i_s_innodb_buffer_pool_pages_index_fill;
+
+ DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Bind the dynamic table information_schema.innodb_buffer_pool_pages_blob. */
+static
+int
+i_s_innodb_buffer_pool_pages_blob_init(
+/*=========*/
+ /* out: 0 on success */
+ void* p) /* in/out: table schema object */
+{
+ DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_buffer_pool_pages_blob_fields_info;
+ schema->fill_table = i_s_innodb_buffer_pool_pages_blob_fill;
+
+ DBUG_RETURN(0);
+}
+
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB buffer pool pages"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB buffer pool pages"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_index =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_INDEX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB buffer pool index pages"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_index_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_index_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_INDEX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB buffer pool index pages"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_index_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_buffer_pool_pages_blob =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_BLOB"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB buffer pool blob pages"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_blob_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_BLOB"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB buffer pool blob pages"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_blob_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx */
+static ST_FIELD_INFO innodb_trx_fields_info[] =
+{
+#define IDX_TRX_ID 0
+ {STRUCT_FLD(field_name, "trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_STATE 1
+ {STRUCT_FLD(field_name, "trx_state"),
+ STRUCT_FLD(field_length, TRX_QUE_STATE_STR_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_STARTED 2
+ {STRUCT_FLD(field_name, "trx_started"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_REQUESTED_LOCK_ID 3
+ {STRUCT_FLD(field_name, "trx_requested_lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_WAIT_STARTED 4
+ {STRUCT_FLD(field_name, "trx_wait_started"),
+ STRUCT_FLD(field_length, 0),
+ STRUCT_FLD(field_type, MYSQL_TYPE_DATETIME),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_WEIGHT 5
+ {STRUCT_FLD(field_name, "trx_weight"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_MYSQL_THREAD_ID 6
+ {STRUCT_FLD(field_name, "trx_mysql_thread_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_QUERY 7
+ {STRUCT_FLD(field_name, "trx_query"),
+ STRUCT_FLD(field_length, TRX_I_S_TRX_QUERY_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_trx
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_trx_from_cache(
+/*=======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_trx_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_TRX);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_trx_row_t* row;
+ char trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_trx_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_TRX, i);
+
+ /* trx_id */
+ ut_snprintf(trx_id, sizeof(trx_id), TRX_ID_FMT, row->trx_id);
+ OK(field_store_string(fields[IDX_TRX_ID], trx_id));
+
+ /* trx_state */
+ OK(field_store_string(fields[IDX_TRX_STATE],
+ row->trx_state));
+
+ /* trx_started */
+ OK(field_store_time_t(fields[IDX_TRX_STARTED],
+ (time_t) row->trx_started));
+
+ /* trx_requested_lock_id */
+ /* trx_wait_started */
+ if (row->trx_wait_started != 0) {
+
+ OK(field_store_string(
+ fields[IDX_TRX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ lock_id, sizeof(lock_id))));
+			/* field_store_string() already sets the field to not-null */
+
+ OK(field_store_time_t(
+ fields[IDX_TRX_WAIT_STARTED],
+ (time_t) row->trx_wait_started));
+ fields[IDX_TRX_WAIT_STARTED]->set_notnull();
+ } else {
+
+ fields[IDX_TRX_REQUESTED_LOCK_ID]->set_null();
+ fields[IDX_TRX_WAIT_STARTED]->set_null();
+ }
+
+ /* trx_weight */
+ OK(fields[IDX_TRX_WEIGHT]->store((longlong) row->trx_weight,
+ true));
+
+ /* trx_mysql_thread_id */
+ OK(fields[IDX_TRX_MYSQL_THREAD_ID]->store(
+ row->trx_mysql_thread_id));
+
+ /* trx_query */
+ OK(field_store_string(fields[IDX_TRX_QUERY],
+ row->trx_query));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_trx
+@return 0 on success */
+static
+int
+innodb_trx_init(
+/*============*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_trx_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_trx_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_trx =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TRX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB transactions"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_trx_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_trx_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_TRX"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB transactions"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_trx_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks */
+static ST_FIELD_INFO innodb_locks_fields_info[] =
+{
+#define IDX_LOCK_ID 0
+ {STRUCT_FLD(field_name, "lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TRX_ID 1
+ {STRUCT_FLD(field_name, "lock_trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_MODE 2
+ {STRUCT_FLD(field_name, "lock_mode"),
+ /* S[,GAP] X[,GAP] IS[,GAP] IX[,GAP] AUTO_INC UNKNOWN */
+ STRUCT_FLD(field_length, 32),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TYPE 3
+ {STRUCT_FLD(field_name, "lock_type"),
+ STRUCT_FLD(field_length, 32 /* RECORD|TABLE|UNKNOWN */),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TABLE 4
+ {STRUCT_FLD(field_name, "lock_table"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_INDEX 5
+ {STRUCT_FLD(field_name, "lock_index"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_SPACE 6
+ {STRUCT_FLD(field_name, "lock_space"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_PAGE 7
+ {STRUCT_FLD(field_name, "lock_page"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_REC 8
+ {STRUCT_FLD(field_name, "lock_rec"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_DATA 9
+ {STRUCT_FLD(field_name, "lock_data"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_DATA_MAX_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_MAYBE_NULL),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the INFORMATION_SCHEMA.innodb_locks
+table with it.
+@return 0 on success */
+static
+int
+fill_innodb_locks_from_cache(
+/*=========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: MySQL client connection */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_locks_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_LOCKS);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_locks_row_t* row;
+
+ /* note that the decoded database or table name is
+ never expected to be longer than NAME_LEN;
+ NAME_LEN for database name
+ 2 for surrounding quotes around database name
+ NAME_LEN for table name
+ 2 for surrounding quotes around table name
+ 1 for the separating dot (.)
+ 9 for the #mysql50# prefix */
+ char buf[2 * NAME_LEN + 14];
+ const char* bufend;
+
+ char lock_trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_locks_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_LOCKS, i);
+
+ /* lock_id */
+ trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id));
+ OK(field_store_string(fields[IDX_LOCK_ID],
+ lock_id));
+
+ /* lock_trx_id */
+ ut_snprintf(lock_trx_id, sizeof(lock_trx_id),
+ TRX_ID_FMT, row->lock_trx_id);
+ OK(field_store_string(fields[IDX_LOCK_TRX_ID], lock_trx_id));
+
+ /* lock_mode */
+ OK(field_store_string(fields[IDX_LOCK_MODE],
+ row->lock_mode));
+
+ /* lock_type */
+ OK(field_store_string(fields[IDX_LOCK_TYPE],
+ row->lock_type));
+
+ /* lock_table */
+ bufend = innobase_convert_name(buf, sizeof(buf),
+ row->lock_table,
+ strlen(row->lock_table),
+ thd, TRUE);
+ OK(fields[IDX_LOCK_TABLE]->store(buf, bufend - buf,
+ system_charset_info));
+
+ /* lock_index */
+ if (row->lock_index != NULL) {
+
+ bufend = innobase_convert_name(buf, sizeof(buf),
+ row->lock_index,
+ strlen(row->lock_index),
+ thd, FALSE);
+ OK(fields[IDX_LOCK_INDEX]->store(buf, bufend - buf,
+ system_charset_info));
+ fields[IDX_LOCK_INDEX]->set_notnull();
+ } else {
+
+ fields[IDX_LOCK_INDEX]->set_null();
+ }
+
+ /* lock_space */
+ OK(field_store_ulint(fields[IDX_LOCK_SPACE],
+ row->lock_space));
+
+ /* lock_page */
+ OK(field_store_ulint(fields[IDX_LOCK_PAGE],
+ row->lock_page));
+
+ /* lock_rec */
+ OK(field_store_ulint(fields[IDX_LOCK_REC],
+ row->lock_rec));
+
+ /* lock_data */
+ OK(field_store_string(fields[IDX_LOCK_DATA],
+ row->lock_data));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_locks
+@return 0 on success */
+static
+int
+innodb_locks_init(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_locks_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_locks_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_locks =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCKS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB conflicting locks"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_locks_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_locks_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCKS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB conflicting locks"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_locks_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits */
+static ST_FIELD_INFO innodb_lock_waits_fields_info[] =
+{
+#define IDX_REQUESTING_TRX_ID 0
+ {STRUCT_FLD(field_name, "requesting_trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_REQUESTED_LOCK_ID 1
+ {STRUCT_FLD(field_name, "requested_lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BLOCKING_TRX_ID 2
+ {STRUCT_FLD(field_name, "blocking_trx_id"),
+ STRUCT_FLD(field_length, TRX_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+#define IDX_BLOCKING_LOCK_ID 3
+ {STRUCT_FLD(field_name, "blocking_lock_id"),
+ STRUCT_FLD(field_length, TRX_I_S_LOCK_ID_MAX_LEN + 1),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Read data from cache buffer and fill the
+INFORMATION_SCHEMA.innodb_lock_waits table with it.
+@return 0 on success */
+static
+int
+fill_innodb_lock_waits_from_cache(
+/*==============================*/
+ trx_i_s_cache_t* cache, /*!< in: cache to read from */
+ THD* thd, /*!< in: used to call
+ schema_table_store_record() */
+ TABLE* table) /*!< in/out: fill this table */
+{
+ Field** fields;
+ ulint rows_num;
+ char requested_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ char blocking_lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+ ulint i;
+
+ DBUG_ENTER("fill_innodb_lock_waits_from_cache");
+
+ fields = table->field;
+
+ rows_num = trx_i_s_cache_get_rows_used(cache,
+ I_S_INNODB_LOCK_WAITS);
+
+ for (i = 0; i < rows_num; i++) {
+
+ i_s_lock_waits_row_t* row;
+
+ char requesting_trx_id[TRX_ID_MAX_LEN + 1];
+ char blocking_trx_id[TRX_ID_MAX_LEN + 1];
+
+ row = (i_s_lock_waits_row_t*)
+ trx_i_s_cache_get_nth_row(
+ cache, I_S_INNODB_LOCK_WAITS, i);
+
+ /* requesting_trx_id */
+ ut_snprintf(requesting_trx_id, sizeof(requesting_trx_id),
+ TRX_ID_FMT, row->requested_lock_row->lock_trx_id);
+ OK(field_store_string(fields[IDX_REQUESTING_TRX_ID],
+ requesting_trx_id));
+
+ /* requested_lock_id */
+ OK(field_store_string(
+ fields[IDX_REQUESTED_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->requested_lock_row,
+ requested_lock_id,
+ sizeof(requested_lock_id))));
+
+ /* blocking_trx_id */
+ ut_snprintf(blocking_trx_id, sizeof(blocking_trx_id),
+ TRX_ID_FMT, row->blocking_lock_row->lock_trx_id);
+ OK(field_store_string(fields[IDX_BLOCKING_TRX_ID],
+ blocking_trx_id));
+
+ /* blocking_lock_id */
+ OK(field_store_string(
+ fields[IDX_BLOCKING_LOCK_ID],
+ trx_i_s_create_lock_id(
+ row->blocking_lock_row,
+ blocking_lock_id,
+ sizeof(blocking_lock_id))));
+
+ OK(schema_table_store_record(thd, table));
+ }
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+innodb_lock_waits_init(
+/*===================*/
+ void* p) /*!< in/out: table schema object */
+{
+ ST_SCHEMA_TABLE* schema;
+
+ DBUG_ENTER("innodb_lock_waits_init");
+
+ schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = innodb_lock_waits_fields_info;
+ schema->fill_table = trx_i_s_common_fill_table;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_lock_waits =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCK_WAITS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Innobase Oy"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB which lock is blocking which"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_lock_waits_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_lock_waits_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_LOCK_WAITS"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, "Innobase Oy"),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB which lock is blocking which"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, innodb_lock_waits_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return 0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond) /*!< in: condition (not used) */
+{
+ const char* table_name;
+ int ret;
+ trx_i_s_cache_t* cache;
+
+ DBUG_ENTER("trx_i_s_common_fill_table");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ /* minimize the number of places where global variables are
+ referenced */
+ cache = trx_i_s_cache;
+
+ /* which table we have to fill? */
+ table_name = tables->schema_table_name;
+ /* or table_name = tables->schema_table->table_name; */
+
+ RETURN_IF_INNODB_NOT_STARTED(table_name);
+
+ /* update the cache */
+ trx_i_s_cache_start_write(cache);
+ trx_i_s_possibly_fetch_data_into_cache(cache);
+ trx_i_s_cache_end_write(cache);
+
+ if (trx_i_s_cache_is_truncated(cache)) {
+
+ /* XXX show warning to user if possible */
+ fprintf(stderr, "Warning: data in %s truncated due to "
+ "memory limit of %d bytes\n", table_name,
+ TRX_I_S_MEM_LIMIT);
+ }
+
+ ret = 0;
+
+ trx_i_s_cache_start_read(cache);
+
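+	/* this fill function is shared by innodb_trx, innodb_locks
+	and innodb_lock_waits; dispatch on the schema table name */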
+ if (innobase_strcasecmp(table_name, "innodb_trx") == 0) {
+
+ if (fill_innodb_trx_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else if (innobase_strcasecmp(table_name, "innodb_locks") == 0) {
+
+ if (fill_innodb_locks_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else if (innobase_strcasecmp(table_name, "innodb_lock_waits") == 0) {
+
+ if (fill_innodb_lock_waits_from_cache(
+ cache, thd, tables->table) != 0) {
+
+ ret = 1;
+ }
+
+ } else {
+
+ /* huh! what happened!? */
+ fprintf(stderr,
+ "InnoDB: trx_i_s_common_fill_table() was "
+ "called to fill unknown table: %s.\n"
+ "This function only knows how to fill "
+ "innodb_trx, innodb_locks and "
+ "innodb_lock_waits tables.\n", table_name);
+
+ ret = 1;
+ }
+
+ trx_i_s_cache_end_read(cache);
+
+#if 0
+ DBUG_RETURN(ret);
+#else
+ /* if this function returns something else than 0 then a
+ deadlock occurs between the mysqld server and mysql client,
+ see http://bugs.mysql.com/29900 ; when that bug is resolved
+ we can enable the DBUG_RETURN(ret) above */
+ DBUG_RETURN(0);
+#endif
+}
+
+/* Fields of the dynamic table information_schema.innodb_cmp. */
+static ST_FIELD_INFO i_s_cmp_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "page_size"),
+ STRUCT_FLD(field_length, 5),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Compressed Page Size"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Compressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_ops_ok"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of"
+ " Successful Compressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "compress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Compressions,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "uncompress_ops"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Decompressions"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "uncompress_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Decompressions,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp or
+innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond, /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmp_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
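+	/* one row per supported compressed page size, i.e.
+	PAGE_ZIP_MIN_SIZE << i for i = 0 .. PAGE_ZIP_NUM_SSIZE - 2 */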
+ for (uint i = 0; i < PAGE_ZIP_NUM_SSIZE - 1; i++) {
+ page_zip_stat_t* zip_stat = &page_zip_stat[i];
+
+ table->field[0]->store(PAGE_ZIP_MIN_SIZE << i);
+
+ /* The cumulated counts are not protected by any
+ mutex. Thus, some operation in page0zip.c could
+ increment a counter between the time we read it and
+ clear it. We could introduce mutex protection, but it
+		could cause a measurable performance hit in
+ page0zip.c. */
+ table->field[1]->store(zip_stat->compressed);
+ table->field[2]->store(zip_stat->compressed_ok);
+ table->field[3]->store(
+ (ulong) (zip_stat->compressed_usec / 1000000));
+ table->field[4]->store(zip_stat->decompressed);
+ table->field[5]->store(
+ (ulong) (zip_stat->decompressed_usec / 1000000));
+
+ if (reset) {
+ memset(zip_stat, 0, sizeof *zip_stat);
+ }
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ }
+
+ DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill(
+/*=========*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmp_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmp_reset_fill(
+/*===============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmp_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp.
+@return 0 on success */
+static
+int
+i_s_cmp_init(
+/*=========*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_fields_info;
+ schema->fill_table = i_s_cmp_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmp_reset.
+@return 0 on success */
+static
+int
+i_s_cmp_reset_init(
+/*===============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmp_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmp_fields_info;
+ schema->fill_table = i_s_cmp_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmp_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmp_reset_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMP_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compression;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmp_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+/* Fields of the dynamic table information_schema.innodb_cmpmem. */
+static ST_FIELD_INFO i_s_cmpmem_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "page_size"),
+ STRUCT_FLD(field_length, 5),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Buddy Block Size"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "pages_used"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Currently in Use"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "pages_free"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Currently Available"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "relocation_ops"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Number of Relocations"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "relocation_time"),
+ STRUCT_FLD(field_length, MY_INT32_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, "Total Duration of Relocations,"
+ " in Seconds"),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem or
+innodb_cmpmem_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill_low(
+/*================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond, /*!< in: condition (ignored) */
+ ibool reset) /*!< in: TRUE=reset cumulated counts */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+
+ DBUG_ENTER("i_s_cmpmem_fill_low");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ //buf_pool_mutex_enter();
+ mutex_enter(&zip_free_mutex);
+
+ for (uint x = 0; x <= BUF_BUDDY_SIZES; x++) {
+ buf_buddy_stat_t* buddy_stat = &buf_buddy_stat[x];
+
+ table->field[0]->store(BUF_BUDDY_LOW << x);
+ table->field[1]->store(buddy_stat->used);
+ table->field[2]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES)
+ ? UT_LIST_GET_LEN(buf_pool->zip_free[x])
+ : 0);
+ table->field[3]->store((longlong) buddy_stat->relocated, true);
+ table->field[4]->store(
+ (ulong) (buddy_stat->relocated_usec / 1000000));
+
+ if (reset) {
+			/* This is protected by zip_free_mutex
+			(buf_pool_mutex in the upstream InnoDB code). */
+ buddy_stat->relocated = 0;
+ buddy_stat->relocated_usec = 0;
+ }
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&zip_free_mutex);
+ DBUG_RETURN(status);
+}
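+
+/* Usage sketch (illustrative): once the INNODB_CMPMEM plugin defined below is
+loaded, these buddy allocator statistics can be read with an ordinary query
+such as
+
+	SELECT page_size, pages_used, pages_free, relocation_ops, relocation_time
+	FROM information_schema.innodb_cmpmem;
+
+relocation_time is derived above from buddy_stat->relocated_usec / 1000000,
+i.e. the accumulated microseconds truncated to whole seconds. */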
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill(
+/*============*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmpmem_fill_low(thd, tables, cond, FALSE));
+}
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_cmpmem_reset.
+@return 0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_reset_fill(
+/*==================*/
+ THD* thd, /*!< in: thread */
+ TABLE_LIST* tables, /*!< in/out: tables to fill */
+ COND* cond) /*!< in: condition (ignored) */
+{
+ return(i_s_cmpmem_fill_low(thd, tables, cond, TRUE));
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmpmem.
+@return 0 on success */
+static
+int
+i_s_cmpmem_init(
+/*============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmpmem_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmpmem_fields_info;
+ schema->fill_table = i_s_cmpmem_fill;
+
+ DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_cmpmem_reset.
+@return 0 on success */
+static
+int
+i_s_cmpmem_reset_init(
+/*==================*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_cmpmem_reset_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_cmpmem_fields_info;
+ schema->fill_table = i_s_cmpmem_reset_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_cmpmem_reset =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_cmpmem_reset_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_CMPMEM_RESET"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;"
+ " reset cumulated counts"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_cmpmem_reset_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return 0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+ void* p) /*!< in/out: table schema object */
+{
+ DBUG_ENTER("i_s_common_deinit");
+
+ /* Do nothing */
+
+ DBUG_RETURN(0);
+}
+
+/***********************************************************************
+*/
+static ST_FIELD_INFO i_s_innodb_rseg_fields_info[] =
+{
+ {STRUCT_FLD(field_name, "rseg_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "space_id"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "zip_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "page_no"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "max_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "curr_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static
+int
+i_s_innodb_rseg_fill(
+/*=================*/
+ THD* thd, /* in: thread */
+ TABLE_LIST* tables, /* in/out: tables to fill */
+ COND* cond) /* in: condition (ignored) */
+{
+ TABLE* table = (TABLE *) tables->table;
+ int status = 0;
+ trx_rseg_t* rseg;
+
+ DBUG_ENTER("i_s_innodb_rseg_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+
+ DBUG_RETURN(0);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ while (rseg) {
+ table->field[0]->store(rseg->id);
+ table->field[1]->store(rseg->space);
+ table->field[2]->store(rseg->zip_size);
+ table->field[3]->store(rseg->page_no);
+ table->field[4]->store(rseg->max_size);
+ table->field[5]->store(rseg->curr_size);
+
+ if (schema_table_store_record(thd, table)) {
+ status = 1;
+ break;
+ }
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ }
+
+ DBUG_RETURN(status);
+}
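+
+/* Illustrative example: each rollback segment in trx_sys->rseg_list is
+reported as one row, so a query such as
+
+	SELECT rseg_id, space_id, page_no, max_size, curr_size
+	FROM information_schema.innodb_rseg;
+
+returns one row per segment (assuming the INNODB_RSEG plugin declared below
+is loaded). */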
+
+static
+int
+i_s_innodb_rseg_init(
+/*=================*/
+ /* out: 0 on success */
+ void* p) /* in/out: table schema object */
+{
+ DBUG_ENTER("i_s_innodb_rseg_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_rseg_fields_info;
+ schema->fill_table = i_s_innodb_rseg_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_rseg =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_RSEG"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB rollback segment information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_rseg_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* reserved for dependency checking */
+ /* void* */
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_rseg_maria =
+{
+ /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+ /* int */
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+ /* pointer to type-specific plugin descriptor */
+ /* void* */
+ STRUCT_FLD(info, &i_s_info),
+
+ /* plugin name */
+ /* const char* */
+ STRUCT_FLD(name, "INNODB_RSEG"),
+
+ /* plugin author (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(author, plugin_author),
+
+ /* general descriptive text (for SHOW PLUGINS) */
+ /* const char* */
+ STRUCT_FLD(descr, "InnoDB rollback segment information"),
+
+ /* the plugin license (PLUGIN_LICENSE_XXX) */
+ /* int */
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+ /* the function to invoke when plugin is loaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(init, i_s_innodb_rseg_init),
+
+ /* the function to invoke when plugin is unloaded */
+ /* int (*)(void*); */
+ STRUCT_FLD(deinit, i_s_common_deinit),
+
+ /* plugin version (for SHOW PLUGINS) */
+ /* unsigned int */
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+ /* struct st_mysql_show_var* */
+ STRUCT_FLD(status_vars, NULL),
+
+ /* struct st_mysql_sys_var** */
+ STRUCT_FLD(system_vars, NULL),
+
+ /* string version */
+ /* const char * */
+ STRUCT_FLD(version_info, "1.0"),
+
+ /* Maturity */
+ /* int */
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/***********************************************************************
+*/
+static ST_FIELD_INFO i_s_innodb_table_stats_info[] =
+{
+ {STRUCT_FLD(field_name, "table_schema"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "table_name"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "rows"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "clust_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "other_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "modified"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static ST_FIELD_INFO i_s_innodb_index_stats_info[] =
+{
+ {STRUCT_FLD(field_name, "table_schema"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "table_name"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "index_name"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "fields"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "row_per_keys"),
+ STRUCT_FLD(field_length, 256),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "index_size"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "leaf_pages"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static
+int
+i_s_innodb_table_stats_fill(
+/*========================*/
+ THD* thd,
+ TABLE_LIST* tables,
+ COND* cond)
+{
+ TABLE* i_s_table = (TABLE *) tables->table;
+ int status = 0;
+ dict_table_t* table;
+
+ DBUG_ENTER("i_s_innodb_table_stats_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+
+ while (table) {
+ char buf[NAME_LEN * 2 + 2];
+ char* ptr;
+
+ if (table->stat_clustered_index_size == 0) {
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ continue;
+ }
+
+ buf[NAME_LEN * 2 + 1] = 0;
+ strncpy(buf, table->name, NAME_LEN * 2 + 1);
+ ptr = strchr(buf, '/');
+ if (ptr) {
+ *ptr = '\0';
+ ++ptr;
+ } else {
+ ptr = buf;
+ }
+
+ field_store_string(i_s_table->field[0], buf);
+ field_store_string(i_s_table->field[1], ptr);
+ i_s_table->field[2]->store(table->stat_n_rows, 1);
+ i_s_table->field[3]->store(table->stat_clustered_index_size);
+ i_s_table->field[4]->store(table->stat_sum_of_other_index_sizes);
+ i_s_table->field[5]->store(table->stat_modified_counter);
+
+ if (schema_table_store_record(thd, i_s_table)) {
+ status = 1;
+ break;
+ }
+
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ DBUG_RETURN(status);
+}
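+
+/* Note (illustrative): tables whose statistics have not been computed yet
+(stat_clustered_index_size == 0) are skipped above, so a table that has not
+been opened since server start may be absent from INNODB_TABLE_STATS until
+InnoDB calculates its statistics. */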
+
+static
+int
+i_s_innodb_index_stats_fill(
+/*========================*/
+ THD* thd,
+ TABLE_LIST* tables,
+ COND* cond)
+{
+ TABLE* i_s_table = (TABLE *) tables->table;
+ int status = 0;
+ dict_table_t* table;
+ dict_index_t* index;
+
+ DBUG_ENTER("i_s_innodb_index_stats_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+
+ table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+
+ while (table) {
+ if (table->stat_clustered_index_size == 0) {
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ continue;
+ }
+
+ ib_int64_t n_rows = table->stat_n_rows;
+
+ if (n_rows < 0) {
+ n_rows = 0;
+ }
+
+ index = dict_table_get_first_index(table);
+
+ while (index) {
+ char buff[256+1];
+ char row_per_keys[256+1];
+ char buf[NAME_LEN * 2 + 2];
+ char* ptr;
+ ulint i;
+
+ buf[NAME_LEN * 2 + 1] = 0;
+ strncpy(buf, table->name, NAME_LEN * 2 + 1);
+ ptr = strchr(buf, '/');
+ if (ptr) {
+ *ptr = '\0';
+ ++ptr;
+ } else {
+ ptr = buf;
+ }
+
+ field_store_string(i_s_table->field[0], buf);
+ field_store_string(i_s_table->field[1], ptr);
+ field_store_string(i_s_table->field[2], index->name);
+ i_s_table->field[3]->store(index->n_uniq);
+
+ row_per_keys[0] = '\0';
+
+			/* Reading these statistics is still an unprotected (optimistic) operation for now */
+ //dict_index_stat_mutex_enter(index);
+ if (index->stat_n_diff_key_vals) {
+ for (i = 1; i <= index->n_uniq; i++) {
+ ib_int64_t rec_per_key;
+ if (index->stat_n_diff_key_vals[i]) {
+ rec_per_key = n_rows / index->stat_n_diff_key_vals[i];
+ } else {
+ rec_per_key = n_rows;
+ }
+ ut_snprintf(buff, 256, (i == index->n_uniq)?"%llu":"%llu, ",
+ rec_per_key);
+ strncat(row_per_keys, buff, 256 - strlen(row_per_keys));
+ }
+ }
+ //dict_index_stat_mutex_exit(index);
+
+ field_store_string(i_s_table->field[4], row_per_keys);
+
+ i_s_table->field[5]->store(index->stat_index_size);
+ i_s_table->field[6]->store(index->stat_n_leaf_pages);
+
+ if (schema_table_store_record(thd, i_s_table)) {
+ status = 1;
+ break;
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ if (status == 1) {
+ break;
+ }
+
+ table = UT_LIST_GET_NEXT(table_LRU, table);
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ DBUG_RETURN(status);
+}
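+
+/* Worked example (illustrative): for an index with two unique columns,
+n_rows = 1000, stat_n_diff_key_vals[1] = 10 and stat_n_diff_key_vals[2] = 1000,
+the row_per_keys column is built above as "100, 1"; whenever a statistic is
+zero, n_rows itself is used as the estimate. */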
+
+static
+int
+i_s_innodb_table_stats_init(
+/*========================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_table_stats_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_table_stats_info;
+ schema->fill_table = i_s_innodb_table_stats_fill;
+
+ DBUG_RETURN(0);
+}
+
+static
+int
+i_s_innodb_index_stats_init(
+/*========================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_index_stats_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_index_stats_info;
+ schema->fill_table = i_s_innodb_index_stats_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_table_stats =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_TABLE_STATS"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB table statistics in memory"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_table_stats_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_table_stats_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_TABLE_STATS"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB table statistics in memory"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_table_stats_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_index_stats =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_INDEX_STATS"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB index statistics in memory"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_index_stats_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_index_stats_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_INDEX_STATS"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB index statistics in memory"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_index_stats_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/***********************************************************************
+*/
+static ST_FIELD_INFO i_s_innodb_admin_command_info[] =
+{
+ {STRUCT_FLD(field_name, "result_message"),
+ STRUCT_FLD(field_length, 1024),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+char **thd_query(MYSQL_THD thd);
+}
+
+static
+int
+i_s_innodb_admin_command_fill(
+/*==========================*/
+ THD* thd,
+ TABLE_LIST* tables,
+ COND* cond)
+{
+ TABLE* i_s_table = (TABLE *) tables->table;
+ char** query_str;
+ char* ptr;
+ char quote = '\0';
+ const char* command_head = "XTRA_";
+
+ DBUG_ENTER("i_s_innodb_admin_command_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+	if (thd_sql_command(thd) != SQLCOM_SELECT) {
+		field_store_string(i_s_table->field[0],
+			"Only the SELECT command is accepted.");
+ goto end_func;
+ }
+
+ query_str = thd_query(thd);
+ ptr = *query_str;
+
+ for (; *ptr; ptr++) {
+ if (*ptr == quote) {
+ quote = '\0';
+ } else if (quote) {
+ } else if (*ptr == '`' || *ptr == '"') {
+ quote = *ptr;
+ } else {
+ long i;
+ for (i = 0; command_head[i]; i++) {
+ if (toupper((int)(unsigned char)(ptr[i]))
+ != toupper((int)(unsigned char)
+ (command_head[i]))) {
+ goto nomatch;
+ }
+ }
+ break;
+nomatch:
+ ;
+ }
+ }
+
+ if (!*ptr) {
+ field_store_string(i_s_table->field[0],
+ "No XTRA_* command in the SQL statement."
+ " Please add /*!XTRA_xxxx*/ to the SQL.");
+ goto end_func;
+ }
+
+ if (!strncasecmp("XTRA_HELLO", ptr, 10)) {
+		/* This is the example command XTRA_HELLO */
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: administration command test for XtraDB"
+ " 'XTRA_HELLO' was detected.\n");
+
+ field_store_string(i_s_table->field[0],
+ "Hello!");
+ goto end_func;
+ }
+ else if (!strncasecmp("XTRA_LRU_DUMP", ptr, 13)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_DUMP'"
+ " was detected.\n");
+
+ if (buf_LRU_file_dump()) {
+ field_store_string(i_s_table->field[0],
+				"XTRA_LRU_DUMP succeeded.");
+ } else {
+ field_store_string(i_s_table->field[0],
+				"XTRA_LRU_DUMP failed.");
+ }
+
+ goto end_func;
+ }
+ else if (!strncasecmp("XTRA_LRU_RESTORE", ptr, 16)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_RESTORE'"
+ " was detected.\n");
+
+ if (buf_LRU_file_restore()) {
+ field_store_string(i_s_table->field[0],
+				"XTRA_LRU_RESTORE succeeded.");
+ } else {
+ field_store_string(i_s_table->field[0],
+				"XTRA_LRU_RESTORE failed.");
+ }
+
+ goto end_func;
+ }
+
+ field_store_string(i_s_table->field[0],
+ "Undefined XTRA_* command.");
+ goto end_func;
+
+end_func:
+ if (schema_table_store_record(thd, i_s_table)) {
+ DBUG_RETURN(1);
+ } else {
+ DBUG_RETURN(0);
+ }
+}
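+
+// Usage sketch (illustrative): the XTRA_* command is embedded in the text of
+// the SELECT statement itself, as the message above suggests, e.g.
+//
+//	SELECT * FROM information_schema.XTRADB_ADMIN_COMMAND /*!XTRA_LRU_DUMP*/;
+//
+// The loop above scans the raw query string for the first "XTRA_" token that
+// is not inside a quoted identifier, so the comment may appear anywhere in
+// the statement.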
+
+static
+int
+i_s_innodb_admin_command_init(
+/*==========================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_admin_command_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_admin_command_info;
+ schema->fill_table = i_s_innodb_admin_command_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_admin_command =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "XtraDB specific command acceptor"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_admin_command_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_admin_command_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "XtraDB specific command acceptor"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_admin_command_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+static ST_FIELD_INFO i_s_innodb_sys_tables_info[] =
+{
+ {STRUCT_FLD(field_name, "SCHEMA"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "N_COLS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "TYPE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "MIX_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "MIX_LEN"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "CLUSTER_NAME"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static ST_FIELD_INFO i_s_innodb_sys_indexes_info[] =
+{
+ {STRUCT_FLD(field_name, "TABLE_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "NAME"),
+ STRUCT_FLD(field_length, NAME_LEN),
+ STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, 0),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "N_FIELDS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "TYPE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "SPACE"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "PAGE_NO"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static ST_FIELD_INFO i_s_innodb_sys_stats_info[] =
+{
+ {STRUCT_FLD(field_name, "INDEX_ID"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "KEY_COLS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ {STRUCT_FLD(field_name, "DIFF_VALS"),
+ STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+ STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+ STRUCT_FLD(value, 0),
+ STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+ STRUCT_FLD(old_name, ""),
+ STRUCT_FLD(open_method, SKIP_OPEN_TABLE)},
+
+ END_OF_ST_FIELD_INFO
+};
+
+static
+int
+copy_string_field(
+/*==============*/
+ TABLE* table,
+ int table_field,
+ const rec_t* rec,
+ int rec_field)
+{
+ int status;
+ const byte* data;
+ ulint len;
+
+ /*fprintf(stderr, "copy_string_field %d %d\n", table_field, rec_field);*/
+
+ data = rec_get_nth_field_old(rec, rec_field, &len);
+ if (len == UNIV_SQL_NULL) {
+ table->field[table_field]->set_null();
+ status = 0; /* success */
+ } else {
+ table->field[table_field]->set_notnull();
+ status = table->field[table_field]->store(
+ (char *) data, len, system_charset_info);
+ }
+
+ return status;
+}
+
+static
+int
+copy_name_fields(
+/*=============*/
+ TABLE* table,
+ int table_field_1,
+ const rec_t* rec,
+ int rec_field)
+{
+ int status;
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field_old(rec, rec_field, &len);
+ if (len == UNIV_SQL_NULL) {
+ table->field[table_field_1]->set_null();
+ table->field[table_field_1 + 1]->set_null();
+ status = 0; /* success */
+ } else {
+ char buf[NAME_LEN * 2 + 2];
+ char* ptr;
+
+ if (len > NAME_LEN * 2 + 1) {
+ table->field[table_field_1]->set_null();
+ status = field_store_string(table->field[table_field_1 + 1],
+ "###TOO LONG NAME###");
+ goto end_func;
+ }
+
+ strncpy(buf, (char*)data, len);
+ buf[len] = '\0';
+ ptr = strchr(buf, '/');
+ if (ptr) {
+ *ptr = '\0';
+ ++ptr;
+
+ status = field_store_string(table->field[table_field_1], buf);
+ status |= field_store_string(table->field[table_field_1 + 1], ptr);
+ } else {
+ table->field[table_field_1]->set_null();
+ status = field_store_string(table->field[table_field_1 + 1], buf);
+ }
+ }
+
+end_func:
+ return status;
+}
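+
+/* Example (illustrative): an internal InnoDB name such as "test/t1" is split
+at the first '/' into a schema part ("test") and a table part ("t1"); a name
+with no '/' is stored in the second field only and the first is set to NULL. */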
+
+static
+int
+copy_int_field(
+/*===========*/
+ TABLE* table,
+ int table_field,
+ const rec_t* rec,
+ int rec_field)
+{
+ int status;
+ const byte* data;
+ ulint len;
+
+ /*fprintf(stderr, "copy_int_field %d %d\n", table_field, rec_field);*/
+
+ data = rec_get_nth_field_old(rec, rec_field, &len);
+ if (len == UNIV_SQL_NULL) {
+ table->field[table_field]->set_null();
+ status = 0; /* success */
+ } else {
+ table->field[table_field]->set_notnull();
+ status = table->field[table_field]->store(
+ mach_read_from_4(data), true);
+ }
+
+ return status;
+}
+
+static
+int
+copy_id_field(
+/*==========*/
+ TABLE* table,
+ int table_field,
+ const rec_t* rec,
+ int rec_field)
+{
+ int status;
+ const byte* data;
+ ulint len;
+
+ /*fprintf(stderr, "copy_id_field %d %d\n", table_field, rec_field);*/
+
+ data = rec_get_nth_field_old(rec, rec_field, &len);
+ if (len == UNIV_SQL_NULL) {
+ table->field[table_field]->set_null();
+ status = 0; /* success */
+ } else {
+ table->field[table_field]->set_notnull();
+ status = table->field[table_field]->store(
+ ut_conv_dulint_to_longlong(mach_read_from_8(data)), true);
+ }
+
+ return status;
+}
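+
+/* Note (illustrative): the SYS_* columns read here store integers in InnoDB's
+big-endian on-disk format, so mach_read_from_4()/mach_read_from_8() decode
+them before they are stored into the I_S fields; e.g. the bytes 00 00 00 2A
+are decoded as the value 42. */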
+
+static
+int
+copy_sys_tables_rec(
+/*================*/
+ TABLE* table,
+ const dict_index_t* index,
+ const rec_t* rec
+)
+{
+ int status;
+ int field;
+
+ /* NAME */
+ field = dict_index_get_nth_col_pos(index, 0);
+ status = copy_name_fields(table, 0, rec, field);
+ if (status) {
+ return status;
+ }
+ /* ID */
+ field = dict_index_get_nth_col_pos(index, 1);
+ status = copy_id_field(table, 2, rec, field);
+ if (status) {
+ return status;
+ }
+ /* N_COLS */
+ field = dict_index_get_nth_col_pos(index, 2);
+ status = copy_int_field(table, 3, rec, field);
+ if (status) {
+ return status;
+ }
+ /* TYPE */
+ field = dict_index_get_nth_col_pos(index, 3);
+ status = copy_int_field(table, 4, rec, field);
+ if (status) {
+ return status;
+ }
+ /* MIX_ID */
+ field = dict_index_get_nth_col_pos(index, 4);
+ status = copy_id_field(table, 5, rec, field);
+ if (status) {
+ return status;
+ }
+ /* MIX_LEN */
+ field = dict_index_get_nth_col_pos(index, 5);
+ status = copy_int_field(table, 6, rec, field);
+ if (status) {
+ return status;
+ }
+ /* CLUSTER_NAME */
+ field = dict_index_get_nth_col_pos(index, 6);
+ status = copy_string_field(table, 7, rec, field);
+ if (status) {
+ return status;
+ }
+ /* SPACE */
+ field = dict_index_get_nth_col_pos(index, 7);
+ status = copy_int_field(table, 8, rec, field);
+ if (status) {
+ return status;
+ }
+
+ return 0;
+}
+
+static
+int
+copy_sys_indexes_rec(
+/*=================*/
+ TABLE* table,
+ const dict_index_t* index,
+ const rec_t* rec
+)
+{
+ int status;
+ int field;
+
+ /* TABLE_ID */
+ field = dict_index_get_nth_col_pos(index, 0);
+ status = copy_id_field(table, 0, rec, field);
+ if (status) {
+ return status;
+ }
+ /* ID */
+ field = dict_index_get_nth_col_pos(index, 1);
+ status = copy_id_field(table, 1, rec, field);
+ if (status) {
+ return status;
+ }
+ /* NAME */
+ field = dict_index_get_nth_col_pos(index, 2);
+ status = copy_string_field(table, 2, rec, field);
+ if (status) {
+ return status;
+ }
+ /* N_FIELDS */
+ field = dict_index_get_nth_col_pos(index, 3);
+ status = copy_int_field(table, 3, rec, field);
+ if (status) {
+ return status;
+ }
+ /* TYPE */
+ field = dict_index_get_nth_col_pos(index, 4);
+ status = copy_int_field(table, 4, rec, field);
+ if (status) {
+ return status;
+ }
+ /* SPACE */
+ field = dict_index_get_nth_col_pos(index, 5);
+ status = copy_int_field(table, 5, rec, field);
+ if (status) {
+ return status;
+ }
+ /* PAGE_NO */
+ field = dict_index_get_nth_col_pos(index, 6);
+ status = copy_int_field(table, 6, rec, field);
+ if (status) {
+ return status;
+ }
+
+ return 0;
+}
+
+static
+int
+copy_sys_stats_rec(
+/*===============*/
+ TABLE* table,
+ const dict_index_t* index,
+ const rec_t* rec
+)
+{
+ int status;
+ int field;
+
+ /* INDEX_ID */
+ field = dict_index_get_nth_col_pos(index, 0);
+ status = copy_id_field(table, 0, rec, field);
+ if (status) {
+ return status;
+ }
+ /* KEY_COLS */
+ field = dict_index_get_nth_col_pos(index, 1);
+ status = copy_int_field(table, 1, rec, field);
+ if (status) {
+ return status;
+ }
+ /* DIFF_VALS */
+ field = dict_index_get_nth_col_pos(index, 2);
+ status = copy_id_field(table, 2, rec, field);
+ if (status) {
+ return status;
+ }
+
+ return 0;
+}
+
+static
+int
+i_s_innodb_schema_table_fill(
+/*=========================*/
+ THD* thd,
+ TABLE_LIST* tables,
+ COND* cond)
+{
+ int status = 0;
+ TABLE* table = (TABLE *) tables->table;
+ const char* table_name = tables->schema_table_name;
+ dict_table_t* innodb_table;
+ dict_index_t* index;
+ btr_pcur_t pcur;
+ const rec_t* rec;
+ mtr_t mtr;
+ int id;
+
+ DBUG_ENTER("i_s_innodb_schema_table_fill");
+
+ /* deny access to non-superusers */
+ if (check_global_access(thd, PROCESS_ACL)) {
+ DBUG_RETURN(0);
+ }
+
+ if (innobase_strcasecmp(table_name, "innodb_sys_tables") == 0) {
+ id = 0;
+ } else if (innobase_strcasecmp(table_name, "innodb_sys_indexes") == 0) {
+ id = 1;
+ } else if (innobase_strcasecmp(table_name, "innodb_sys_stats") == 0) {
+ id = 2;
+ } else {
+ DBUG_RETURN(1);
+ }
+
+ RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ mtr_start(&mtr);
+
+ if (id == 0) {
+ innodb_table = dict_table_get_low("SYS_TABLES");
+ } else if (id == 1) {
+ innodb_table = dict_table_get_low("SYS_INDEXES");
+ } else {
+ innodb_table = dict_table_get_low("SYS_STATS");
+ }
+ index = UT_LIST_GET_FIRST(innodb_table->indexes);
+
+ btr_pcur_open_at_index_side(TRUE, index, BTR_SEARCH_LEAF, &pcur,
+ TRUE, &mtr);
+ for (;;) {
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* end of index */
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ break;
+ }
+ if (rec_get_deleted_flag(rec, 0)) {
+ /* record marked as deleted */
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ continue;
+ }
+
+ if (id == 0) {
+ status = copy_sys_tables_rec(table, index, rec);
+ } else if (id == 1) {
+ status = copy_sys_indexes_rec(table, index, rec);
+ } else {
+ status = copy_sys_stats_rec(table, index, rec);
+ }
+ if (status) {
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ break;
+ }
+
+#if 0
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr_commit(&mtr);
+
+ status = schema_table_store_record(thd, table);
+ if (status) {
+ btr_pcur_close(&pcur);
+ break;
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+#else
+ status = schema_table_store_record(thd, table);
+ if (status) {
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ break;
+ }
+#endif
+ }
+
+ mutex_exit(&(dict_sys->mutex));
+
+ DBUG_RETURN(status);
+}
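+
+/* Usage sketch (illustrative): the same fill routine serves the three tables
+registered below, dispatching on the schema table name, so for example
+
+	SELECT * FROM information_schema.INNODB_SYS_TABLES;
+	SELECT * FROM information_schema.INNODB_SYS_INDEXES;
+
+each returns one row per record of the corresponding InnoDB system table,
+copied under dict_sys->mutex. */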
+
+static
+int
+i_s_innodb_sys_tables_init(
+/*=======================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_sys_tables_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_sys_tables_info;
+ schema->fill_table = i_s_innodb_schema_table_fill;
+
+ DBUG_RETURN(0);
+}
+
+static
+int
+i_s_innodb_sys_indexes_init(
+/*========================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_sys_indexes_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_sys_indexes_info;
+ schema->fill_table = i_s_innodb_schema_table_fill;
+
+ DBUG_RETURN(0);
+}
+
+static
+int
+i_s_innodb_sys_stats_init(
+/*======================*/
+ void* p)
+{
+ DBUG_ENTER("i_s_innodb_sys_stats_init");
+ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*) p;
+
+ schema->fields_info = i_s_innodb_sys_stats_info;
+ schema->fill_table = i_s_innodb_schema_table_fill;
+
+ DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_tables =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB SYS_TABLES table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_sys_tables_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_tables_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB SYS_TABLES table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_sys_tables_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_indexes =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_sys_indexes_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_indexes_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_sys_indexes_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
+UNIV_INTERN struct st_mysql_plugin i_s_innodb_sys_stats =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_SYS_STATS"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB SYS_STATS table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_sys_stats_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(__reserved1, NULL)
+};
+
+UNIV_INTERN struct st_maria_plugin i_s_innodb_sys_stats_maria =
+{
+ STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+ STRUCT_FLD(info, &i_s_info),
+ STRUCT_FLD(name, "INNODB_SYS_STATS"),
+ STRUCT_FLD(author, plugin_author),
+ STRUCT_FLD(descr, "InnoDB SYS_STATS table"),
+ STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+ STRUCT_FLD(init, i_s_innodb_sys_stats_init),
+ STRUCT_FLD(deinit, i_s_common_deinit),
+ STRUCT_FLD(version, 0x0100 /* 1.0 */),
+ STRUCT_FLD(status_vars, NULL),
+ STRUCT_FLD(system_vars, NULL),
+ STRUCT_FLD(version_info, "1.0"),
+ STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h
new file mode 100644
index 00000000000..7a5c3ead5ed
--- /dev/null
+++ b/storage/xtradb/handler/i_s.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.h
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef i_s_h
+#define i_s_h
+
+extern struct st_mysql_plugin i_s_innodb_buffer_pool_pages;
+extern struct st_mysql_plugin i_s_innodb_buffer_pool_pages_index;
+extern struct st_mysql_plugin i_s_innodb_buffer_pool_pages_blob;
+extern struct st_mysql_plugin i_s_innodb_trx;
+extern struct st_mysql_plugin i_s_innodb_locks;
+extern struct st_mysql_plugin i_s_innodb_lock_waits;
+extern struct st_mysql_plugin i_s_innodb_cmp;
+extern struct st_mysql_plugin i_s_innodb_cmp_reset;
+extern struct st_mysql_plugin i_s_innodb_cmpmem;
+extern struct st_mysql_plugin i_s_innodb_cmpmem_reset;
+extern struct st_mysql_plugin i_s_innodb_patches;
+extern struct st_mysql_plugin i_s_innodb_rseg;
+extern struct st_mysql_plugin i_s_innodb_table_stats;
+extern struct st_mysql_plugin i_s_innodb_index_stats;
+extern struct st_mysql_plugin i_s_innodb_admin_command;
+extern struct st_mysql_plugin i_s_innodb_sys_tables;
+extern struct st_mysql_plugin i_s_innodb_sys_indexes;
+extern struct st_mysql_plugin i_s_innodb_sys_stats;
+
+extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_maria;
+extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_index_maria;
+extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob_maria;
+extern struct st_maria_plugin i_s_innodb_trx_maria;
+extern struct st_maria_plugin i_s_innodb_locks_maria;
+extern struct st_maria_plugin i_s_innodb_lock_waits_maria;
+extern struct st_maria_plugin i_s_innodb_cmp_maria;
+extern struct st_maria_plugin i_s_innodb_cmp_reset_maria;
+extern struct st_maria_plugin i_s_innodb_cmpmem_maria;
+extern struct st_maria_plugin i_s_innodb_cmpmem_reset_maria;
+extern struct st_maria_plugin i_s_innodb_patches_maria;
+extern struct st_maria_plugin i_s_innodb_rseg_maria;
+extern struct st_maria_plugin i_s_innodb_table_stats_maria;
+extern struct st_maria_plugin i_s_innodb_index_stats_maria;
+extern struct st_maria_plugin i_s_innodb_admin_command_maria;
+extern struct st_maria_plugin i_s_innodb_sys_tables_maria;
+extern struct st_maria_plugin i_s_innodb_sys_indexes_maria;
+extern struct st_maria_plugin i_s_innodb_sys_stats_maria;
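+
+/* These declarations are expected to be referenced from the storage engine's
+plugin registration list (typically the mysql_declare_plugin /
+maria_declare_plugin block in handler/ha_innodb.cc); that wiring is assumed
+here rather than shown in this header. */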
+
+#endif /* i_s_h */
diff --git a/storage/xtradb/handler/innodb_patch_info.h b/storage/xtradb/handler/innodb_patch_info.h
new file mode 100644
index 00000000000..e68f12d0fec
--- /dev/null
+++ b/storage/xtradb/handler/innodb_patch_info.h
@@ -0,0 +1,52 @@
+/* Copyright (C) 2002-2006 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface /* gcc class implementation */
+#endif
+
+struct innodb_enhancement {
+ const char *file;
+ const char *name;
+ const char *comment;
+ const char *link;
+}innodb_enhancements[] = {
+{"xtradb_show_enhancements","I_S.XTRADB_ENHANCEMENTS","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_show_status","Improvements to SHOW INNODB STATUS","Memory information and lock info fixes","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_io","Improvements to InnoDB IO","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_opt_lru_count","Fix of buffer_pool mutex","Decreases contention on buffer_pool mutex on LRU operations","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_buffer_pool_pages","Information of buffer pool content","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_expand_undo_slots","expandable maximum number of undo slots","from 1024 (default) to about 4000","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_extra_rseg","allow to create extra rollback segments","When create new db, the new parameter allows to create more rollback segments","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_overwrite_relay_log_info","overwrite relay-log.info when slave recovery","Building as plugin, it is not used.","http://www.percona.com/docs/wiki/percona-xtradb:innodb_overwrite_relay_log_info"},
+{"innodb_thread_concurrency_timer_based","use InnoDB timer based concurrency throttling (backport from MySQL 5.4.0)","",""},
+{"innodb_expand_import","convert .ibd file automatically when import tablespace","the files are generated by xtrabackup export mode.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_dict_size_limit","Limit dictionary cache size","Variable innodb_dict_size_limit in bytes","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_split_buf_pool_mutex","More fix of buffer_pool mutex","Spliting buf_pool_mutex and optimizing based on innodb_opt_lru_count","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_stats","Additional features about InnoDB statistics/optimizer","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_recovery_patches","Bugfixes and adjustments about recovery process","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_purge_thread","Enable to use purge devoted thread","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_admin_command_base","XtraDB specific command interface through i_s","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_show_lock_name","Show mutex/lock name instead of crated file/line","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_extend_slow","Extended statistics in slow.log","It is InnoDB-part only. It needs to patch also to mysqld.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_lru_dump_restore","Dump and restore command for content of buffer pool","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_separate_doublewrite","Add option 'innodb_doublewrite_file' to separate doublewrite dedicated tablespace","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_pass_corrupt_table","Treat tables as corrupt instead of crash, when meet corrupt blocks","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_fast_checksum","Using the checksum on 32bit-unit calculation","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_files_extend","allow >4GB transaction log files, and can vary universal page size of datafiles","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_sys_tables_sys_indexes","Expose InnoDB SYS_TABLES and SYS_INDEXES schema tables","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_buffer_pool_shm","Put buffer pool contents to shared memory segment and reuse it at clean restart [experimental]","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{NULL, NULL, NULL, NULL}
+};
diff --git a/storage/xtradb/handler/mysql_addons.cc b/storage/xtradb/handler/mysql_addons.cc
new file mode 100644
index 00000000000..eae1fe9fbc2
--- /dev/null
+++ b/storage/xtradb/handler/mysql_addons.cc
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/mysql_addons.cc
+This file contains functions that need to be added to
+MySQL code but have not been added yet.
+
+Whenever you add a function here, submit a MySQL bug
+report (feature request) with the implementation. Then
+write the bug number in the comment before the
+function in this file.
+
+When MySQL commits the function it can be deleted from
+here. In a perfect world this file exists but is empty.
+
+Created November 07, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef MYSQL_SERVER
+#define MYSQL_SERVER
+#endif /* MYSQL_SERVER */
+
+#include <mysql_priv.h>
+
+#include "mysql_addons.h"
+#include "univ.i"
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c
new file mode 100644
index 00000000000..e01c2d6b800
--- /dev/null
+++ b/storage/xtradb/ibuf/ibuf0ibuf.c
@@ -0,0 +1,3646 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.c
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE 4
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE must be an even number!"
+#endif
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP PAGE_DATA
+
+#ifdef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "thr0loc.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "sync0sync.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+
+/* STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+ field. We store the information which affects the ordering of records, and
+ also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+ is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+ the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+ store the charset. In the compressed table format of 5.0.x we must add more
+ information here so that we can build a dummy 'index' struct which 5.0.x
+ can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format. The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field. */
+
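+/* For illustration only (compiled out): the two record formats described
+above can be told apart by the length of the second field, which is exactly
+what ibuf_rec_get_page_no() and ibuf_rec_get_space() below do: a one-byte
+second field means the record is in the >= 4.1.x format. */
+#if 0
+static ibool
+ibuf_rec_is_new_format_example(
+/*===========================*/
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	ulint	len;
+
+	rec_get_nth_field_old(rec, 1, &len);
+
+	return(len == 1);
+}
+#endif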
+
+/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access recursively the
+insert buffer tree and thus obeys the latching order. On the other hand, other
+i/o-handlers for other tablespaces may require access to the insert buffer,
+but because all kinds of latches they need to access there are later in the
+latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive use of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. Removing pages from the free list is
+more difficult. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list. We can then, in a
+separate mini-transaction, take them out of the free list and free them to
+the fsp system.
+
+To avoid deadlocks in the ibuf system, we divide file pages into three levels:
+
+(1) non-ibuf pages,
+(2) ibuf tree pages and the pages in the ibuf tree free list, and
+(3) ibuf bitmap pages.
+
+No OS thread is allowed to access higher level pages if it has latches to
+lower level pages; even if the thread owns a B-tree latch it must not access
+the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
+is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
+exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
+level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
+it uses synchronous aio, it can access any pages, as long as it obeys the
+access order rules. */
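+
+/* For illustration only (compiled out): the three page levels described
+above written out explicitly. The real code never materializes this
+classification; it is implied by ibuf_fixed_addr_page(), ibuf_page() and the
+way i/o-handler threads are dedicated to the different levels. */
+#if 0
+enum ibuf_page_level_example {
+	IBUF_PAGE_LEVEL_NON_IBUF = 1,	/* ordinary index and data pages */
+	IBUF_PAGE_LEVEL_TREE = 2,	/* ibuf tree pages and ibuf free
+					list pages */
+	IBUF_PAGE_LEVEL_BITMAP = 3	/* ibuf bitmap pages */
+};
+#endif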
+
+/** Buffer pool size per the maximum insert buffer size */
+#define IBUF_POOL_SIZE_PER_MAX_SIZE 2
+
+/** Table name for the insert buffer. */
+#define IBUF_TABLE_NAME "SYS_IBUF_TABLE"
+
+/** Operations that can currently be buffered. */
+UNIV_INTERN ibuf_use_t ibuf_use = IBUF_USE_INSERT;
+
+/** The insert buffer control structure */
+UNIV_INTERN ibuf_t* ibuf = NULL;
+
+/** Counter for ibuf_should_try() */
+UNIV_INTERN ulint ibuf_flush_count = 0;
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/** Number of tablespaces in the ibuf_counts array */
+#define IBUF_COUNT_N_SPACES 4
+/** Number of pages within each tablespace in the ibuf_counts array */
+#define IBUF_COUNT_N_PAGES 130000
+
+/** Buffered entry counts for file pages, used in debugging */
+static ulint ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES];
+
+/******************************************************************//**
+Checks that the indexes to ibuf_counts[][] are within limits. */
+UNIV_INLINE
+void
+ibuf_count_check(
+/*=============*/
+ ulint space_id, /*!< in: space identifier */
+ ulint page_no) /*!< in: page number */
+{
+ if (space_id < IBUF_COUNT_N_SPACES && page_no < IBUF_COUNT_N_PAGES) {
+ return;
+ }
+
+ fprintf(stderr,
+ "InnoDB: UNIV_IBUF_COUNT_DEBUG limits space_id and page_no\n"
+ "InnoDB: and breaks crash recovery.\n"
+ "InnoDB: space_id=%lu, should be 0<=space_id<%lu\n"
+ "InnoDB: page_no=%lu, should be 0<=page_no<%lu\n",
+ (ulint) space_id, (ulint) IBUF_COUNT_N_SPACES,
+ (ulint) page_no, (ulint) IBUF_COUNT_N_PAGES);
+ ut_error;
+}
+#endif
+
+/** @name Offsets to the per-page bits in the insert buffer bitmap */
+/* @{ */
+#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the
+ amount of free space */
+#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered
+ changes for the page */
+#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of
+ the ibuf tree, excluding the
+ root page, or is in the free
+ list of the ibuf */
+/* @} */
+
+/** The mutex used to block pessimistic inserts to ibuf trees */
+static mutex_t ibuf_pessimistic_insert_mutex;
+
+/** The mutex protecting the insert buffer structs */
+static mutex_t ibuf_mutex;
+
+/** The mutex protecting the insert buffer bitmaps */
+static mutex_t ibuf_bitmap_mutex;
+
+/** The area in pages from which contract looks for page numbers for merge */
+#define IBUF_MERGE_AREA 8
+
+/** Inside the merge area, pages whose buffered entry volume falls short of
+the maximum volume that can be buffered for a single page by at most
+1/IBUF_MERGE_THRESHOLD of that maximum are merged along with the page whose
+buffer became full */
+#define IBUF_MERGE_THRESHOLD 4
+
+/** In ibuf_contract at most this number of pages is read to memory in one
+batch, in order to merge the entries for them in the insert buffer */
+#define IBUF_MAX_N_PAGES_MERGED IBUF_MERGE_AREA
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
+many pages, we start to contract it in connection with inserts there, using
+non-synchronous contract */
+#define IBUF_CONTRACT_ON_INSERT_NON_SYNC 0
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
+many pages, we start to contract it in connection with inserts there, using
+synchronous contract */
+#define IBUF_CONTRACT_ON_INSERT_SYNC 5
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by
+this many pages, we start to contract it using synchronous contract, but do
+not insert */
+#define IBUF_CONTRACT_DO_NOT_INSERT 10
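+
+/* For illustration only (compiled out): how the three thresholds above are
+intended to stack as ibuf->size grows past ibuf->max_size. The actual
+decisions are made in the insert and contract routines later in this file. */
+#if 0
+static void
+ibuf_contract_policy_example(
+/*=========================*/
+	ulint	size,		/*!< in: current ibuf size, in pages */
+	ulint	max_size)	/*!< in: ibuf->max_size, in pages */
+{
+	if (size >= max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+		/* contract synchronously and do not buffer the insert */
+	} else if (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC) {
+		/* buffer the insert, then contract synchronously */
+	} else if (size >= max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+		/* buffer the insert, then contract non-synchronously */
+	} else {
+		/* no contraction is triggered by this insert */
+	}
+}
+#endif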
+
+/* TODO: how to cope with drop table if there are records in the insert
+buffer for the indexes of the table? Is there actually any problem,
+because ibuf merge is done to a page when it is read in, and it is
+still physically like the index page even if the index would have been
+dropped! So, there seems to be no problem. */
+
+/******************************************************************//**
+Sets the flag in the current OS thread local storage denoting that it is
+inside an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_enter(void)
+/*============*/
+{
+ ibool* ptr;
+
+ ptr = thr_local_get_in_ibuf_field();
+
+ ut_ad(*ptr == FALSE);
+
+ *ptr = TRUE;
+}
+
+/******************************************************************//**
+Sets the flag in the current OS thread local storage denoting that it is
+exiting an insert buffer routine. */
+UNIV_INLINE
+void
+ibuf_exit(void)
+/*===========*/
+{
+ ibool* ptr;
+
+ ptr = thr_local_get_in_ibuf_field();
+
+ ut_ad(*ptr == TRUE);
+
+ *ptr = FALSE;
+}
+
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden for threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INTERN
+ibool
+ibuf_inside(void)
+/*=============*/
+{
+ return(*thr_local_get_in_ibuf_field());
+}
+
+/******************************************************************//**
+Gets the ibuf header page and x-latches it.
+@return insert buffer header page */
+static
+page_t*
+ibuf_header_page_get(
+/*=================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ ut_ad(!ibuf_inside());
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Gets the root page and x-latches it.
+@return insert buffer tree root page */
+static
+page_t*
+ibuf_tree_root_get(
+/*===============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ ut_ad(ibuf_inside());
+
+ mtr_x_lock(dict_index_get_lock(ibuf->index), mtr);
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ return(buf_block_get_frame(block));
+}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ ibuf_count_check(space, page_no);
+
+ return(ibuf_counts[space][page_no]);
+}
+
+/******************************************************************//**
+Sets the ibuf count for a given page. */
+static
+void
+ibuf_count_set(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: page number */
+ ulint val) /*!< in: value to set */
+{
+ ibuf_count_check(space, page_no);
+ ut_a(val < UNIV_PAGE_SIZE);
+
+ ibuf_counts[space][page_no] = val;
+}
+#endif
+
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+UNIV_INTERN
+void
+ibuf_close(void)
+/*============*/
+{
+ mutex_free(&ibuf_pessimistic_insert_mutex);
+ memset(&ibuf_pessimistic_insert_mutex,
+ 0x0, sizeof(ibuf_pessimistic_insert_mutex));
+
+ mutex_free(&ibuf_mutex);
+ memset(&ibuf_mutex, 0x0, sizeof(ibuf_mutex));
+
+ mutex_free(&ibuf_bitmap_mutex);
+	memset(&ibuf_bitmap_mutex, 0x0, sizeof(ibuf_bitmap_mutex));
+
+ mem_free(ibuf);
+ ibuf = NULL;
+}
+
+/******************************************************************//**
+Updates the size information of the ibuf, assuming the segment size has not
+changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+ const page_t* root, /*!< in: ibuf tree root */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ ibuf->free_list_len = flst_get_len(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST, mtr);
+
+ ibuf->height = 1 + btr_page_get_level(root, mtr);
+
+ /* the '1 +' is the ibuf header page */
+ ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
+
+ ibuf->empty = page_get_n_recs(root) == 0;
+}
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and initializes
+the data structures for the insert buffer. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+ page_t* root;
+ mtr_t mtr;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ dict_index_t* index;
+ ulint n_used;
+ page_t* header_page;
+ ulint error;
+
+ ibuf = mem_alloc(sizeof(ibuf_t));
+
+ memset(ibuf, 0, sizeof(*ibuf));
+
+ /* Note that also a pessimistic delete can sometimes make a B-tree
+ grow in size, as the references on the upper levels of the tree can
+ change */
+
+	ibuf->max_size = ut_min(buf_pool_get_curr_size() / UNIV_PAGE_SIZE
+				/ IBUF_POOL_SIZE_PER_MAX_SIZE,
+				(ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
+
+ srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
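+
+	/* For illustration: with a 1GB buffer pool and 16KB pages the pool
+	holds 65536 pages, so the cap derived from
+	IBUF_POOL_SIZE_PER_MAX_SIZE is 65536 / 2 = 32768 pages (512MB);
+	srv_ibuf_max_size can only lower this cap further. */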
+
+ mutex_create(&ibuf_pessimistic_insert_mutex,
+ SYNC_IBUF_PESS_INSERT_MUTEX);
+
+ mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX);
+
+ mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);
+
+ mtr_start(&mtr);
+
+ mutex_enter(&ibuf_mutex);
+
+ mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, NULL), &mtr);
+
+ header_page = ibuf_header_page_get(&mtr);
+
+ fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ &n_used, &mtr);
+ ibuf_enter();
+
+ ut_ad(n_used >= 2);
+
+ ibuf->seg_size = n_used;
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ root = buf_block_get_frame(block);
+ }
+
+ ibuf_size_update(root, &mtr);
+ mutex_exit(&ibuf_mutex);
+
+ mtr_commit(&mtr);
+
+ ibuf_exit();
+
+ heap = mem_heap_create(450);
+
+ /* Use old-style record format for the insert buffer. */
+ table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0);
+ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0);
+
+ table->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID);
+
+ dict_table_add_to_cache(table, heap);
+ mem_heap_free(heap);
+
+ index = dict_mem_index_create(
+ IBUF_TABLE_NAME, "CLUST_IND",
+ IBUF_SPACE_ID, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 1);
+
+ dict_mem_index_add_field(index, "DUMMY_COLUMN", 0);
+
+ index->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID);
+
+ error = dict_index_add_to_cache(table, index,
+ FSP_IBUF_TREE_ROOT_PAGE_NO, FALSE);
+ ut_a(error == DB_SUCCESS);
+
+ ibuf->index = dict_table_get_first_index(table);
+}
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Initializes an ibuf bitmap page. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+ buf_block_t* block, /*!< in: bitmap page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+ ulint byte_offset;
+ ulint zip_size = buf_block_get_zip_size(block);
+
+ ut_a(ut_is_2pow(zip_size));
+
+ page = buf_block_get_frame(block);
+ fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP);
+
+ /* Write all zeros to the bitmap */
+
+ if (!zip_size) {
+ byte_offset = UT_BITS_IN_BYTES(UNIV_PAGE_SIZE
+ * IBUF_BITS_PER_PAGE);
+ } else {
+ byte_offset = UT_BITS_IN_BYTES(zip_size * IBUF_BITS_PER_PAGE);
+ }
+
+ memset(page + IBUF_BITMAP, 0, byte_offset);
+
+ /* The remaining area (up to the page trailer) is uninitialized. */
+
+#ifndef UNIV_HOTBACKUP
+ mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (block) {
+ ibuf_bitmap_page_init(block, mtr);
+ }
+
+ return(ptr);
+}
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Gets the desired bits for a given page from a bitmap page.
+@return value of bits */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_get_bits(
+/*======================*/
+ const page_t* page, /*!< in: bitmap page */
+ ulint page_no,/*!< in: page whose bits to get */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint bit, /*!< in: IBUF_BITMAP_FREE,
+ IBUF_BITMAP_BUFFERED, ... */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mtr containing an
+ x-latch to the bitmap page */
+{
+ ulint byte_offset;
+ ulint bit_offset;
+ ulint map_byte;
+ ulint value;
+
+ ut_ad(bit < IBUF_BITS_PER_PAGE);
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE % 2 != 0"
+#endif
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+
+ if (!zip_size) {
+ bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE
+ + bit;
+ } else {
+ bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE
+ + bit;
+ }
+
+ byte_offset = bit_offset / 8;
+ bit_offset = bit_offset % 8;
+
+ ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE);
+
+ map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+ value = ut_bit_get_nth(map_byte, bit_offset);
+
+ if (bit == IBUF_BITMAP_FREE) {
+ ut_ad(bit_offset + 1 < 8);
+
+ value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
+ }
+
+ return(value);
+}
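+
+/* Worked example: for an uncompressed 16KB page with page_no = 5 and
+bit = IBUF_BITMAP_FREE, bit_offset = (5 % 16384) * 4 + 0 = 20, so
+byte_offset = 2 and bit_offset = 4: the two "free" bits of page 5 are bits
+4 and 5 of the byte at offset IBUF_BITMAP + 2 on its bitmap page. */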
+
+/********************************************************************//**
+Sets the desired bit for a given page in a bitmap page. */
+static
+void
+ibuf_bitmap_page_set_bits(
+/*======================*/
+ page_t* page, /*!< in: bitmap page */
+ ulint page_no,/*!< in: page whose bits to set */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint bit, /*!< in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... */
+ ulint val, /*!< in: value to set */
+ mtr_t* mtr) /*!< in: mtr containing an x-latch to the bitmap page */
+{
+ ulint byte_offset;
+ ulint bit_offset;
+ ulint map_byte;
+
+ ut_ad(bit < IBUF_BITS_PER_PAGE);
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE % 2 != 0"
+#endif
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((bit != IBUF_BITMAP_BUFFERED) || (val != FALSE)
+ || (0 == ibuf_count_get(page_get_space_id(page),
+ page_no)));
+#endif
+ if (!zip_size) {
+ bit_offset = (page_no % UNIV_PAGE_SIZE) * IBUF_BITS_PER_PAGE
+ + bit;
+ } else {
+ bit_offset = (page_no & (zip_size - 1)) * IBUF_BITS_PER_PAGE
+ + bit;
+ }
+
+ byte_offset = bit_offset / 8;
+ bit_offset = bit_offset % 8;
+
+ ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE);
+
+ map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+
+ if (bit == IBUF_BITMAP_FREE) {
+ ut_ad(bit_offset + 1 < 8);
+ ut_ad(val <= 3);
+
+ map_byte = ut_bit_set_nth(map_byte, bit_offset, val / 2);
+ map_byte = ut_bit_set_nth(map_byte, bit_offset + 1, val % 2);
+ } else {
+ ut_ad(val <= 1);
+ map_byte = ut_bit_set_nth(map_byte, bit_offset, val);
+ }
+
+ mlog_write_ulint(page + IBUF_BITMAP + byte_offset, map_byte,
+ MLOG_1BYTE, mtr);
+}
+
+/********************************************************************//**
+Calculates the bitmap page number for a given page number.
+@return the bitmap page number where the file page is mapped */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_no_calc(
+/*=====================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no) /*!< in: tablespace page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(FSP_IBUF_BITMAP_OFFSET
+ + (page_no & ~(UNIV_PAGE_SIZE - 1)));
+ } else {
+ return(FSP_IBUF_BITMAP_OFFSET
+ + (page_no & ~(zip_size - 1)));
+ }
+}
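+
+/* Worked example: with uncompressed 16KB pages, pages 0..16383 are mapped
+to bitmap page FSP_IBUF_BITMAP_OFFSET, pages 16384..32767 to
+FSP_IBUF_BITMAP_OFFSET + 16384, and so on: each bitmap page describes one
+UNIV_PAGE_SIZE-sized group of pages in its tablespace. */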
+
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched */
+static
+page_t*
+ibuf_bitmap_get_map_page_func(
+/*==========================*/
+ ulint space, /*!< in: space id of the file page */
+ ulint page_no,/*!< in: page number of the file page */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ block = buf_page_get_gen(space, zip_size,
+ ibuf_bitmap_page_no_calc(zip_size, page_no),
+ RW_X_LATCH, NULL, BUF_GET,
+ file, line, mtr);
+ buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
+
+ return(buf_block_get_frame(block));
+}
+
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched
+@param space in: space id of the file page
+@param page_no in: page number of the file page
+@param zip_size in: compressed page size in bytes; 0 for uncompressed pages
+@param mtr in: mini-transaction */
+#define ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr) \
+ ibuf_bitmap_get_map_page_func(space, page_no, zip_size, \
+ __FILE__, __LINE__, mtr)
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INLINE
+void
+ibuf_set_free_bits_low(
+/*===================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const buf_block_t* block, /*!< in: index page; free bits are set if
+ the index is non-clustered and page
+ level is 0 */
+ ulint val, /*!< in: value to set: < 4 */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ page_t* bitmap_page;
+ ulint space;
+ ulint page_no;
+
+ if (!page_is_leaf(buf_block_get_frame(block))) {
+
+ return;
+ }
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+#ifdef UNIV_IBUF_DEBUG
+# if 0
+ fprintf(stderr,
+ "Setting space %lu page %lu free bits to %lu should be %lu\n",
+ space, page_no, val,
+ ibuf_index_page_calc_free(zip_size, block));
+# endif
+
+ ut_a(val <= ibuf_index_page_calc_free(zip_size, block));
+#endif /* UNIV_IBUF_DEBUG */
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, val, mtr);
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val) /*!< in: value to set: < 4 */
+{
+ mtr_t mtr;
+ page_t* page;
+ page_t* bitmap_page;
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+
+ page = buf_block_get_frame(block);
+
+ if (!page_is_leaf(page)) {
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ zip_size = buf_block_get_zip_size(block);
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr);
+
+#ifdef UNIV_IBUF_DEBUG
+ if (max_val != ULINT_UNDEFINED) {
+ ulint old_val;
+
+ old_val = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, &mtr);
+# if 0
+ if (old_val != max_val) {
+ fprintf(stderr,
+ "Ibuf: page %lu old val %lu max val %lu\n",
+ page_get_page_no(page),
+ old_val, max_val);
+ }
+# endif
+
+ ut_a(old_val <= max_val);
+ }
+# if 0
+ fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n",
+ page_get_page_no(page), val,
+ ibuf_index_page_calc_free(zip_size, block));
+# endif
+
+ ut_a(val <= ibuf_index_page_calc_free(zip_size, block));
+#endif /* UNIV_IBUF_DEBUG */
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, val, &mtr);
+ mtr_commit(&mtr);
+}
+
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block) /*!< in: index page; free bits are set to 0
+ if the index is a non-clustered
+ non-unique, and page level is 0 */
+{
+ ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint before;
+ ulint after;
+
+ ut_a(!buf_block_get_page_zip(block));
+
+ before = ibuf_index_page_calc_free_bits(0, max_ins_size);
+
+ after = ibuf_index_page_calc_free(0, block);
+
+ /* This approach cannot be used on compressed pages, since the
+ computed value of "before" often does not match the current
+ state of the bitmap. This is because the free space may
+ increase or decrease when a compressed page is reorganized. */
+ if (before != after) {
+ ibuf_set_free_bits_low(0, block, after, mtr);
+ }
+}
+
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ page_t* bitmap_page;
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+ ulint after;
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ zip_size = buf_block_get_zip_size(block);
+
+ ut_a(page_is_leaf(buf_block_get_frame(block)));
+ ut_a(zip_size);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+
+ after = ibuf_index_page_calc_free_zip(zip_size, block);
+
+ if (after == 0) {
+ /* We move the page to the front of the buffer pool LRU list:
+ the purpose of this is to prevent those pages to which we
+ cannot make inserts using the insert buffer from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, after, mtr);
+}
+
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint state;
+
+ /* As we have to x-latch two random bitmap pages, we have to acquire
+ the bitmap mutex to prevent a deadlock with a similar operation
+ performed by another OS thread. */
+
+ mutex_enter(&ibuf_bitmap_mutex);
+
+ state = ibuf_index_page_calc_free(zip_size, block1);
+
+ ibuf_set_free_bits_low(zip_size, block1, state, mtr);
+
+ state = ibuf_index_page_calc_free(zip_size, block2);
+
+ ibuf_set_free_bits_low(zip_size, block2, state, mtr);
+
+ mutex_exit(&ibuf_bitmap_mutex);
+}
+
+/**********************************************************************//**
+Returns TRUE if the page is one of the fixed address ibuf pages.
+@return TRUE if a fixed address ibuf i/o page */
+UNIV_INLINE
+ibool
+ibuf_fixed_addr_page(
+/*=================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ return((space == IBUF_SPACE_ID && page_no == IBUF_TREE_ROOT_PAGE_NO)
+ || ibuf_bitmap_page(zip_size, page_no));
+}
+
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page(
+/*======*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr which will contain an x-latch to the
+ bitmap page if the page is not one of the fixed
+			address ibuf pages, or NULL, in which case a new
+			mini-transaction is created. */
+{
+ ibool ret;
+ mtr_t local_mtr;
+ page_t* bitmap_page;
+
+ ut_ad(!recv_no_ibuf_operations);
+
+ if (ibuf_fixed_addr_page(space, zip_size, page_no)) {
+
+ return(TRUE);
+ } else if (space != IBUF_SPACE_ID) {
+
+ return(FALSE);
+ }
+
+ ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE);
+
+ if (mtr == NULL) {
+ mtr = &local_mtr;
+ mtr_start(mtr);
+ }
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+
+ ret = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_IBUF, mtr);
+
+ if (mtr == &local_mtr) {
+ mtr_commit(mtr);
+ }
+
+ return(ret);
+}
+
+/********************************************************************//**
+Returns the page number field of an ibuf record.
+@return page number */
+static
+ulint
+ibuf_rec_get_page_no(
+/*=================*/
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(ibuf_inside());
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+
+ if (len == 1) {
+ /* This is of the >= 4.1.x record format */
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ field = rec_get_nth_field_old(rec, 2, &len);
+ } else {
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ }
+
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+}
+
+/********************************************************************//**
+Returns the space id field of an ibuf record. For < 4.1.x format records
+returns 0.
+@return space id */
+static
+ulint
+ibuf_rec_get_space(
+/*===============*/
+ const rec_t* rec) /*!< in: ibuf record */
+{
+ const byte* field;
+ ulint len;
+
+ ut_ad(ibuf_inside());
+ ut_ad(rec_get_n_fields_old(rec) > 2);
+
+ field = rec_get_nth_field_old(rec, 1, &len);
+
+ if (len == 1) {
+ /* This is of the >= 4.1.x record format */
+
+ ut_a(trx_sys_multiple_tablespace_format);
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_a(len == 4);
+
+ return(mach_read_from_4(field));
+ }
+
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ return(0);
+}
+
+/********************************************************************//**
+Creates a dummy index for inserting a record to a non-clustered index.
+
+@return dummy index */
+static
+dict_index_t*
+ibuf_dummy_index_create(
+/*====================*/
+ ulint n, /*!< in: number of fields */
+ ibool comp) /*!< in: TRUE=use compact record format */
+{
+ dict_table_t* table;
+ dict_index_t* index;
+
+ table = dict_mem_table_create("IBUF_DUMMY",
+ DICT_HDR_SPACE, n,
+ comp ? DICT_TF_COMPACT : 0);
+
+ index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY",
+ DICT_HDR_SPACE, 0, n);
+
+ index->table = table;
+
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ return(index);
+}
+/********************************************************************//**
+Add a column to the dummy index */
+static
+void
+ibuf_dummy_index_add_col(
+/*=====================*/
+ dict_index_t* index, /*!< in: dummy index */
+ const dtype_t* type, /*!< in: the data type of the column */
+ ulint len) /*!< in: length of the column */
+{
+ ulint i = index->table->n_def;
+ dict_mem_table_add_col(index->table, NULL, NULL,
+ dtype_get_mtype(type),
+ dtype_get_prtype(type),
+ dtype_get_len(type));
+ dict_index_add_col(index, index->table,
+ dict_table_get_nth_col(index->table, i), len);
+}
+/********************************************************************//**
+Deallocates a dummy index for inserting a record to a non-clustered index. */
+static
+void
+ibuf_dummy_index_free(
+/*==================*/
+ dict_index_t* index) /*!< in, own: dummy index */
+{
+ dict_table_t* table = index->table;
+
+ dict_mem_index_free(index);
+ dict_mem_table_free(table);
+}
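+
+/* For illustration only (compiled out): the typical lifecycle of the dummy
+index, as used by ibuf_build_entry_from_ibuf_rec() and ibuf_rec_get_volume()
+below. */
+#if 0
+static void
+ibuf_dummy_index_example(
+/*=====================*/
+	const dtype_t*	type)	/*!< in: type of the single column */
+{
+	dict_index_t*	index = ibuf_dummy_index_create(1, FALSE);
+
+	ibuf_dummy_index_add_col(index, type, dtype_get_len(type));
+
+	/* ... build the dtuple and apply it to the secondary index ... */
+
+	ibuf_dummy_index_free(index);
+}
+#endif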
+
+/*********************************************************************//**
+Builds the entry to insert into a non-clustered index when we have the
+corresponding record in an ibuf index.
+
+NOTE that as we copy pointers to fields in ibuf_rec, the caller must
+hold a latch to the ibuf_rec page as long as the entry is used!
+
+@return own: entry to insert to a non-clustered index */
+UNIV_INLINE
+dtuple_t*
+ibuf_build_entry_pre_4_1_x(
+/*=======================*/
+ const rec_t* ibuf_rec, /*!< in: record in an insert buffer */
+ mem_heap_t* heap, /*!< in: heap where built */
+ dict_index_t** pindex) /*!< out, own: dummy index that
+ describes the entry */
+{
+ ulint i;
+ ulint len;
+ const byte* types;
+ dtuple_t* tuple;
+ ulint n_fields;
+
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
+ tuple = dtuple_create(heap, n_fields);
+ types = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+ ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+
+ for (i = 0; i < n_fields; i++) {
+ const byte* data;
+ dfield_t* field;
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
+
+ dfield_set_data(field, data, len);
+
+ dtype_read_for_order_and_null_size(
+ dfield_get_type(field),
+ types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ }
+
+ *pindex = ibuf_dummy_index_create(n_fields, FALSE);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Builds the entry to insert into a non-clustered index when we have the
+corresponding record in an ibuf index.
+
+NOTE that as we copy pointers to fields in ibuf_rec, the caller must
+hold a latch to the ibuf_rec page as long as the entry is used!
+
+@return own: entry to insert to a non-clustered index */
+static
+dtuple_t*
+ibuf_build_entry_from_ibuf_rec(
+/*===========================*/
+ const rec_t* ibuf_rec, /*!< in: record in an insert buffer */
+ mem_heap_t* heap, /*!< in: heap where built */
+ dict_index_t** pindex) /*!< out, own: dummy index that
+ describes the entry */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ ulint n_fields;
+ const byte* types;
+ const byte* data;
+ ulint len;
+ ulint i;
+ dict_index_t* index;
+
+ data = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+ if (len > 1) {
+		/* This is a < 4.1.x format record */
+
+ return(ibuf_build_entry_pre_4_1_x(ibuf_rec, heap, pindex));
+ }
+
+	/* This is a >= 4.1.x format record */
+
+ ut_a(trx_sys_multiple_tablespace_format);
+ ut_a(*data == 0);
+ ut_a(rec_get_n_fields_old(ibuf_rec) > 4);
+
+ n_fields = rec_get_n_fields_old(ibuf_rec) - 4;
+
+ tuple = dtuple_create(heap, n_fields);
+
+ types = rec_get_nth_field_old(ibuf_rec, 3, &len);
+
+ ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1);
+ index = ibuf_dummy_index_create(
+ n_fields, len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
+ /* compact record format */
+ len--;
+ ut_a(*types == 0);
+ types++;
+ }
+
+ ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = rec_get_nth_field_old(ibuf_rec, i + 4, &len);
+
+ dfield_set_data(field, data, len);
+
+ dtype_new_read_for_order_and_null_size(
+ dfield_get_type(field),
+ types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+ ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
+ }
+
+ /* Prevent an ut_ad() failure in page_zip_write_rec() by
+ adding system columns to the dummy table pointed to by the
+ dummy secondary index. The insert buffer is only used for
+ secondary indexes, whose records never contain any system
+ columns, such as DB_TRX_ID. */
+ ut_d(dict_table_add_system_columns(index->table, index->table->heap));
+
+ *pindex = index;
+
+ return(tuple);
+}
+
+/********************************************************************//**
+Returns the space taken by a stored non-clustered index entry if converted to
+an index record.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_rec_get_volume(
+/*================*/
+ const rec_t* ibuf_rec)/*!< in: ibuf record */
+{
+ dtype_t dtype;
+ ibool new_format = FALSE;
+ ulint data_size = 0;
+ ulint n_fields;
+ const byte* types;
+ const byte* data;
+ ulint len;
+ ulint i;
+ ulint comp;
+
+ ut_ad(ibuf_inside());
+ ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
+
+ data = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+ if (len > 1) {
+ /* < 4.1.x format record */
+
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
+
+ types = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+ ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ comp = 0;
+ } else {
+ /* >= 4.1.x format record */
+
+ ut_a(trx_sys_multiple_tablespace_format);
+ ut_a(*data == 0);
+
+ types = rec_get_nth_field_old(ibuf_rec, 3, &len);
+
+ comp = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
+ ut_a(comp <= 1);
+ if (comp) {
+ /* compact record format */
+ ulint volume;
+ dict_index_t* dummy_index;
+ mem_heap_t* heap = mem_heap_create(500);
+ dtuple_t* entry = ibuf_build_entry_from_ibuf_rec(
+ ibuf_rec, heap, &dummy_index);
+ volume = rec_get_converted_size(dummy_index, entry, 0);
+ ibuf_dummy_index_free(dummy_index);
+ mem_heap_free(heap);
+ return(volume + page_dir_calc_reserved_space(1));
+ }
+
+ n_fields = rec_get_n_fields_old(ibuf_rec) - 4;
+
+ new_format = TRUE;
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ if (new_format) {
+ data = rec_get_nth_field_old(ibuf_rec, i + 4, &len);
+
+ dtype_new_read_for_order_and_null_size(
+ &dtype, types + i
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+ } else {
+ data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
+
+ dtype_read_for_order_and_null_size(
+ &dtype, types + i
+ * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+ }
+
+ if (len == UNIV_SQL_NULL) {
+ data_size += dtype_get_sql_null_size(&dtype, comp);
+ } else {
+ data_size += len;
+ }
+ }
+
+ return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
+ + page_dir_calc_reserved_space(1));
+}
+
+/*********************************************************************//**
+Builds the tuple to insert into an ibuf tree when we have an entry for a
+non-clustered index.
+
+NOTE that the original entry must be kept because we copy pointers to
+its fields.
+
+@return own: entry to insert into an ibuf index tree */
+static
+dtuple_t*
+ibuf_entry_build(
+/*=============*/
+ dict_index_t* index, /*!< in: non-clustered index */
+ const dtuple_t* entry, /*!< in: entry for a non-clustered index */
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number where entry should
+ be inserted */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ const dfield_t* entry_field;
+ ulint n_fields;
+ byte* buf;
+ byte* buf2;
+ ulint i;
+
+ /* Starting from 4.1.x, we have to build a tuple whose
+ (1) first field is the space id,
+ (2) the second field a single marker byte (0) to tell that this
+ is a new format record,
+ (3) the third contains the page number, and
+	(4) the fourth contains the relevant type information of each data
+ field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is
+ (a) 0 for b-trees in the old format, and
+ (b) 1 for b-trees in the compact format, the first byte of the field
+ being the marker (0);
+ (5) and the rest of the fields are copied from entry. All fields
+ in the tuple are ordered like the type binary in our insert buffer
+ tree. */
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ tuple = dtuple_create(heap, n_fields + 4);
+
+ /* Store the space id in tuple */
+
+ field = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the marker byte field in tuple */
+
+ field = dtuple_get_nth_field(tuple, 1);
+
+ buf = mem_heap_alloc(heap, 1);
+
+ /* We set the marker byte zero */
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, 2);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the type info in buf2, and add the fields from entry to
+ tuple */
+ buf2 = mem_heap_alloc(heap, n_fields
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ + dict_table_is_comp(index->table));
+ if (dict_table_is_comp(index->table)) {
+ *buf2++ = 0; /* write the compact format indicator */
+ }
+ for (i = 0; i < n_fields; i++) {
+ ulint fixed_len;
+ const dict_field_t* ifield;
+
+ /* We add 4 below because we have the 4 extra fields at the
+ start of an ibuf record */
+
+ field = dtuple_get_nth_field(tuple, i + 4);
+ entry_field = dtuple_get_nth_field(entry, i);
+ dfield_copy(field, entry_field);
+
+ ifield = dict_index_get_nth_field(index, i);
+ /* Prefix index columns of fixed-length columns are of
+ fixed length. However, in the function call below,
+ dfield_get_type(entry_field) contains the fixed length
+ of the column in the clustered index. Replace it with
+ the fixed length of the secondary index column. */
+ fixed_len = ifield->fixed_len;
+
+#ifdef UNIV_DEBUG
+ if (fixed_len) {
+ /* dict_index_add_col() should guarantee these */
+ ut_ad(fixed_len <= (ulint)
+ dfield_get_type(entry_field)->len);
+ if (ifield->prefix_len) {
+ ut_ad(ifield->prefix_len == fixed_len);
+ } else {
+ ut_ad(fixed_len == (ulint)
+ dfield_get_type(entry_field)->len);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ dtype_new_store_for_order_and_null_size(
+ buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
+ dfield_get_type(entry_field), fixed_len);
+ }
+
+ /* Store the type info in buf2 to field 3 of tuple */
+
+ field = dtuple_get_nth_field(tuple, 3);
+
+ if (dict_table_is_comp(index->table)) {
+ buf2--;
+ }
+
+ dfield_set_data(field, buf2, n_fields
+ * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ + dict_table_is_comp(index->table));
+ /* Set all the types in the new tuple binary */
+
+ dtuple_set_types_binary(tuple, n_fields + 4);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for < 4.1.x format records
+@return own: search tuple */
+static
+dtuple_t*
+ibuf_search_tuple_build(
+/*====================*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+
+ ut_a(space == 0);
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ ut_a(!trx_sys_multiple_tablespace_format);
+
+ tuple = dtuple_create(heap, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ dtuple_set_types_binary(tuple, 1);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records.
+@return own: search tuple */
+static
+dtuple_t*
+ibuf_new_search_tuple_build(
+/*========================*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number */
+ mem_heap_t* heap) /*!< in: heap into which to build */
+{
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* buf;
+
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ tuple = dtuple_create(heap, 3);
+
+ /* Store the space id in tuple */
+
+ field = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, space);
+
+ dfield_set_data(field, buf, 4);
+
+ /* Store the new format record marker byte */
+
+ field = dtuple_get_nth_field(tuple, 1);
+
+ buf = mem_heap_alloc(heap, 1);
+
+ mach_write_to_1(buf, 0);
+
+ dfield_set_data(field, buf, 1);
+
+ /* Store the page number in tuple */
+
+ field = dtuple_get_nth_field(tuple, 2);
+
+ buf = mem_heap_alloc(heap, 4);
+
+ mach_write_to_4(buf, page_no);
+
+ dfield_set_data(field, buf, 4);
+
+ dtuple_set_types_binary(tuple, 3);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+dare to start a pessimistic insert to the insert buffer.
+@return TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_enough_free_for_insert(void)
+/*==================================*/
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ /* We want a big margin of free pages, because a B-tree can sometimes
+ grow in size also if records are deleted from it, as the node pointers
+ can change, and we must make sure that we are able to delete the
+ inserts buffered for pages that we read to the buffer pool, without
+ any risk of running out of free space in the insert buffer. */
+
+ return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height);
+}
+
+/*********************************************************************//**
+Checks if there are enough pages in the free list of the ibuf tree that we
+should remove them and free them to the file space management.
+@return TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_too_much_free(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&ibuf_mutex));
+
+ return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height);
+}
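+
+/* Worked example: with ibuf->size = 100 pages and ibuf->height = 3, a
+pessimistic insert requires free_list_len >= 100/2 + 3*3 = 59 pages, and
+pages start to be handed back to the file space management once the free
+list reaches 3 + 100/2 + 3*3 = 62 pages. */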
+
+/*********************************************************************//**
+Allocates a new page from the ibuf file segment and adds it to the free
+list.
+@return DB_SUCCESS, or DB_STRONG_FAIL if no space left */
+static
+ulint
+ibuf_add_free_page(void)
+/*====================*/
+{
+ mtr_t mtr;
+ page_t* header_page;
+ ulint flags;
+ ulint zip_size;
+ ulint page_no;
+ page_t* page;
+ page_t* root;
+ page_t* bitmap_page;
+
+ mtr_start(&mtr);
+
+ /* Acquire the fsp latch before the ibuf header, obeying the latching
+ order */
+ mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ header_page = ibuf_header_page_get(&mtr);
+
+ /* Allocate a new page: NOTE that if the page has been a part of a
+ non-clustered index which has subsequently been dropped, then the
+ page may have buffered inserts in the insert buffer, and these
+ should be deleted from there. These get deleted when the page
+	allocation creates the page in the buffer pool. Thus the call below
+	may end up calling the insert buffer routines and, as we do not yet
+	hold any latches on insert buffer tree pages, these routines can run
+	without a risk of a deadlock. This is why we created a special ibuf
+ header page apart from the ibuf tree. */
+
+ page_no = fseg_alloc_free_page(
+ header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
+ &mtr);
+
+ if (page_no == FIL_NULL) {
+ mtr_commit(&mtr);
+
+ return(DB_STRONG_FAIL);
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+
+ page = buf_block_get_frame(block);
+ }
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+
+ root = ibuf_tree_root_get(&mtr);
+
+ /* Add the page to the free list and update the ibuf size data */
+
+ flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST,
+ MLOG_2BYTES, &mtr);
+
+ ibuf->seg_size++;
+ ibuf->free_list_len++;
+
+ /* Set the bit indicating that this page is now an ibuf tree page
+ (level 2 page) */
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ IBUF_SPACE_ID, page_no, zip_size, &mtr);
+
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, TRUE, &mtr);
+
+ mtr_commit(&mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Removes a page from the free list and frees it to the fsp system. */
+static
+void
+ibuf_remove_free_page(void)
+/*=======================*/
+{
+ mtr_t mtr;
+ mtr_t mtr2;
+ page_t* header_page;
+ ulint flags;
+ ulint zip_size;
+ ulint page_no;
+ page_t* page;
+ page_t* root;
+ page_t* bitmap_page;
+
+ mtr_start(&mtr);
+
+ /* Acquire the fsp latch before the ibuf header, obeying the latching
+ order */
+ mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+
+ header_page = ibuf_header_page_get(&mtr);
+
+ /* Prevent pessimistic inserts to insert buffer trees for a while */
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+
+ if (!ibuf_data_too_much_free()) {
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ mtr_start(&mtr2);
+
+ root = ibuf_tree_root_get(&mtr2);
+
+ page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ &mtr2).page;
+
+ /* NOTE that we must release the latch on the ibuf tree root
+ because in fseg_free_page we access level 1 pages, and the root
+ is a level 2 page. */
+
+ mtr_commit(&mtr2);
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ /* Since pessimistic inserts were prevented, we know that the
+ page is still in the free list. NOTE that also deletes may take
+ pages from the free list, but they take them from the start, and
+ the free list was so long that they cannot have taken the last
+ page from it. */
+
+ fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+ IBUF_SPACE_ID, page_no, &mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no);
+#endif
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+
+ root = ibuf_tree_root_get(&mtr);
+
+ ut_ad(page_no == flst_get_last(root + PAGE_HEADER
+ + PAGE_BTR_IBUF_FREE_LIST, &mtr).page);
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ page = buf_block_get_frame(block);
+ }
+
+ /* Remove the page from the free list and update the ibuf size data */
+
+ flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+ page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+ ibuf->seg_size--;
+ ibuf->free_list_len--;
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+	/* Set the bit indicating that this page is no longer an ibuf
+	tree page (level 2 page) */
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ IBUF_SPACE_ID, page_no, zip_size, &mtr);
+
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+ buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no);
+#endif
+ mtr_commit(&mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+}
+
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void)
+/*========================*/
+{
+ ulint i;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(fil_space_get_latch(IBUF_SPACE_ID, NULL),
+ RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rw_lock_get_x_lock_count(
+ fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1);
+
+ ut_ad(!ibuf_inside());
+
+ /* NOTE: We require that the thread did not own the latch before,
+ because then we know that we can obey the correct latching order
+ for ibuf latches */
+
+ if (!ibuf) {
+ /* Not yet initialized; not sure if this is possible, but
+ does no harm to check for it. */
+
+ return;
+ }
+
+ /* Free at most a few pages at a time, so that we do not delay the
+ requested service too much */
+
+ for (i = 0; i < 4; i++) {
+
+ mutex_enter(&ibuf_mutex);
+
+ if (!ibuf_data_too_much_free()) {
+
+ mutex_exit(&ibuf_mutex);
+
+ return;
+ }
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_remove_free_page();
+ }
+}
+
+/*********************************************************************//**
+Reads page numbers from a leaf in an ibuf tree.
+@return a lower limit for the combined volume of records which will be
+merged */
+static
+ulint
+ibuf_get_merge_page_nos(
+/*====================*/
+ ibool contract,/*!< in: TRUE if this function is called to
+ contract the tree, FALSE if this is called
+				when a single page becomes full and we check
+				whether it pays to also read nearby pages */
+ rec_t* rec, /*!< in: record from which we read up and down
+ in the chain of records */
+ ulint* space_ids,/*!< in/out: space id's of the pages */
+ ib_int64_t* space_versions,/*!< in/out: tablespace version
+ timestamps; used to prevent reading in old
+ pages after DISCARD + IMPORT tablespace */
+ ulint* page_nos,/*!< in/out: buffer for at least
+ IBUF_MAX_N_PAGES_MERGED many page numbers;
+ the page numbers are in an ascending order */
+ ulint* n_stored)/*!< out: number of page numbers stored to
+ page_nos in this function */
+{
+ ulint prev_page_no;
+ ulint prev_space_id;
+ ulint first_page_no;
+ ulint first_space_id;
+ ulint rec_page_no;
+ ulint rec_space_id;
+ ulint sum_volumes;
+ ulint volume_for_page;
+ ulint rec_volume;
+ ulint limit;
+ ulint n_pages;
+
+ *n_stored = 0;
+
+ limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4);
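+	/* Both bounds matter here: at most IBUF_MAX_N_PAGES_MERGED pages
+	are collected for one batch, and never more than a quarter of the
+	current buffer pool size (curr_size is in pages), presumably so
+	that one merge batch cannot displace a large share of a small
+	buffer pool. */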
+
+ if (page_rec_is_supremum(rec)) {
+
+ rec = page_rec_get_prev(rec);
+ }
+
+ if (page_rec_is_infimum(rec)) {
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ return(0);
+ }
+
+ first_page_no = ibuf_rec_get_page_no(rec);
+ first_space_id = ibuf_rec_get_space(rec);
+ n_pages = 0;
+ prev_page_no = 0;
+ prev_space_id = 0;
+
+ /* Go backwards from the first rec until we reach the border of the
+	'merge area', the start of the page, or the limit of storable pages
+	is reached */
+
+ while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
+
+ rec_page_no = ibuf_rec_get_page_no(rec);
+ rec_space_id = ibuf_rec_get_space(rec);
+
+ if (rec_space_id != first_space_id
+ || (rec_page_no / IBUF_MERGE_AREA)
+ != (first_page_no / IBUF_MERGE_AREA)) {
+
+ break;
+ }
+
+ if (rec_page_no != prev_page_no
+ || rec_space_id != prev_space_id) {
+ n_pages++;
+ }
+
+ prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
+
+ rec = page_rec_get_prev(rec);
+ }
+
+ rec = page_rec_get_next(rec);
+
+ /* At the loop start there is no prev page; we mark this with a pair
+ of space id, page no (0, 0) for which there can never be entries in
+ the insert buffer */
+
+ prev_page_no = 0;
+ prev_space_id = 0;
+ sum_volumes = 0;
+ volume_for_page = 0;
+
+ while (*n_stored < limit) {
+ if (page_rec_is_supremum(rec)) {
+ /* When no more records available, mark this with
+ another 'impossible' pair of space id, page no */
+ rec_page_no = 1;
+ rec_space_id = 0;
+ } else {
+ rec_page_no = ibuf_rec_get_page_no(rec);
+ rec_space_id = ibuf_rec_get_space(rec);
+ ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO);
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
+#endif
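+		/* A rough worked example, assuming the usual definitions
+		IBUF_MERGE_THRESHOLD == 4, IBUF_PAGE_SIZE_PER_FREE_SPACE == 32
+		and UNIV_PAGE_SIZE == 16384: the volume threshold below is
+		((4 - 1) * 4 * 16384 / 32) / 4 = 1536 bytes, i.e. 3/4 of the
+		2048-byte ceiling that the debug assertions elsewhere in this
+		file place on the buffered volume of a single page.  A page
+		other than the cursor page is picked only if we are
+		contracting the tree or its buffered volume exceeds that
+		threshold. */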
+ if ((rec_space_id != prev_space_id
+ || rec_page_no != prev_page_no)
+ && (prev_space_id != 0 || prev_page_no != 0)) {
+
+ if ((prev_page_no == first_page_no
+ && prev_space_id == first_space_id)
+ || contract
+ || (volume_for_page
+ > ((IBUF_MERGE_THRESHOLD - 1)
+ * 4 * UNIV_PAGE_SIZE
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE)
+ / IBUF_MERGE_THRESHOLD)) {
+
+ space_ids[*n_stored] = prev_space_id;
+ space_versions[*n_stored]
+ = fil_space_get_version(prev_space_id);
+ page_nos[*n_stored] = prev_page_no;
+
+ (*n_stored)++;
+
+ sum_volumes += volume_for_page;
+ }
+
+ if (rec_space_id != first_space_id
+ || rec_page_no / IBUF_MERGE_AREA
+ != first_page_no / IBUF_MERGE_AREA) {
+
+ break;
+ }
+
+ volume_for_page = 0;
+ }
+
+ if (rec_page_no == 1 && rec_space_id == 0) {
+ /* Supremum record */
+
+ break;
+ }
+
+ rec_volume = ibuf_rec_get_volume(rec);
+
+ volume_for_page += rec_volume;
+
+ prev_page_no = rec_page_no;
+ prev_space_id = rec_space_id;
+
+ rec = page_rec_get_next(rec);
+ }
+
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+#if 0
+ fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n",
+ *n_stored, sum_volumes);
+#endif
+ return(sum_volumes);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_contract_ext(
+/*==============*/
+ ulint* n_pages,/*!< out: number of pages to which merged */
+ ibool sync) /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+{
+ btr_pcur_t pcur;
+ ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
+ ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
+ ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED];
+ ulint n_stored;
+ ulint sum_sizes;
+ mtr_t mtr;
+
+ *n_pages = 0;
+ ut_ad(!ibuf_inside());
+
+ mutex_enter(&ibuf_mutex);
+
+ if (ibuf->empty) {
+ibuf_is_empty:
+ mutex_exit(&ibuf_mutex);
+
+ return(0);
+ }
+
+ mtr_start(&mtr);
+
+ ibuf_enter();
+
+ /* Open a cursor to a randomly chosen leaf of the tree, at a random
+ position within the leaf */
+
+ btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) {
+ /* When the ibuf tree is emptied completely, the last record
+ is removed using an optimistic delete and ibuf_size_update
+ is not called, causing ibuf->empty to remain FALSE. If we do
+ not reset it to TRUE here then database shutdown will hang
+ in the loop in ibuf_contract_for_n_pages. */
+
+ ibuf->empty = TRUE;
+
+ ibuf_exit();
+
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ goto ibuf_is_empty;
+ }
+
+ mutex_exit(&ibuf_mutex);
+
+ sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur),
+ space_ids, space_versions,
+ page_nos, &n_stored);
+#if 0 /* defined UNIV_IBUF_DEBUG */
+ fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
+ sync, n_stored, sum_sizes);
+#endif
+ ibuf_exit();
+
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos,
+ n_stored);
+ *n_pages = n_stored;
+
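+	/* Only zero is reserved for "the insert buffer is empty", and
+	callers such as ibuf_contract_for_n_pages() stop looping on zero.
+	Since sum_sizes is merely a lower limit and might be 0 even though
+	pages were submitted for merging, add one to keep the return value
+	nonzero. */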
+ return(sum_sizes + 1);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract(
+/*==========*/
+ ibool sync) /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+{
+ ulint n_pages;
+
+ return(ibuf_contract_ext(&n_pages, sync));
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_for_n_pages(
+/*======================*/
+ ibool sync, /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+ ulint n_pages)/*!< in: try to read at least this many pages to
+ the buffer pool and merge the ibuf contents to
+ them */
+{
+ ulint sum_bytes = 0;
+ ulint sum_pages = 0;
+ ulint n_bytes;
+ ulint n_pag2;
+
+ while (sum_pages < n_pages) {
+ n_bytes = ibuf_contract_ext(&n_pag2, sync);
+
+ if (n_bytes == 0) {
+ return(sum_bytes);
+ }
+
+ sum_bytes += n_bytes;
+ sum_pages += n_pag2;
+ }
+
+ return(sum_bytes);
+}
+
+/*********************************************************************//**
+Contract insert buffer trees after insert if they are too big. */
+UNIV_INLINE
+void
+ibuf_contract_after_insert(
+/*=======================*/
+ ulint entry_size) /*!< in: size of a record which was inserted
+ into an ibuf tree */
+{
+ ibool sync;
+ ulint sum_sizes;
+ ulint size;
+
+ mutex_enter(&ibuf_mutex);
+
+ if (!srv_ibuf_active_contract) {
+		if (ibuf->size
+		    < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+ mutex_exit(&ibuf_mutex);
+
+ return;
+ }
+ }
+
+ sync = FALSE;
+
+ if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_ON_INSERT_SYNC) {
+
+ sync = TRUE;
+ }
+
+ mutex_exit(&ibuf_mutex);
+
+ /* Contract at least entry_size many bytes */
+ sum_sizes = 0;
+ size = 1;
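+	/* size starts at 1 only so that the loop below runs at least once;
+	the loop stops once enough bytes have been merged, or when
+	ibuf_contract() returns 0 because the insert buffer became empty. */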
+
+ while ((size > 0) && (sum_sizes < entry_size)) {
+
+ size = ibuf_contract(sync);
+ sum_sizes += size;
+ }
+}
+
+/*********************************************************************//**
+Gets an upper limit for the combined size of entries buffered in the insert
+buffer for a given page.
+@return upper limit for the volume of buffered inserts for the index
+page, in bytes; UNIV_PAGE_SIZE, if the entries for the index page span
+several pages in the insert buffer */
+static
+ulint
+ibuf_get_volume_buffered(
+/*=====================*/
+ btr_pcur_t* pcur, /*!< in: pcur positioned at a place in an
+ insert buffer tree where we would insert an
+ entry for the index page whose number is
+ page_no, latch mode has to be BTR_MODIFY_PREV
+ or BTR_MODIFY_TREE */
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: page number of an index page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint volume;
+ rec_t* rec;
+ page_t* page;
+ ulint prev_page_no;
+ page_t* prev_page;
+ ulint next_page_no;
+ page_t* next_page;
+
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
+ || (pcur->latch_mode == BTR_MODIFY_TREE));
+
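+	/* The estimate is accumulated in three passes: the records on this
+	page before pcur, then (if the records may continue there) the
+	previous ibuf page, then the records at and after pcur and (again,
+	if needed) the next ibuf page.  Whenever the records of interest
+	might extend past the pages we are allowed to latch, UNIV_PAGE_SIZE
+	is returned as a safe upper bound instead. */
+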
+ /* Count the volume of records earlier in the alphabetical order than
+ pcur */
+
+ volume = 0;
+
+ rec = btr_pcur_get_rec(pcur);
+ page = page_align(rec);
+
+ if (page_rec_is_supremum(rec)) {
+ rec = page_rec_get_prev(rec);
+ }
+
+ for (;;) {
+ if (page_rec_is_infimum(rec)) {
+
+ break;
+ }
+
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
+
+ goto count_later;
+ }
+
+ volume += ibuf_rec_get_volume(rec);
+
+ rec = page_rec_get_prev(rec);
+ }
+
+ /* Look at the previous page */
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+
+ if (prev_page_no == FIL_NULL) {
+
+ goto count_later;
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ prev_page = buf_block_get_frame(block);
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_next(prev_page, mtr)
+ == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ rec = page_get_supremum_rec(prev_page);
+ rec = page_rec_get_prev(rec);
+
+ for (;;) {
+ if (page_rec_is_infimum(rec)) {
+
+			/* We cannot go back yet another page, because we
+			do not hold the x-latch on it, and cannot acquire one
+ because of the latching order: we have to give up */
+
+ return(UNIV_PAGE_SIZE);
+ }
+
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
+
+ goto count_later;
+ }
+
+ volume += ibuf_rec_get_volume(rec);
+
+ rec = page_rec_get_prev(rec);
+ }
+
+count_later:
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_supremum(rec)) {
+ rec = page_rec_get_next(rec);
+ }
+
+ for (;;) {
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
+
+ return(volume);
+ }
+
+ volume += ibuf_rec_get_volume(rec);
+
+ rec = page_rec_get_next(rec);
+ }
+
+ /* Look at the next page */
+
+ next_page_no = btr_page_get_next(page, mtr);
+
+ if (next_page_no == FIL_NULL) {
+
+ return(volume);
+ }
+
+ {
+ buf_block_t* block;
+
+ block = buf_page_get(
+ IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, mtr);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+ next_page = buf_block_get_frame(block);
+ }
+
+#ifdef UNIV_BTR_DEBUG
+ ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+ rec = page_get_infimum_rec(next_page);
+ rec = page_rec_get_next(rec);
+
+ for (;;) {
+ if (page_rec_is_supremum(rec)) {
+
+ /* We give up */
+
+ return(UNIV_PAGE_SIZE);
+ }
+
+ if (page_no != ibuf_rec_get_page_no(rec)
+ || space != ibuf_rec_get_space(rec)) {
+
+ return(volume);
+ }
+
+ volume += ibuf_rec_get_volume(rec);
+
+ rec = page_rec_get_next(rec);
+ }
+}
+
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void)
+/*===============================*/
+{
+ ulint max_space_id;
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ ut_a(!dict_table_is_comp(ibuf->index->table));
+
+ ibuf_enter();
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ btr_pcur_move_to_prev(&pcur, &mtr);
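+	/* The cursor was opened at the right end of the tree and moved back
+	one step, so it is now on the last user record, or before the first
+	record if the tree is empty.  Since the space id is the first
+	ordering field of an ibuf record, that record carries the largest
+	buffered space id. */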
+
+ if (btr_pcur_is_before_first_on_page(&pcur)) {
+ /* The tree is empty */
+
+ max_space_id = 0;
+ } else {
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+
+ ut_a(len == 4);
+
+ max_space_id = mach_read_from_4(field);
+ }
+
+ mtr_commit(&mtr);
+ ibuf_exit();
+
+ /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
+
+ fil_set_max_space_id_if_bigger(max_space_id);
+}
+
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible.
+@return DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */
+static
+ulint
+ibuf_insert_low(
+/*============*/
+ ulint mode, /*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ ulint entry_size,
+ /*!< in: rec_get_converted_size(index, entry) */
+ dict_index_t* index, /*!< in: index where to insert; must not be
+ unique or clustered */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ big_rec_t* dummy_big_rec;
+ btr_pcur_t pcur;
+ btr_cur_t* cursor;
+ dtuple_t* ibuf_entry;
+ mem_heap_t* heap;
+ ulint buffered;
+ rec_t* ins_rec;
+ ibool old_bit_value;
+ page_t* bitmap_page;
+ page_t* root;
+ ulint err;
+ ibool do_merge;
+ ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
+ ib_int64_t space_versions[IBUF_MAX_N_PAGES_MERGED];
+ ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
+ ulint n_stored;
+ ulint bits;
+ mtr_t mtr;
+ mtr_t bitmap_mtr;
+
+ ut_a(!dict_index_is_clust(index));
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(ut_is_2pow(zip_size));
+
+ ut_a(trx_sys_multiple_tablespace_format);
+
+ do_merge = FALSE;
+
+ mutex_enter(&ibuf_mutex);
+
+ if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+ /* Insert buffer is now too big, contract it but do not try
+ to insert */
+
+ mutex_exit(&ibuf_mutex);
+
+#ifdef UNIV_IBUF_DEBUG
+ fputs("Ibuf too big\n", stderr);
+#endif
+ /* Use synchronous contract (== TRUE) */
+ ibuf_contract(TRUE);
+
+ return(DB_STRONG_FAIL);
+ }
+
+ mutex_exit(&ibuf_mutex);
+
+ if (mode == BTR_MODIFY_TREE) {
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+
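+		/* If the free list is too short, grow it.
+		ibuf_add_free_page() acquires the fsp latch, so by the
+		latching order all ibuf latches taken above must be released
+		first; they are then re-acquired and the condition is
+		checked again. */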
+ while (!ibuf_data_enough_free_for_insert()) {
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+ err = ibuf_add_free_page();
+
+ if (err == DB_STRONG_FAIL) {
+
+ return(err);
+ }
+
+ mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+ }
+ } else {
+ ibuf_enter();
+ }
+
+ heap = mem_heap_create(512);
+
+ /* Build the entry which contains the space id and the page number as
+ the first fields and the type information for other fields, and which
+ will be inserted to the insert buffer. */
+
+ ibuf_entry = ibuf_entry_build(index, entry, space, page_no, heap);
+
+ /* Open a cursor to the insert buffer tree to calculate if we can add
+ the new entry to it without exceeding the free space limit for the
+ page. */
+
+ mtr_start(&mtr);
+
+ btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+
+ /* Find out the volume of already buffered inserts for the same index
+ page */
+ buffered = ibuf_get_volume_buffered(&pcur, space, page_no, &mtr);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a((buffered == 0) || ibuf_count_get(space, page_no));
+#endif
+ mtr_start(&bitmap_mtr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
+ zip_size, &bitmap_mtr);
+
+ /* We check if the index page is suitable for buffered entries */
+
+ if (buf_page_peek(space, page_no)
+ || lock_rec_expl_exist_on_page(space, page_no)) {
+ err = DB_STRONG_FAIL;
+
+ mtr_commit(&bitmap_mtr);
+
+ goto function_exit;
+ }
+
+ bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, &bitmap_mtr);
+
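+	/* The bitmap keeps only a coarse, few-bit estimate of the free
+	space on the index page, so this is a conservative check: the
+	already buffered volume, the new entry and one page directory slot
+	must all fit within the estimated free space; otherwise the insert
+	is refused and an asynchronous merge read of the nearby pages is
+	issued instead. */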
+ if (buffered + entry_size + page_dir_calc_reserved_space(1)
+ > ibuf_index_page_calc_free_from_bits(zip_size, bits)) {
+ mtr_commit(&bitmap_mtr);
+
+ /* It may not fit */
+ err = DB_STRONG_FAIL;
+
+ do_merge = TRUE;
+
+ ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur),
+ space_ids, space_versions,
+ page_nos, &n_stored);
+ goto function_exit;
+ }
+
+ /* Set the bitmap bit denoting that the insert buffer contains
+ buffered entries for this index page, if the bit is not set yet */
+
+ old_bit_value = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, &bitmap_mtr);
+
+ if (!old_bit_value) {
+ ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, TRUE,
+ &bitmap_mtr);
+ }
+
+ mtr_commit(&bitmap_mtr);
+
+ cursor = btr_pcur_get_btr_cur(&pcur);
+
+ if (mode == BTR_MODIFY_PREV) {
+ err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+ if (err == DB_SUCCESS) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
+ thr_get_trx(thr)->id, &mtr);
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* We acquire an x-latch to the root page before the insert,
+ because a pessimistic insert releases the tree x-latch,
+ which would cause the x-latching of the root after that to
+ break the latching order. */
+
+ root = ibuf_tree_root_get(&mtr);
+
+ err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG,
+ cursor,
+ ibuf_entry, &ins_rec,
+ &dummy_big_rec, 0, thr, &mtr);
+ if (err == DB_SUCCESS) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
+ thr_get_trx(thr)->id, &mtr);
+ }
+
+ ibuf_size_update(root, &mtr);
+ }
+
+function_exit:
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ if (err == DB_SUCCESS) {
+ fprintf(stderr,
+ "Incrementing ibuf count of space %lu page %lu\n"
+ "from %lu by 1\n", space, page_no,
+ ibuf_count_get(space, page_no));
+
+ ibuf_count_set(space, page_no,
+ ibuf_count_get(space, page_no) + 1);
+ }
+#endif
+ if (mode == BTR_MODIFY_TREE) {
+
+ mutex_exit(&ibuf_mutex);
+ mutex_exit(&ibuf_pessimistic_insert_mutex);
+ }
+
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+ ibuf_exit();
+
+ mem_heap_free(heap);
+
+ if (err == DB_SUCCESS) {
+ mutex_enter(&ibuf_mutex);
+
+ ibuf->empty = FALSE;
+ ibuf->n_inserts++;
+
+ mutex_exit(&ibuf_mutex);
+
+ if (mode == BTR_MODIFY_TREE) {
+ ibuf_contract_after_insert(entry_size);
+ }
+ }
+
+ if (do_merge) {
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+ buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions,
+ page_nos, n_stored);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible. Does not do the insert if the index is clustered
+or unique.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+ ulint entry_size;
+
+ ut_a(trx_sys_multiple_tablespace_format);
+ ut_ad(dtuple_check_typed(entry));
+ ut_ad(ut_is_2pow(zip_size));
+
+ ut_a(!dict_index_is_clust(index));
+
+ switch (UNIV_EXPECT(ibuf_use, IBUF_USE_INSERT)) {
+ case IBUF_USE_NONE:
+ return(FALSE);
+ case IBUF_USE_INSERT:
+ goto do_insert;
+ case IBUF_USE_COUNT:
+ break;
+ }
+
+ ut_error; /* unknown value of ibuf_use */
+
+do_insert:
+ entry_size = rec_get_converted_size(index, entry, 0);
+
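+	/* Entries that would take up more than about half of an empty page
+	are never buffered, presumably because an index page must always be
+	able to hold at least two records; such an entry is inserted
+	directly into the index page instead. */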
+ if (entry_size
+ >= (page_get_free_space_of_empty(dict_table_is_comp(index->table))
+ / 2)) {
+ return(FALSE);
+ }
+
+ err = ibuf_insert_low(BTR_MODIFY_PREV, entry, entry_size,
+ index, space, zip_size, page_no, thr);
+ if (err == DB_FAIL) {
+ err = ibuf_insert_low(BTR_MODIFY_TREE, entry, entry_size,
+ index, space, zip_size, page_no, thr);
+ }
+
+ if (err == DB_SUCCESS) {
+#ifdef UNIV_IBUF_DEBUG
+ /* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n",
+ page_no, index->name); */
+#endif
+ return(TRUE);
+
+ } else {
+ ut_a(err == DB_STRONG_FAIL);
+
+ return(FALSE);
+ }
+}
+
+/********************************************************************//**
+During merge, inserts into an index page a secondary index entry extracted
+from the insert buffer. */
+static
+void
+ibuf_insert_to_index_page(
+/*======================*/
+ dtuple_t* entry, /*!< in: buffered entry to insert */
+ buf_block_t* block, /*!< in/out: index page where the buffered entry
+ should be placed */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t page_cur;
+ ulint low_match;
+ page_t* page = buf_block_get_frame(block);
+ rec_t* rec;
+ page_t* bitmap_page;
+ ulint old_bits;
+
+ ut_ad(ibuf_inside());
+ ut_ad(dtuple_check_typed(entry));
+
+ if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
+ != (ibool)!!page_is_comp(page))) {
+ fputs("InnoDB: Trying to insert a record from"
+ " the insert buffer to an index page\n"
+ "InnoDB: but the 'compact' flag does not match!\n",
+ stderr);
+ goto dump;
+ }
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+
+ if (UNIV_UNLIKELY(rec_get_n_fields(rec, index)
+ != dtuple_get_n_fields(entry))) {
+ fputs("InnoDB: Trying to insert a record from"
+ " the insert buffer to an index page\n"
+ "InnoDB: but the number of fields does not match!\n",
+ stderr);
+dump:
+ buf_page_print(page, 0);
+
+ dtuple_print(stderr, entry);
+
+	fputs("InnoDB: The table where"
+ " this index record belongs\n"
+ "InnoDB: is now probably corrupt."
+ " Please run CHECK TABLE on\n"
+ "InnoDB: your tables.\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com!\n", stderr);
+
+ return;
+ }
+
+ low_match = page_cur_search(block, index, entry,
+ PAGE_CUR_LE, &page_cur);
+
+ if (low_match == dtuple_get_n_fields(entry)) {
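+		/* A record with exactly the same fields already exists on
+		the page.  It should be a delete-marked copy (otherwise the
+		insert could not have been buffered for this non-unique
+		secondary index), so removing the delete mark is enough. */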
+ page_zip_des_t* page_zip;
+
+ rec = page_cur_get_rec(&page_cur);
+ page_zip = buf_block_get_page_zip(block);
+
+ btr_cur_del_unmark_for_ibuf(rec, page_zip, mtr);
+ } else {
+ rec = page_cur_tuple_insert(&page_cur, entry, index, 0, mtr);
+
+ if (UNIV_LIKELY(rec != NULL)) {
+ return;
+ }
+
+ /* If the record did not fit, reorganize */
+
+ btr_page_reorganize(block, index, mtr);
+ page_cur_search(block, index, entry, PAGE_CUR_LE, &page_cur);
+
+ /* This time the record must fit */
+ if (UNIV_UNLIKELY
+ (!page_cur_tuple_insert(&page_cur, entry, index,
+ 0, mtr))) {
+ ulint space;
+ ulint page_no;
+ ulint zip_size;
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: Insert buffer insert"
+ " fails; page free %lu,"
+ " dtuple size %lu\n",
+ (ulong) page_get_max_insert_size(
+ page, 1),
+ (ulong) rec_get_converted_size(
+ index, entry, 0));
+ fputs("InnoDB: Cannot insert index record ",
+ stderr);
+ dtuple_print(stderr, entry);
+ fputs("\nInnoDB: The table where"
+ " this index record belongs\n"
+ "InnoDB: is now probably corrupt."
+ " Please run CHECK TABLE on\n"
+ "InnoDB: that table.\n", stderr);
+
+ space = page_get_space_id(page);
+ zip_size = buf_block_get_zip_size(block);
+ page_no = page_get_page_no(page);
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ space, page_no, zip_size, mtr);
+ old_bits = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, mtr);
+
+ fprintf(stderr,
+ "InnoDB: space %lu, page %lu,"
+ " zip_size %lu, bitmap bits %lu\n",
+ (ulong) space, (ulong) page_no,
+ (ulong) zip_size, (ulong) old_bits);
+
+ fputs("InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ }
+ }
+}
+
+/*********************************************************************//**
+Deletes from ibuf the record on which pcur is positioned. If we have to
+resort to a pessimistic delete, this function commits mtr and closes
+the cursor.
+@return TRUE if mtr was committed and pcur closed in this operation */
+static
+ibool
+ibuf_delete_rec(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint page_no,/*!< in: index page number where the record
+ should belong */
+ btr_pcur_t* pcur, /*!< in: pcur positioned on the record to
+ delete, having latch mode BTR_MODIFY_LEAF */
+ const dtuple_t* search_tuple,
+ /*!< in: search tuple for entries of page_no */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool success;
+ page_t* root;
+ ulint err;
+
+ ut_ad(ibuf_inside());
+ ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+ ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no);
+ ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space);
+
+ success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr);
+
+ if (success) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ fprintf(stderr,
+ "Decrementing ibuf count of space %lu page %lu\n"
+ "from %lu by 1\n", space, page_no,
+ ibuf_count_get(space, page_no));
+ ibuf_count_set(space, page_no,
+ ibuf_count_get(space, page_no) - 1);
+#endif
+ return(FALSE);
+ }
+
+ ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+ ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no);
+ ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space);
+
+ /* We have to resort to a pessimistic delete from ibuf */
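+	/* The optimistic delete failed, typically because removing the
+	record would leave the page too empty and the tree would have to be
+	reorganized.  A pessimistic delete needs BTR_MODIFY_TREE latches, so
+	we store the cursor position, commit the mini-transaction to release
+	the leaf latch, and then restore the cursor with the tree x-latched
+	while holding ibuf_mutex. */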
+ btr_pcur_store_position(pcur, mtr);
+
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+
+ mutex_enter(&ibuf_mutex);
+
+ mtr_start(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr);
+
+ if (!success) {
+ if (fil_space_get_flags(space) == ULINT_UNDEFINED) {
+ /* The tablespace has been dropped. It is possible
+ that another thread has deleted the insert buffer
+ entry. Do not complain. */
+ goto commit_and_exit;
+ }
+
+ fprintf(stderr,
+ "InnoDB: ERROR: Submit the output to"
+ " http://bugs.mysql.com\n"
+ "InnoDB: ibuf cursor restoration fails!\n"
+ "InnoDB: ibuf record inserted to page %lu\n",
+ (ulong) page_no);
+ fflush(stderr);
+
+ rec_print_old(stderr, btr_pcur_get_rec(pcur));
+ rec_print_old(stderr, pcur->old_rec);
+ dtuple_print(stderr, search_tuple);
+
+ rec_print_old(stderr,
+ page_rec_get_next(btr_pcur_get_rec(pcur)));
+ fflush(stderr);
+
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+
+ fputs("InnoDB: Validating insert buffer tree:\n", stderr);
+ if (!btr_validate_index(ibuf->index, NULL)) {
+ ut_error;
+ }
+
+ fprintf(stderr, "InnoDB: ibuf tree ok\n");
+ fflush(stderr);
+
+ goto func_exit;
+ }
+
+ root = ibuf_tree_root_get(mtr);
+
+ btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur),
+ RB_NONE, mtr);
+ ut_a(err == DB_SUCCESS);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1);
+#endif
+ ibuf_size_update(root, mtr);
+
+commit_and_exit:
+ btr_pcur_commit_specify_mtr(pcur, mtr);
+
+func_exit:
+ btr_pcur_close(pcur);
+
+ mutex_exit(&ibuf_mutex);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+When an index page is read from disk into the buffer pool, this function
+inserts to the page the possible index entries buffered in the insert buffer.
+The entries are deleted from the insert buffer. If the page is not read, but
+created in the buffer pool, this function deletes its buffered entries from
+the insert buffer; there can exist entries for such a page if the page
+belonged to an index which subsequently was dropped. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+ buf_block_t* block, /*!< in: if page has been read from
+ disk, pointer to the page x-latched,
+ else NULL */
+ ulint space, /*!< in: space id of the index page */
+ ulint page_no,/*!< in: page number of the index page */
+ ulint zip_size,/*!< in: compressed page size in bytes,
+ or 0 */
+ ibool update_ibuf_bitmap)/*!< in: normally this is set
+ to TRUE, but if we have deleted or are
+ deleting the tablespace, then we
+ naturally do not want to update a
+ non-existent bitmap page */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dtuple_t* search_tuple;
+ ulint n_inserts;
+#ifdef UNIV_IBUF_DEBUG
+ ulint volume;
+#endif
+ page_zip_des_t* page_zip = NULL;
+ ibool tablespace_being_deleted = FALSE;
+ ibool corruption_noticed = FALSE;
+ mtr_t mtr;
+
+ ut_ad(!block || buf_block_get_space(block) == space);
+ ut_ad(!block || buf_block_get_page_no(block) == page_no);
+ ut_ad(!block || buf_block_get_zip_size(block) == zip_size);
+
+ if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE
+ || trx_sys_hdr_page(space, page_no)) {
+ return;
+ }
+
+ /* We cannot refer to zip_size in the following, because
+ zip_size is passed as ULINT_UNDEFINED (it is unknown) when
+ buf_read_ibuf_merge_pages() is merging (discarding) changes
+ for a dropped tablespace. When block != NULL or
+ update_ibuf_bitmap is specified, the zip_size must be known.
+ That is why we will repeat the check below, with zip_size in
+ place of 0. Passing zip_size as 0 assumes that the
+ uncompressed page size always is a power-of-2 multiple of the
+ compressed page size. */
+
+ if (ibuf_fixed_addr_page(space, 0, page_no)
+ || fsp_descr_page(0, page_no)) {
+ return;
+ }
+
+ if (UNIV_LIKELY(update_ibuf_bitmap)) {
+ ut_a(ut_is_2pow(zip_size));
+
+ if (ibuf_fixed_addr_page(space, zip_size, page_no)
+ || fsp_descr_page(zip_size, page_no)) {
+ return;
+ }
+
+ /* If the following returns FALSE, we get the counter
+ incremented, and must decrement it when we leave this
+		function. When the counter is > 0, that prevents the
+		tablespace from being dropped. */
+
+ tablespace_being_deleted = fil_inc_pending_ibuf_merges(space);
+
+ if (UNIV_UNLIKELY(tablespace_being_deleted)) {
+ /* Do not try to read the bitmap page from space;
+ just delete the ibuf records for the page */
+
+ block = NULL;
+ update_ibuf_bitmap = FALSE;
+ } else {
+ page_t* bitmap_page;
+
+ mtr_start(&mtr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ space, page_no, zip_size, &mtr);
+
+ if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no,
+ zip_size,
+ IBUF_BITMAP_BUFFERED,
+ &mtr)) {
+ /* No inserts buffered for this page */
+ mtr_commit(&mtr);
+
+ if (!tablespace_being_deleted) {
+ fil_decr_pending_ibuf_merges(space);
+ }
+
+ return;
+ }
+ mtr_commit(&mtr);
+ }
+ } else if (block
+ && (ibuf_fixed_addr_page(space, zip_size, page_no)
+ || fsp_descr_page(zip_size, page_no))) {
+
+ return;
+ }
+
+ ibuf_enter();
+
+ heap = mem_heap_create(512);
+
+ if (!trx_sys_multiple_tablespace_format) {
+ ut_a(trx_doublewrite_must_reset_space_ids);
+ search_tuple = ibuf_search_tuple_build(space, page_no, heap);
+ } else {
+ search_tuple = ibuf_new_search_tuple_build(space, page_no,
+ heap);
+ }
+
+ if (block) {
+ /* Move the ownership of the x-latch on the page to this OS
+ thread, so that we can acquire a second x-latch on it. This
+ is needed for the insert operations to the index page to pass
+ the debug checks. */
+
+ rw_lock_x_lock_move_ownership(&(block->lock));
+ page_zip = buf_block_get_page_zip(block);
+
+ if (UNIV_UNLIKELY(fil_page_get_type(block->frame)
+ != FIL_PAGE_INDEX)
+ || UNIV_UNLIKELY(!page_is_leaf(block->frame))) {
+
+ page_t* bitmap_page;
+
+ corruption_noticed = TRUE;
+
+ ut_print_timestamp(stderr);
+
+ mtr_start(&mtr);
+
+ fputs(" InnoDB: Dump of the ibuf bitmap page:\n",
+ stderr);
+
+ bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
+ zip_size, &mtr);
+ buf_page_print(bitmap_page, 0);
+
+ mtr_commit(&mtr);
+
+ fputs("\nInnoDB: Dump of the page:\n", stderr);
+
+ buf_page_print(block->frame, 0);
+
+ fprintf(stderr,
+ "InnoDB: Error: corruption in the tablespace."
+ " Bitmap shows insert\n"
+ "InnoDB: buffer records to page n:o %lu"
+ " though the page\n"
+ "InnoDB: type is %lu, which is"
+ " not an index leaf page!\n"
+ "InnoDB: We try to resolve the problem"
+ " by skipping the insert buffer\n"
+ "InnoDB: merge for this page."
+ " Please run CHECK TABLE on your tables\n"
+ "InnoDB: to determine if they are corrupt"
+ " after this.\n\n"
+ "InnoDB: Please submit a detailed bug report"
+ " to http://bugs.mysql.com\n\n",
+ (ulong) page_no,
+ (ulong)
+ fil_page_get_type(block->frame));
+ }
+ }
+
+ n_inserts = 0;
+#ifdef UNIV_IBUF_DEBUG
+ volume = 0;
+#endif
+loop:
+ mtr_start(&mtr);
+
+ if (block) {
+ ibool success;
+
+ success = buf_page_get_known_nowait(
+ RW_X_LATCH, block,
+ BUF_KEEP_OLD, __FILE__, __LINE__, &mtr);
+
+ ut_a(success);
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ }
+
+ /* Position pcur in the insert buffer at the first entry for this
+ index page */
+ btr_pcur_open_on_user_rec(
+ ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+ goto reset_bit;
+ }
+
+ for (;;) {
+ rec_t* rec;
+
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this index page */
+ if (ibuf_rec_get_page_no(rec) != page_no
+ || ibuf_rec_get_space(rec) != space) {
+
+ if (block) {
+ page_header_reset_last_insert(
+ block->frame, page_zip, &mtr);
+ }
+
+ goto reset_bit;
+ }
+
+ if (UNIV_UNLIKELY(corruption_noticed)) {
+ fputs("InnoDB: Discarding record\n ", stderr);
+ rec_print_old(stderr, rec);
+ fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
+ } else if (block) {
+ /* Now we have at pcur a record which should be
+ inserted to the index page; NOTE that the call below
+ copies pointers to fields in rec, and we must
+ keep the latch to the rec page until the
+ insertion is finished! */
+ dtuple_t* entry;
+ trx_id_t max_trx_id;
+ dict_index_t* dummy_index;
+
+ max_trx_id = page_get_max_trx_id(page_align(rec));
+ page_update_max_trx_id(block, page_zip, max_trx_id,
+ &mtr);
+
+ entry = ibuf_build_entry_from_ibuf_rec(
+ rec, heap, &dummy_index);
+#ifdef UNIV_IBUF_DEBUG
+ volume += rec_get_converted_size(dummy_index, entry, 0)
+ + page_dir_calc_reserved_space(1);
+ ut_a(volume <= 4 * UNIV_PAGE_SIZE
+ / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+#endif
+ ibuf_insert_to_index_page(entry, block,
+ dummy_index, &mtr);
+ ibuf_dummy_index_free(dummy_index);
+ }
+
+ n_inserts++;
+
+ /* Delete the record from ibuf */
+ if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+ &mtr)) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ goto loop;
+ } else if (btr_pcur_is_after_last_on_page(&pcur)) {
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ goto loop;
+ }
+ }
+
+reset_bit:
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ if (ibuf_count_get(space, page_no) > 0) {
+ /* btr_print_tree(ibuf_data->index->tree, 100);
+ ibuf_print(); */
+ }
+#endif
+ if (UNIV_LIKELY(update_ibuf_bitmap)) {
+ page_t* bitmap_page;
+
+ bitmap_page = ibuf_bitmap_get_map_page(
+ space, page_no, zip_size, &mtr);
+
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+
+ if (block) {
+ ulint old_bits = ibuf_bitmap_page_get_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, &mtr);
+
+ ulint new_bits = ibuf_index_page_calc_free(
+ zip_size, block);
+
+ if (old_bits != new_bits) {
+ ibuf_bitmap_page_set_bits(
+ bitmap_page, page_no, zip_size,
+ IBUF_BITMAP_FREE, new_bits, &mtr);
+ }
+ }
+ }
+
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+ mem_heap_free(heap);
+
+ /* Protect our statistics keeping from race conditions */
+ mutex_enter(&ibuf_mutex);
+
+ ibuf->n_merges++;
+ ibuf->n_merged_recs += n_inserts;
+
+ mutex_exit(&ibuf_mutex);
+
+ if (update_ibuf_bitmap && !tablespace_being_deleted) {
+
+ fil_decr_pending_ibuf_merges(space);
+ }
+
+ ibuf_exit();
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ut_a(ibuf_count_get(space, page_no) == 0);
+#endif
+}
+
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space) /*!< in: space id */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ dtuple_t* search_tuple;
+ rec_t* ibuf_rec;
+ ulint page_no;
+ ibool closed;
+ ulint n_inserts;
+ mtr_t mtr;
+
+ heap = mem_heap_create(512);
+
+ /* Use page number 0 to build the search tuple so that we get the
+ cursor positioned at the first entry for this space id */
+
+ search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
+
+ n_inserts = 0;
+loop:
+ ibuf_enter();
+
+ mtr_start(&mtr);
+
+ /* Position pcur in the insert buffer at the first entry for the
+ space */
+ btr_pcur_open_on_user_rec(
+ ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+ goto leave_loop;
+ }
+
+ for (;;) {
+ ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+ ibuf_rec = btr_pcur_get_rec(&pcur);
+
+ /* Check if the entry is for this space */
+ if (ibuf_rec_get_space(ibuf_rec) != space) {
+
+ goto leave_loop;
+ }
+
+ page_no = ibuf_rec_get_page_no(ibuf_rec);
+
+ n_inserts++;
+
+ /* Delete the record from ibuf */
+ closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+ &mtr);
+ if (closed) {
+ /* Deletion was pessimistic and mtr was committed:
+ we start from the beginning again */
+
+ ibuf_exit();
+
+ goto loop;
+ }
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ ibuf_exit();
+
+ goto loop;
+ }
+ }
+
+leave_loop:
+ mtr_commit(&mtr);
+ btr_pcur_close(&pcur);
+
+ /* Protect our statistics keeping from race conditions */
+ mutex_enter(&ibuf_mutex);
+
+ ibuf->n_merges++;
+ ibuf->n_merged_recs += n_inserts;
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ mem_heap_free(heap);
+}
+
+/******************************************************************//**
+Checks whether the insert buffer is empty.
+@return TRUE if empty */
+UNIV_INTERN
+ibool
+ibuf_is_empty(void)
+/*===============*/
+{
+ ibool is_empty;
+ const page_t* root;
+ mtr_t mtr;
+
+ ibuf_enter();
+
+ mutex_enter(&ibuf_mutex);
+
+ mtr_start(&mtr);
+
+ root = ibuf_tree_root_get(&mtr);
+
+ if (page_get_n_recs(root) == 0) {
+
+ is_empty = TRUE;
+
+ if (ibuf->empty == FALSE) {
+ fprintf(stderr,
+ "InnoDB: Warning: insert buffer tree is empty"
+ " but the data struct does not\n"
+ "InnoDB: know it. This condition is legal"
+ " if the master thread has not yet\n"
+ "InnoDB: run to completion.\n");
+ }
+ } else {
+ ut_a(ibuf->empty == FALSE);
+
+ is_empty = FALSE;
+ }
+
+ mtr_commit(&mtr);
+
+ mutex_exit(&ibuf_mutex);
+
+ ibuf_exit();
+
+ return(is_empty);
+}
+
+/******************************************************************//**
+Prints info of ibuf. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ ulint i;
+ ulint j;
+#endif
+
+ mutex_enter(&ibuf_mutex);
+
+ fprintf(file,
+ "Ibuf: size %lu, free list len %lu, seg size %lu,\n"
+ "%lu inserts, %lu merged recs, %lu merges\n",
+ (ulong) ibuf->size,
+ (ulong) ibuf->free_list_len,
+ (ulong) ibuf->seg_size,
+ (ulong) ibuf->n_inserts,
+ (ulong) ibuf->n_merged_recs,
+ (ulong) ibuf->n_merges);
+#ifdef UNIV_IBUF_COUNT_DEBUG
+ for (i = 0; i < IBUF_COUNT_N_SPACES; i++) {
+ for (j = 0; j < IBUF_COUNT_N_PAGES; j++) {
+ ulint count = ibuf_count_get(i, j);
+
+ if (count > 0) {
+ fprintf(stderr,
+ "Ibuf count for space/page %lu/%lu"
+ " is %lu\n",
+ (ulong) i, (ulong) j, (ulong) count);
+ }
+ }
+ }
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+ mutex_exit(&ibuf_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
new file mode 100644
index 00000000000..5e6a76c7d21
--- /dev/null
+++ b/storage/xtradb/include/btr0btr.h
@@ -0,0 +1,517 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0btr_h
+#define btr0btr_h
+
+#include "univ.i"
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "mtr0mtr.h"
+#include "btr0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/** Maximum record size which can be stored on a page, without using the
+special big record storage structure */
+#define BTR_PAGE_MAX_REC_SIZE (UNIV_PAGE_SIZE / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g. fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS 100
+
+/** Latching modes for btr_cur_search_to_nth_level(). */
+enum btr_latch_mode {
+ /** Search a record on a leaf page and S-latch it. */
+ BTR_SEARCH_LEAF = RW_S_LATCH,
+ /** (Prepare to) modify a record on a leaf page and X-latch it. */
+ BTR_MODIFY_LEAF = RW_X_LATCH,
+ /** Obtain no latches. */
+ BTR_NO_LATCHES = RW_NO_LATCH,
+ /** Start modifying the entire B-tree. */
+ BTR_MODIFY_TREE = 33,
+ /** Continue modifying the entire B-tree. */
+ BTR_CONT_MODIFY_TREE = 34,
+ /** Search the previous record. */
+ BTR_SEARCH_PREV = 35,
+ /** Modify the previous record. */
+ BTR_MODIFY_PREV = 36
+};
+
+/** If this is ORed to btr_latch_mode, it means that the search tuple
+will be inserted to the index, at the searched position */
+#define BTR_INSERT 512
+
+/** This flag ORed to btr_latch_mode says that we do the search in query
+optimization */
+#define BTR_ESTIMATE 1024
+
+/** This flag ORed to btr_latch_mode says that we can ignore possible
+UNIQUE definition on secondary indexes when we decide if we can use
+the insert buffer to speed up inserts */
+#define BTR_IGNORE_SEC_UNIQUE 2048
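+
+/* For example, a caller buffering a secondary index insert would typically
+OR these flags into the latch mode, as in
+(BTR_MODIFY_LEAF | BTR_INSERT | BTR_IGNORE_SEC_UNIQUE); the search code
+masks the flag bits off again to recover the plain latch mode.  (This is an
+illustrative combination; the actual callers live in the row insert code.) */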
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return root page, x-latched */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+ dict_index_t* index, /*!< in: index tree */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page); /*!< in: index page */
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ const page_t* page); /*!< in: index page */
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Gets the next index page number.
+@return next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Gets the previous index page number.
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/*************************************************************//**
+Gets pointer to the previous user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr); /*!< in: mtr holding a latch on the page, and if
+ needed, also to the previous page */
+/*************************************************************//**
+Gets pointer to the next user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+ rec_t* rec, /*!< in: record on leaf level */
+ mtr_t* mtr); /*!< in: mtr holding a latch on the page, and if
+ needed, also to the next page */
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/************************************************************//**
+Creates the root node for a new index tree.
+@return page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+ ulint type, /*!< in: type of the index */
+ ulint space, /*!< in: space where created */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ dulint index_id,/*!< in: index id */
+ dict_index_t* index, /*!< in: index */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no); /*!< in: root page number */
+/************************************************************//**
+Frees the B-tree root page. The rest of the tree MUST already have been
+freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+ ulint space, /*!< in: space where created */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint root_page_no, /*!< in: root page number */
+ mtr_t* mtr); /*!< in: a mini-transaction which has already
+ been started */
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
+ on the root page; when the function returns,
+ the cursor is positioned on the predecessor
+ of the inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Reorganizes an index page.
+IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
+page of a non-clustered index, the caller must update the insert
+buffer free bits in the same mini-transaction in such a way that the
+modification will be redo-logged.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+btr_page_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in: page to be reorganized */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to left.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec);/*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to right.
+@return TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert */
+ rec_t** split_rec);/*!< out: if split recommended,
+ the first record on upper half page,
+ or NULL if tuple should be first */
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
+ function returns, the cursor is positioned
+ on the predecessor of the inserted record */
+ const dtuple_t* tuple, /*!< in: tuple to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mtr */
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: level, must be > 0 */
+ dtuple_t* tuple, /*!< in: the record to be inserted */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+# define btr_insert_on_non_leaf_level(i,l,t,m) \
+ btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m)
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Sets a record as the predefined minimum record. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+ rec_t* rec, /*!< in/out: record */
+ mtr_t* mtr); /*!< in: mtr */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: page whose node pointer is deleted */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to merge or lift;
+ the page must not be empty: in record delete
+ use btr_discard_page if the page would become
+ empty */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
+ the root page */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ dict_index_t* index, /*!< in: record descriptor */
+ buf_block_t* block, /*!< in: page to be reorganized, or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return number of pages */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+ dict_index_t* index, /*!< in: index */
+ ulint flag); /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+ dict_index_t* index, /*!< in: index tree */
+ ulint hint_page_no, /*!< in: hint of a good page */
+ byte file_direction, /*!< in: direction where a possible
+ page split is made */
+ ulint level, /*!< in: level where the page is placed
+ in the tree */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Frees a file page used in an index tree. Can also be used for BLOB
+external storage pages, because page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+ dict_index_t* index, /*!< in: index tree */
+ buf_block_t* block, /*!< in: block to be freed, x-latched */
+ ulint level, /*!< in: page level */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+ dict_index_t* index); /*!< in: index tree */
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+ dict_index_t* index, /*!< in: index */
+ ulint width); /*!< in: print this many entries from start
+ and end */
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+ const rec_t* rec, /*!< in: index record */
+ const dict_index_t* index, /*!< in: index */
+ ibool dump_on_error); /*!< in: TRUE if the function
+ should print hex dump of record
+ and page on error */
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_validate_index(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx); /*!< in: transaction or NULL */
+
+#define BTR_N_LEAF_PAGES 1
+#define BTR_TOTAL_SIZE 2
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#endif
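
A hedged illustration of the size counters declared in this header (not part of
the patch): btr_get_size() can be called once per flag and the two counts
combined. The helper name and the omission of the usual index locking rules are
assumptions of the sketch.

	/* Hedged sketch, not part of the patch: query both size counters
	declared above and derive the number of non-leaf pages.  Real
	callers also follow the index locking rules, omitted here. */
	static ulint
	example_index_non_leaf_pages(dict_index_t* index)
	{
		ulint	n_leaf	= btr_get_size(index, BTR_N_LEAF_PAGES);
		ulint	n_total	= btr_get_size(index, BTR_TOTAL_SIZE);

		/* The difference gives the non-leaf page count. */
		return(n_total - n_leaf);
	}
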
diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic
new file mode 100644
index 00000000000..c9c38f3c3b3
--- /dev/null
+++ b/storage/xtradb/include/btr0btr.ic
@@ -0,0 +1,316 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+#include "srv0srv.h"
+#define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level
+ (not really a hard limit).
+ Used in debug assertions
+ in btr_page_set_level and
+ btr_page_get_level_low */
+
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+
+ block = buf_page_get(space, zip_size, page_no, mode, mtr);
+
+ ut_a(srv_pass_corrupt_table || block);
+
+ if (block && mode != RW_NO_LATCH) {
+
+ buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+ }
+
+ return(block);
+}
+
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ ulint mode, /*!< in: latch mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ return(buf_block_get_frame(btr_block_get(space, zip_size, page_no,
+ mode, mtr)));
+}
+
+/**************************************************************//**
+Sets the index id field of a page. */
+UNIV_INLINE
+void
+btr_page_set_index_id(
+/*==================*/
+ page_t* page, /*!< in: page to be created */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ dulint id, /*!< in: index id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), id);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_INDEX_ID),
+ 8, mtr);
+ } else {
+ mlog_write_dulint(page + (PAGE_HEADER + PAGE_INDEX_ID),
+ id, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Gets the index id field of a page.
+@return index id */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_INDEX_ID));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint level;
+
+ ut_ad(page);
+
+ level = mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL);
+
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ return(level);
+}
+
+/********************************************************//**
+Gets the node level field in an index page.
+@return level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(btr_page_get_level_low(page));
+}
+
+/********************************************************//**
+Sets the node level field in an index page. */
+UNIV_INLINE
+void
+btr_page_set_level(
+/*===============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint level, /*!< in: level, leaf level == 0 */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_2(page + (PAGE_HEADER + PAGE_LEVEL), level);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_LEVEL),
+ 2, mtr);
+ } else {
+ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LEVEL), level,
+ MLOG_2BYTES, mtr);
+ }
+}
+
+/********************************************************//**
+Gets the next index page number.
+@return next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
+
+ return(mach_read_from_4(page + FIL_PAGE_NEXT));
+}
+
+/********************************************************//**
+Sets the next index page field. */
+UNIV_INLINE
+void
+btr_page_set_next(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint next, /*!< in: next page number */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_4(page + FIL_PAGE_NEXT, next);
+ page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr);
+ }
+}
+
+/********************************************************//**
+Gets the previous index page number.
+@return prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+ const page_t* page, /*!< in: index page */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************//**
+Sets the previous index page field. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
+ part will be updated, or NULL */
+ ulint prev, /*!< in: previous page number */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(page && mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_4(page + FIL_PAGE_PREV, prev);
+ page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
+ } else {
+ mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
+ }
+}
+
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+ const rec_t* rec, /*!< in: node pointer record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* field;
+ ulint len;
+ ulint page_no;
+
+ ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+ /* The child address is in the last field */
+ field = rec_get_nth_field(rec, offsets,
+ rec_offs_n_fields(offsets) - 1, &len);
+
+ ut_ad(len == 4);
+
+ page_no = mach_read_from_4(field);
+
+ if (UNIV_UNLIKELY(page_no == 0)) {
+ fprintf(stderr,
+ "InnoDB: a nonsensical page number 0"
+ " in a node ptr record at offset %lu\n",
+ (ulong) page_offset(rec));
+ buf_page_print(page_align(rec), 0);
+ }
+
+ return(page_no);
+}
+
+/**************************************************************//**
+Releases the latches on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+ ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY));
+
+ mtr_memo_release(mtr, block,
+ latch_mode == BTR_SEARCH_LEAF
+ ? MTR_MEMO_PAGE_S_FIX
+ : MTR_MEMO_PAGE_X_FIX);
+}
+#endif /* !UNIV_HOTBACKUP */
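
A hedged sketch (not part of the patch) of how the sibling accessors defined in
this file combine: walking right along one tree level by following the
next-page links. The helper name, the choice of RW_S_LATCH, and the omission of
the index-level latching rules are assumptions of the sketch.

	/* Hedged sketch, not part of the patch: follow the next-page links
	of one tree level with the accessors defined above.  Latching of
	the index itself is assumed to be handled by the caller. */
	static void
	example_walk_level(ulint space, ulint zip_size, ulint page_no, mtr_t* mtr)
	{
		while (page_no != FIL_NULL) {
			buf_block_t*	block;
			page_t*		page;

			block = btr_block_get(space, zip_size, page_no,
					      RW_S_LATCH, mtr);
			if (!block) {
				/* possible with srv_pass_corrupt_table */
				break;
			}

			page = buf_block_get_frame(block);

			/* ... inspect the page here ... */

			page_no = btr_page_get_next(page, mtr);
		}
	}
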
diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h
new file mode 100644
index 00000000000..e151fdcb563
--- /dev/null
+++ b/storage/xtradb/include/btr0cur.h
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+
+/* Mode flags for btr_cur operations; these can be ORed */
+#define BTR_NO_UNDO_LOG_FLAG 1 /* do no undo logging */
+#define BTR_NO_LOCKING_FLAG 2 /* do no record lock checking */
+#define BTR_KEEP_SYS_FLAG 4 /* sys fields will be found from the
+ update vector or inserted entry */
+
+#ifndef UNIV_HOTBACKUP
+#include "que0types.h"
+#include "row0types.h"
+#include "ha0ha.h"
+
+#define BTR_CUR_ADAPT
+#define BTR_CUR_HASH_ADAPT
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@return index */
+UNIV_INLINE
+dict_index_t*
+btr_cur_get_index(
+/*==============*/
+ btr_cur_t* cursor);/*!< in: B-tree cursor */
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor);/*!< in: cursor */
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint level, /*!< in: the tree level of search */
+ const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
+ tuple must be set so that it cannot get
+ compared to the node ptr page number field! */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be PAGE_CUR_LE,
+ not PAGE_CUR_GE, as the latter may end up on
+ the previous page of the record! Inserts
+ should always be made using PAGE_CUR_LE to
+ search the position! */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+ BTR_INSERT and BTR_ESTIMATE;
+ cursor->left_block is used to store a pointer
+ to the left neighbor page, in the cases
+ BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+ NOTE that if has_search_latch
+				is != 0, we may not have a latch set
+				on the cursor page; we assume
+ the caller uses his search latch
+ to protect the record! */
+ btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
+ s- or x-latched, but see also above! */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_cur_t* cursor, /*!< in: cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_index_side(f,i,l,c,m) \
+ btr_cur_open_at_index_side_func(f,i,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< in/out: B-tree cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m) \
+ btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_insert(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameters index and thr should be
+ specified */
+ btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
+ cursor stays valid */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr); /*!< in: mtr; if this function returns
+ DB_SUCCESS on a leaf page of a secondary
+ index in a compressed tablespace, the
+ mtr must be committed before latching
+ any further pages */
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_insert(
+/*=======================*/
+ ulint flags, /*!< in: undo logging and locking flags: if not
+ zero, the parameter thr should be
+ specified; if no undo logging is specified,
+ then the caller must have reserved enough
+ free extents in the file space so that the
+ insertion will certainly succeed */
+ btr_cur_t* cursor, /*!< in: cursor after which to insert;
+ cursor stays valid */
+ dtuple_t* entry, /*!< in/out: entry to insert */
+ rec_t** rec, /*!< out: pointer to inserted record if
+ succeed */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or
+ NULL */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr, /*!< in: query thread or NULL */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_update_in_place(
+/*====================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ const upd_t* update, /*!< in: update vector */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr; must be committed before
+ latching any further pages */
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
+DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
+there is not enough space left on the compressed page */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_update(
+/*======================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update;
+ cursor stays valid and positioned on the
+ same record */
+ const upd_t* update, /*!< in: update vector; this must also
+ contain trx id and roll ptr fields */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr; must be committed before
+ latching any further pages */
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+ ulint flags, /*!< in: undo logging, locking, and rollback
+ flags */
+ btr_cur_t* cursor, /*!< in: cursor on the record to update */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
+ be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is also allowed
+				to contain trx id and roll ptr fields, but
+				the values in the update vector have no effect */
+ ulint cmpl_info,/*!< in: compiler info on secondary index
+ updates */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr; must be committed before
+ latching any further pages */
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+ ulint flags, /*!< in: undo logging and locking flags */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Sets a secondary index record delete mark to TRUE or FALSE.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+ ulint flags, /*!< in: locking flag */
+ btr_cur_t* cursor, /*!< in: cursor */
+ ibool val, /*!< in: value to set */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Clear a secondary index record's delete mark. This function is only
+used by the insert buffer insert merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_del_unmark_for_ibuf(
+/*========================*/
+ rec_t* rec, /*!< in/out: record to delete unmark */
+ page_zip_des_t* page_zip, /*!< in/out: compressed page
+ corresponding to rec, or NULL
+ when the tablespace is
+ uncompressed */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+ btr_cur_t* cursor, /*!< in: cursor on the page to compress;
+ cursor does not stay valid if compression
+ occurs */
+ mtr_t* mtr); /*!< in: mtr */
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ cursor stays valid: if deletion succeeds,
+ on function exit it points to the successor
+ of the deleted record */
+ mtr_t* mtr); /*!< in: mtr; if this function returns
+ TRUE on a leaf page of a secondary
+ index, the mtr must be committed
+ before latching any further pages */
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+ the latter may occur because we may have
+ to update node pointers on upper levels,
+ and in the case of variable length keys
+ these may actually grow in size */
+ ibool has_reserved_extents, /*!< in: TRUE if the
+ caller has already reserved enough free
+ extents so that he knows that the operation
+ will succeed */
+ btr_cur_t* cursor, /*!< in: cursor on the record to delete;
+ if compression does not occur, the cursor
+ stays valid: it points to successor of
+ deleted record on function exit */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ dict_index_t* index); /*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in/out: page or NULL */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page, or NULL */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple1, /*!< in: range start, may also be empty tuple */
+ ulint mode1, /*!< in: search mode for range start */
+ const dtuple_t* tuple2, /*!< in: range end, may also be empty tuple */
+ ulint mode2); /*!< in: search mode for range end */
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+ dict_index_t* index); /*!< in: index */
+/*******************************************************************//**
+Marks not updated extern fields as not-owned by this record. The ownership
+is transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of an externally stored field is allowed
+to free the field.
+@return TRUE if BLOB ownership was transferred */
+UNIV_INTERN
+ibool
+btr_cur_mark_extern_inherited_fields(
+/*=================================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
+ part will be updated, or NULL */
+ rec_t* rec, /*!< in/out: record in a clustered index */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ mtr_t* mtr); /*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+The complement of the previous function: in an update, an entry may inherit
+some externally stored fields from a record. We must mark them as inherited
+in entry, so that they are not freed in a rollback. */
+UNIV_INTERN
+void
+btr_cur_mark_dtuple_inherited_extern(
+/*=================================*/
+ dtuple_t* entry, /*!< in/out: updated entry to be
+ inserted to clustered index */
+ const upd_t* update); /*!< in: update vector */
+/*******************************************************************//**
+Marks all extern fields in a dtuple as owned by the record. */
+UNIV_INTERN
+void
+btr_cur_unmark_dtuple_extern_fields(
+/*================================*/
+ dtuple_t* entry); /*!< in/out: clustered index entry */
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec. The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+ dict_index_t* index, /*!< in: index of rec; the index tree
+ MUST be X-latched */
+ buf_block_t* rec_block, /*!< in/out: block containing rec */
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index);
+ the "external storage" flags in offsets
+ will not correspond to rec when
+ this function returns */
+ big_rec_t* big_rec_vec, /*!< in: vector containing fields
+ to be stored externally */
+ mtr_t* local_mtr); /*!< in: mtr containing the latch to
+ rec and to the tree */
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+ dict_index_t* index, /*!< in: index of the data, the index
+ tree MUST be X-latched; if the tree
+ height is 1, then also the root page
+ must be X-latched! (this is relevant
+ in the case this function is called
+ from purge where 'data' is located on
+ an undo log page, not an index
+ page) */
+ byte* field_ref, /*!< in/out: field reference */
+ const rec_t* rec, /*!< in: record containing field_ref, for
+ page_zip_write_blob_ptr(), or NULL */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
+ or NULL */
+ page_zip_des_t* page_zip, /*!< in: compressed page corresponding
+ to rec, or NULL if rec == NULL */
+ ulint i, /*!< in: field number of field_ref;
+ ignored if rec == NULL */
+ enum trx_rb_ctx rb_ctx, /*!< in: rollback context */
+ mtr_t* local_mtr); /*!< in: mtr containing the latch to
+					data and an X-latch to the index
+ tree */
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record. The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+ byte* buf, /*!< out: the field, or a prefix of it */
+ ulint len, /*!< in: length of buf, in bytes */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part; must be protected by
+ a lock or a page latch */
+ ulint local_len);/*!< in: length of data, in bytes */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+ const rec_t* rec, /*!< in: record in a clustered index;
+ must be protected by a lock or a page latch */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint zip_size,/*!< in: nonzero=compressed BLOB page size,
+ zero for uncompressed BLOBs */
+ ulint no, /*!< in: field number */
+ ulint* len, /*!< out: length of the field */
+ mem_heap_t* heap); /*!< in: mem heap */
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector. We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const upd_t* update, /*!< in: update vector */
+ mem_heap_t* heap) /*!< in: memory heap */
+ __attribute__((nonnull));
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT (UNIV_PAGE_SIZE / 2)
+
+/** A slot in the path array. We store here info on a search path down the
+tree. Each slot contains data on a single level of the tree. */
+
+typedef struct btr_path_struct btr_path_t;
+struct btr_path_struct{
+ ulint nth_rec; /*!< index of the record
+ where the page cursor stopped on
+ this level (index in alphabetical
+ order); value ULINT_UNDEFINED
+ denotes array end */
+ ulint n_recs; /*!< number of records on the page */
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS 250 /*!< size of path array (in slots) */
+
+/** Values for the flag documenting the used search method */
+enum btr_cur_method {
+ BTR_CUR_HASH = 1, /*!< successful shortcut using
+ the hash index */
+ BTR_CUR_HASH_FAIL, /*!< failure using hash, success using
+ binary search: the misleading hash
+ reference is stored in the field
+ hash_node, and might be necessary to
+ update */
+ BTR_CUR_BINARY, /*!< success using the binary search */
+ BTR_CUR_INSERT_TO_IBUF /*!< performed the intended insert to
+ the insert buffer */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_struct {
+ dict_index_t* index; /*!< index where positioned */
+ page_cur_t page_cur; /*!< page cursor */
+ buf_block_t* left_block; /*!< this field is used to store
+ a pointer to the left neighbor
+ page, in the cases
+ BTR_SEARCH_PREV and
+ BTR_MODIFY_PREV */
+ /*------------------------------*/
+ que_thr_t* thr; /*!< this field is only used
+ when btr_cur_search_to_nth_level
+ is called for an index entry
+ insertion: the calling query
+ thread is passed here to be
+ used in the insert buffer */
+ /*------------------------------*/
+ /** The following fields are used in
+ btr_cur_search_to_nth_level to pass information: */
+ /* @{ */
+ enum btr_cur_method flag; /*!< Search method used */
+ ulint tree_height; /*!< Tree height if the search is done
+ for a pessimistic insert or update
+ operation */
+ ulint up_match; /*!< If the search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record to the right of
+ the cursor record after
+ btr_cur_search_to_nth_level;
+ for the mode PAGE_CUR_GE, the matched
+ fields to the first user record AT THE
+ CURSOR or to the right of it;
+ NOTE that the up_match and low_match
+ values may exceed the correct values
+ for comparison to the adjacent user
+ record if that record is on a
+ different leaf page! (See the note in
+ row_ins_duplicate_key.) */
+ ulint up_bytes; /*!< number of matched bytes to the
+					right at the time the cursor was positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint low_match; /*!< if search mode was PAGE_CUR_LE,
+ the number of matched fields to the
+ first user record AT THE CURSOR or
+ to the left of it after
+ btr_cur_search_to_nth_level;
+ NOT defined for PAGE_CUR_GE or any
+ other search modes; see also the NOTE
+ in up_match! */
+ ulint low_bytes; /*!< number of matched bytes to the
+					right at the time the cursor was positioned;
+ only used internally in searches: not
+ defined after the search */
+ ulint n_fields; /*!< prefix length used in a hash
+ search if hash_node != NULL */
+ ulint n_bytes; /*!< hash prefix bytes if hash_node !=
+ NULL */
+ ulint fold; /*!< fold value used in the search if
+ flag is BTR_CUR_HASH */
+ /*------------------------------*/
+ /* @} */
+ btr_path_t* path_arr; /*!< in estimating the number of
+ rows in range, we store in this array
+ information of the path through
+ the tree */
+};
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES 100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later. Sleep this many
+microseconds between retries. */
+#define BTR_CUR_RETRY_SLEEP_TIME 50000
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID 0 /*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO 4 /*!< page no where stored */
+#define BTR_EXTERN_OFFSET 8 /*!< offset of BLOB header
+ on that page */
+#define BTR_EXTERN_LEN 12 /*!< 8 bytes containing the
+ length of the externally
+ stored part of the BLOB.
+ The 2 highest bits are
+ reserved to the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE 20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG 128
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row. In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG 64
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_non_sea;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+extern ulint btr_cur_n_sea;
+/** Old value of btr_cur_n_non_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_non_sea_old;
+/** Old value of btr_cur_n_sea. Copied by
+srv_refresh_innodb_monitor_stats(). Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint btr_cur_n_sea_old;
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#endif
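
A hedged sketch (not part of the patch) of how the BTR_EXTERN_* layout declared
above can be read back: the space id, page number and offset are 4-byte fields,
and the owner/inherited flags live in the first byte of the 8-byte length. The
helper name and the diagnostic output are assumptions of the sketch.

	/* Hedged sketch, not part of the patch: decode the external field
	reference laid out by the BTR_EXTERN_* offsets above.  field_ref is
	assumed to point at the reference stored at the end of the locally
	stored prefix of the column. */
	static void
	example_print_extern_ref(const byte* field_ref)
	{
		ulint	space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
		ulint	page_no  = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
		ulint	offset   = mach_read_from_4(field_ref + BTR_EXTERN_OFFSET);
		ulint	flags    = mach_read_from_1(field_ref + BTR_EXTERN_LEN);

		fprintf(stderr,
			"BLOB ref: space %lu page %lu offset %lu"
			" owned %d inherited %d\n",
			(ulong) space_id, (ulong) page_no, (ulong) offset,
			!(flags & BTR_EXTERN_OWNER_FLAG),
			!!(flags & BTR_EXTERN_INHERITED_FLAG));
	}
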
diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic
new file mode 100644
index 00000000000..280583f6ccf
--- /dev/null
+++ b/storage/xtradb/include/btr0cur.ic
@@ -0,0 +1,200 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+ const btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(&((btr_cur_t*) cursor)->page_cur);
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_cur_get_rec(&(cursor->page_cur)));
+}
+
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(buf_block_get_page_zip(btr_cur_get_block(cursor)));
+}
+
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ page_cur_invalidate(&(cursor->page_cur));
+}
+
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+ btr_cur_t* cursor) /*!< in: tree cursor */
+{
+ return(page_align(page_cur_get_rec(&(cursor->page_cur))));
+}
+
+/*********************************************************//**
+Returns the index of a cursor.
+@return index */
+UNIV_INLINE
+dict_index_t*
+btr_cur_get_index(
+/*==============*/
+ btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+ return(cursor->index);
+}
+
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record in tree */
+ buf_block_t* block, /*!< in: buffer block of rec */
+ btr_cur_t* cursor) /*!< out: cursor */
+{
+ ut_ad(page_align(rec) == block->frame);
+
+ page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
+
+ cursor->index = index;
+}
+
+/*********************************************************************//**
+Checks if compressing an index page where a btr cursor is placed makes
+sense.
+@return TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))) {
+
+ /* The page fillfactor has dropped below a predefined
+ minimum value OR the level in the B-tree contains just
+ one page: we recommend compression if this is not the
+ root page. */
+
+ return(dict_index_get_page(cursor->index)
+ != page_get_page_no(page));
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Checks if the record on which the cursor is placed can be deleted without
+making tree compression necessary (or recommended).
+@return TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+ btr_cur_t* cursor, /*!< in: btr cursor */
+ ulint rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* page;
+
+ ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+ MTR_MEMO_PAGE_X_FIX));
+
+ page = btr_cur_get_page(cursor);
+
+ if ((page_get_data_size(page) - rec_size < BTR_CUR_PAGE_COMPRESS_LIMIT)
+ || ((btr_page_get_next(page, mtr) == FIL_NULL)
+ && (btr_page_get_prev(page, mtr) == FIL_NULL))
+ || (page_get_n_recs(page) < 2)) {
+
+ /* The page fillfactor will drop below a predefined
+ minimum value, OR the level in the B-tree contains just
+ one page, OR the page will become empty: we recommend
+ compression if this is not the root page. */
+
+ return(dict_index_get_page(cursor->index)
+ == page_get_page_no(page));
+ }
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
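
A hedged sketch (not part of the patch) of the delete idiom the two inline
predicates above support: take the cheap path when the page stays well filled,
otherwise signal that the pessimistic path (which may merge or discard the
page) is needed. The helper name and the simplified return convention are
assumptions of the sketch.

	/* Hedged sketch, not part of the patch: use the predicate above to
	choose between the optimistic and the pessimistic delete path.
	Returns TRUE if the record was deleted on the cheap path. */
	static ibool
	example_try_cheap_delete(btr_cur_t* cursor, ulint rec_size, mtr_t* mtr)
	{
		if (btr_cur_can_delete_without_compress(cursor, rec_size, mtr)) {

			/* The page stays well enough filled: no merge needed. */
			return(btr_cur_optimistic_delete(cursor, mtr));
		}

		/* The caller should fall back to btr_cur_pessimistic_delete(),
		which may merge or discard the page. */
		return(FALSE);
	}
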
diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h
new file mode 100644
index 00000000000..2334a266280
--- /dev/null
+++ b/storage/xtradb/include/btr0pcur.h
@@ -0,0 +1,551 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0pcur_h
+#define btr0pcur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "btr0types.h"
+
+/* Relative positions for a stored cursor position */
+#define BTR_PCUR_ON 1
+#define BTR_PCUR_BEFORE 2
+#define BTR_PCUR_AFTER 3
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+#define BTR_PCUR_BEFORE_FIRST_IN_TREE 4 /* in an empty tree */
+#define BTR_PCUR_AFTER_LAST_IN_TREE 5 /* in an empty tree */
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void);
+/*============================*/
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+ btr_pcur_t* cursor); /*!< in, own: persistent cursor */
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+ btr_pcur_t* pcur_receive, /*!< in: pcur which will receive the
+ position info */
+ btr_pcur_t* pcur_donate); /*!< in: pcur from which the info is
+ copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_func(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m) \
+ btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
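+/* Illustrative sketch (not part of the original InnoDB sources): one
+plausible way a caller pairs the declarations above, assuming it already
+owns a valid dict_index_t* "index" and a search tuple "tuple"; all error
+handling is omitted.
+
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	rec_t*		rec;
+
+	mtr_start(&mtr);
+
+	btr_pcur_open(index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF,
+		      &pcur, &mtr);
+
+	if (btr_pcur_is_on_user_rec(&pcur)) {
+		rec = btr_pcur_get_rec(&pcur);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+The record pointer is only safe to dereference while the mini-transaction
+still holds the leaf page latch, i.e. before mtr_commit(). */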
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if has_search_latch != 0 then
+ we may not acquire a latch on the cursor
+ page, but assume that the caller uses its
+ btr search latch to protect the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m) \
+ btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m)
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in: cursor */
+ ibool do_init, /*!< in: TRUE if should be initialized */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ btr_pcur_t* cursor); /*!< in: memory buffer for persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ btr_pcur_t* cursor); /*!< in: memory buffer for persistent cursor */
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the second case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF or
+ BTR_MODIFY_LEAF */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent
+ cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m) \
+ btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m) \
+ btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Frees the possible old_rec_buf buffer of a persistent cursor and sets the
+latch mode of the persistent cursor to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: detached persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mtr */
+#define btr_pcur_restore_position(l,cur,mtr) \
+ btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
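+/* Illustrative sketch (not part of the original InnoDB sources): the
+save/restore pattern that btr_pcur_store_position() and
+btr_pcur_restore_position() support, assuming "pcur" is currently
+positioned inside mini-transaction "mtr"; error handling is omitted.
+
+	btr_pcur_store_position(&pcur, &mtr);
+	btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+	(do work here without holding any page latches)
+
+	mtr_start(&mtr);
+
+	if (btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr)) {
+		(the cursor is again on a record whose ordering fields
+		equal those of the stored record)
+	}
+
+A FALSE return value still leaves the cursor positioned according to cases
+(1)-(4) above, but the exact stored record may no longer exist. */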
+/**************************************************************//**
+If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF,
+releases the page latch and bufferfix reserved by the cursor.
+NOTE! In the case of BTR_MODIFY_LEAF, there should not exist changes
+made by the current mini-transaction to the data protected by the
+cursor latch, as then the latch must not be released until mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_release_leaf(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in, own: mtr */
+/*********************************************************//**
+Gets the mtr field for a pcur.
+@return mtr */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+ btr_pcur_t* cursor); /*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr to commit */
+/**************************************************************//**
+Tests if a cursor is detached, that is, its latch mode is BTR_NO_LATCHES.
+@return TRUE if detached */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+ btr_pcur_t* pcur); /*!< in: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and bufferunfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; must be on the
+ last record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record
+of the page. Releases the latch on the current page, and bufferunfixes
+it. Note that to prevent a possible deadlock, the operation first
+stores the position of the cursor, releases the leaf latch, acquires
+necessary latches and restores the cursor position again before returning.
+The alphabetical position of the cursor is guaranteed to be sensible
+on return, but it may happen that the cursor is not positioned on the
+last record of any page, because the structure of the tree may have
+changed while the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor, must be on the
+ first record of the current page */
+ mtr_t* mtr); /*!< in: mtr */
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor); /*!< in: persistent cursor */
+#else /* UNIV_DEBUG */
+# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor);/*!< in/out: persistent cursor */
+
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_struct{
+ btr_cur_t btr_cur; /*!< a B-tree cursor */
+ ulint latch_mode; /*!< see TODO note below!
+ BTR_SEARCH_LEAF, BTR_MODIFY_LEAF,
+ BTR_MODIFY_TREE, or BTR_NO_LATCHES,
+ depending on the latching state of
+ the page and tree where the cursor is
+ positioned; the last value means that
+ the cursor is not currently positioned:
+ we say then that the cursor is
+ detached; it can be restored to
+ attached if the old position was
+ stored in old_rec */
+ ulint old_stored; /*!< BTR_PCUR_OLD_STORED
+ or BTR_PCUR_OLD_NOT_STORED */
+ rec_t* old_rec; /*!< if cursor position is stored,
+ contains an initial segment of the
+ latest record cursor was positioned
+ either on, before, or after */
+ ulint old_n_fields; /*!< number of fields in old_rec */
+ ulint rel_pos; /*!< BTR_PCUR_ON, BTR_PCUR_BEFORE, or
+ BTR_PCUR_AFTER, depending on whether
+ cursor was on, before, or after the
+ old_rec record */
+ buf_block_t* block_when_stored;/*!< buffer block when the position was
+ stored */
+ ib_uint64_t modify_clock; /*!< the modify clock value of the
+ buffer block when the cursor position
+ was stored */
+ ulint pos_state; /*!< see TODO note below!
+ BTR_PCUR_IS_POSITIONED,
+ BTR_PCUR_WAS_POSITIONED,
+ BTR_PCUR_NOT_POSITIONED */
+ ulint search_mode; /*!< PAGE_CUR_G, ... */
+ trx_t* trx_if_known; /*!< the transaction, if we know it;
+ otherwise this field is not defined;
+ can ONLY BE USED in error prints in
+ fatal assertion failures! */
+ /*-----------------------------*/
+ /* NOTE that the following fields may possess dynamically allocated
+ memory which should be freed if not needed anymore! */
+
+ mtr_t* mtr; /*!< NULL, or this field may contain
+ a mini-transaction which holds the
+ latch on the cursor page */
+ byte* old_rec_buf; /*!< NULL, or a dynamically allocated
+ buffer for old_rec */
+ ulint buf_size; /*!< old_rec_buf size if old_rec_buf
+ is not NULL */
+};
+
+#define BTR_PCUR_IS_POSITIONED 1997660512 /* TODO: currently, the state
+ can be BTR_PCUR_IS_POSITIONED,
+ though it really should be
+ BTR_PCUR_WAS_POSITIONED,
+ because we have no obligation
+ to commit the cursor with
+ mtr; similarly latch_mode may
+ be out of date. This can
+ lead to problems if btr_pcur
+ is not used the right way;
+ all current code should be
+ ok. */
+#define BTR_PCUR_WAS_POSITIONED 1187549791
+#define BTR_PCUR_NOT_POSITIONED 1328997689
+
+#define BTR_PCUR_OLD_STORED 908467085
+#define BTR_PCUR_OLD_NOT_STORED 122766467
+
+#ifndef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic
new file mode 100644
index 00000000000..0c38797e6c5
--- /dev/null
+++ b/storage/xtradb/include/btr0pcur.ic
@@ -0,0 +1,642 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.ic
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+ ut_ad(cursor->old_rec);
+ ut_ad(cursor->old_stored == BTR_PCUR_OLD_STORED);
+ ut_ad(cursor->pos_state == BTR_PCUR_WAS_POSITIONED
+ || cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(cursor->rel_pos);
+}
+
+/*********************************************************//**
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in, own: mtr */
+{
+ ut_ad(cursor);
+
+ cursor->mtr = mtr;
+}
+
+/*********************************************************//**
+Gets the mtr field for a pcur.
+@return mtr */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor);
+
+ return(cursor->mtr);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ const btr_cur_t* btr_cur = &cursor->btr_cur;
+ return((btr_cur_t*) btr_cur);
+}
+
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ return(btr_cur_get_page_cur(btr_pcur_get_btr_cur(cursor)));
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_page(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ return(btr_cur_get_block(btr_pcur_get_btr_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(btr_cur_get_rec(btr_pcur_get_btr_cur(cursor)));
+}
+
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+ btr_pcur_t* cursor) /*!< in: memory buffer for persistent cursor */
+{
+ btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ ut_ad(btr_cursor->up_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->up_match);
+}
+
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+ btr_pcur_t* cursor) /*!< in: memory buffer for persistent cursor */
+{
+ btr_cur_t* btr_cursor;
+
+ ut_ad((cursor->pos_state == BTR_PCUR_WAS_POSITIONED)
+ || (cursor->pos_state == BTR_PCUR_IS_POSITIONED));
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+ ut_ad(btr_cursor->low_match != ULINT_UNDEFINED);
+
+ return(btr_cursor->low_match);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+ const btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_pcur_is_before_first_on_page(cursor)
+ || btr_pcur_is_after_last_on_page(cursor)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_before_first(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) != FIL_NULL) {
+
+ return(FALSE);
+ }
+
+ return(page_cur_is_after_last(btr_pcur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+ btr_pcur_t* cursor) /*!< in/out: persistent cursor */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ UT_NOT_USED(mtr);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ page_cur_set_after_last(btr_pcur_get_block(cursor),
+ btr_pcur_get_page_cur(cursor));
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+loop:
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+ } else {
+ btr_pcur_move_to_next_on_page(cursor);
+ }
+
+ if (btr_pcur_is_on_user_rec(cursor)) {
+
+ return(TRUE);
+ }
+
+ goto loop;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+ btr_pcur_t* cursor, /*!< in: persistent cursor; NOTE that the
+ function may release the page latch */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+ ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ if (btr_pcur_is_after_last_on_page(cursor)) {
+
+ if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+ return(FALSE);
+ }
+
+ btr_pcur_move_to_next_page(cursor, mtr);
+
+ return(TRUE);
+ }
+
+ btr_pcur_move_to_next_on_page(cursor);
+
+ return(TRUE);
+}
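+
+/* Illustrative sketch (not part of the original InnoDB sources): a full
+index scan built from the persistent cursor interface, assuming a valid
+dict_index_t* "index" and omitting error handling.
+
+	mtr_t		mtr;
+	btr_pcur_t	pcur;
+	rec_t*		rec;
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_at_index_side(TRUE, index, BTR_SEARCH_LEAF,
+				    &pcur, TRUE, &mtr);
+
+	while (btr_pcur_move_to_next_user_rec(&pcur, &mtr)) {
+		rec = btr_pcur_get_rec(&pcur);
+
+		(process rec while the current leaf page is latched)
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+A long-running scan would normally call btr_pcur_store_position() and
+btr_pcur_restore_position() around any point where the mini-transaction
+has to be committed, as described in btr0pcur.h. */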
+
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+ btr_pcur_t* pcur, /*!< in: persistent cursor */
+ mtr_t* mtr) /*!< in: mtr to commit */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ mtr_commit(mtr);
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Sets the pcur latch mode to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_detach(
+/*============*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+ pcur->latch_mode = BTR_NO_LATCHES;
+
+ pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Tests if a cursor is detached, that is, its latch mode is BTR_NO_LATCHES.
+@return TRUE if detached */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ if (pcur->latch_mode == BTR_NO_LATCHES) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+ btr_pcur_t* pcur) /*!< in: persistent cursor */
+{
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+ pcur->old_rec_buf = NULL;
+ pcur->old_rec = NULL;
+}
+
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_func(
+/*===============*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page from the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ /* Initialize the cursor */
+
+ btr_pcur_init(cursor);
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, 0, file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* tuple, /*!< in: tuple on which search done */
+ ulint mode, /*!< in: PAGE_CUR_L, ...;
+ NOTE that if the search is made using a unique
+ prefix of a record, mode should be
+ PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+ may end up on the previous page of the
+ record! */
+ ulint latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+ NOTE that if has_search_latch != 0 then
+ we may not acquire a latch on the cursor
+ page, but assume that the caller uses its
+ btr search latch to protect the record! */
+ btr_pcur_t* cursor, /*!< in: memory buffer for persistent cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, or 0 */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ btr_cur_t* btr_cursor;
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = mode;
+
+ /* Search with the tree cursor */
+
+ btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+ btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+ btr_cursor, has_search_latch,
+ file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+ ibool from_left, /*!< in: TRUE if open to the low end,
+ FALSE if to the high end */
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: latch mode */
+ btr_pcur_t* pcur, /*!< in: cursor */
+ ibool do_init, /*!< in: TRUE if should be initialized */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ pcur->latch_mode = latch_mode;
+
+ if (from_left) {
+ pcur->search_mode = PAGE_CUR_G;
+ } else {
+ pcur->search_mode = PAGE_CUR_L;
+ }
+
+ if (do_init) {
+ btr_pcur_init(pcur);
+ }
+
+ btr_cur_open_at_index_side(from_left, index, latch_mode,
+ btr_pcur_get_btr_cur(pcur), mtr);
+ pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+
+ pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ pcur->trx_if_known = NULL;
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+ dict_index_t* index, /*!< in: index */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_pcur_t* cursor, /*!< in/out: B-tree pcur */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ /* Initialize the cursor */
+
+ cursor->latch_mode = latch_mode;
+ cursor->search_mode = PAGE_CUR_G;
+
+ btr_pcur_init(cursor);
+
+ btr_cur_open_at_rnd_pos_func(index, latch_mode,
+ btr_pcur_get_btr_cur(cursor),
+ file, line, mtr);
+ cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Frees the possible old_rec_buf buffer of a persistent cursor and sets the
+latch mode of the persistent cursor to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+ btr_pcur_t* cursor) /*!< in: persistent cursor */
+{
+ if (cursor->old_rec_buf != NULL) {
+
+ mem_free(cursor->old_rec_buf);
+
+ cursor->old_rec = NULL;
+ cursor->old_rec_buf = NULL;
+ }
+
+ cursor->btr_cur.page_cur.rec = NULL;
+ cursor->btr_cur.page_cur.block = NULL;
+ cursor->old_rec = NULL;
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ cursor->latch_mode = BTR_NO_LATCHES;
+ cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+
+ cursor->trx_if_known = NULL;
+}
diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h
new file mode 100644
index 00000000000..f6d194319ae
--- /dev/null
+++ b/storage/xtradb/include/btr0sea.h
@@ -0,0 +1,327 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "univ.i"
+
+#include "rem0rec.h"
+#include "dict0dict.h"
+#include "btr0types.h"
+#include "mtr0mtr.h"
+#include "ha0ha.h"
+
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+ ulint hash_size); /*!< in: hash index hash table size */
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void);
+/*=====================*/
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void);
+/*====================*/
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void);
+/*====================*/
+
+/********************************************************************//**
+Returns search info for an index.
+@return search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+ mem_heap_t* heap); /*!< in: heap where created */
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+ btr_search_t* info); /*!< in: search info. */
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ btr_search_t* info, /*!< in: index search info */
+ const dtuple_t* tuple, /*!< in: logical record */
+ ulint mode, /*!< in: PAGE_CUR_L, ... */
+ ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
+ btr_cur_t* cursor, /*!< out: tree cursor */
+ ulint has_search_latch,/*!< in: latch mode the caller
+ currently has on btr_search_latch:
+ RW_S_LATCH, RW_X_LATCH, or 0 */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_page is already hashed,
+then the hash index for page, if any, is dropped. If new_page is not hashed,
+and page is hashed, then a new hash index is built to new_page with the same
+parameters as page (this often happens when a page is split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+ buf_block_t* new_block, /*!< in: records are copied
+ to this page */
+ buf_block_t* block, /*!< in: index page from which
+ records were copied, and the
+ copied records will be deleted
+ from this page */
+ dict_index_t* index); /*!< in: record descriptor */
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+ buf_block_t* block); /*!< in: block containing index page,
+ s- or x-latched, or an index page
+ for which we know that
+ block->buf_fix_count == 0 */
+/********************************************************************//**
+Drops the page hash indexes of all pages that belong to the given index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index_on_index(
+/*=====================================*/
+ dict_index_t* index); /*!< in: record descriptor */
+/********************************************************************//**
+Drops a page hash index when a page is freed from a fseg to the file system.
+Drops possible hash index if the page happens to be in the buffer pool. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no); /*!< in: page number */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned to the
+ place to insert using btr_cur_search_...,
+ and the new record has been inserted next
+ to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+ btr_cur_t* cursor);/*!< in: cursor which was positioned on the
+ record to delete using btr_cur_search_...,
+ the record is not yet deleted */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/********************************************************************//**
+Validates the search system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void);
+/*======================*/
+#else
+# define btr_search_validate() TRUE
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+extern char btr_search_enabled;
+
+/** Flag: whether the search system has completed its disabling process.
+It is set to TRUE right after buf_pool_drop_hash_index() in
+btr_search_disable(), indicating that the hash index entries are cleaned up.
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+extern ibool btr_search_fully_disabled;
+
+/** The search info struct in an index */
+struct btr_search_struct{
+ ulint ref_count; /*!< Number of blocks in this index tree
+ that have search index built
+ i.e. block->index points to this index.
+ Protected by btr_search_latch except
+ when during initialization in
+ btr_search_info_create(). */
+
+ /* @{ The following fields are not protected by any latch.
+ Unfortunately, this means that they must be aligned to
+ the machine word, i.e., they cannot be turned into bit-fields. */
+ buf_block_t* root_guess;/*!< the root page frame when it was last time
+ fetched, or NULL */
+ ulint hash_analysis; /*!< when this exceeds
+ BTR_SEARCH_HASH_ANALYSIS, the hash
+ analysis starts; this is reset if no
+ success noticed */
+ ibool last_hash_succ; /*!< TRUE if the last search would have
+ succeeded, or did succeed, using the hash
+ index; NOTE that the value here is not exact:
+ it is not calculated for every search, and the
+ calculation itself is not always accurate! */
+ ulint n_hash_potential;
+ /*!< number of consecutive searches
+ which would have succeeded, or did succeed,
+ using the hash index;
+ the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+ /* @} */
+ /*---------------------- @{ */
+ ulint n_fields; /*!< recommended prefix length for hash search:
+ number of full fields */
+ ulint n_bytes; /*!< recommended prefix: number of bytes in
+ an incomplete field
+ @see BTR_PAGE_MAX_REC_SIZE */
+ ibool left_side; /*!< TRUE or FALSE, depending on whether
+ the leftmost record of several records with
+ the same prefix should be indexed in the
+ hash index */
+ /*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+ ulint n_hash_succ; /*!< number of successful hash searches thus
+ far */
+ ulint n_hash_fail; /*!< number of failed hash searches */
+ ulint n_patt_succ; /*!< number of successful pattern searches thus
+ far */
+ ulint n_searches; /*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_struct::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N 1112765
+#endif /* UNIV_DEBUG */
+};
+
+/** The hash index system */
+typedef struct btr_search_sys_struct btr_search_sys_t;
+
+/** The hash index system */
+struct btr_search_sys_struct{
+ hash_table_t* hash_index; /*!< the adaptive hash index,
+ mapping dtuple_fold values
+ to rec_t pointers on index pages */
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t* btr_search_sys;
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern rw_lock_t* btr_search_latch_temp;
+
+/** The latch protecting the adaptive search system */
+#define btr_search_latch (*btr_search_latch_temp)
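+
+/* Illustrative sketch (not part of the original InnoDB sources): code that
+reads the structures listed under (1) and (2) above would typically hold the
+latch in shared mode; the rw_lock_s_lock()/rw_lock_s_unlock() interface from
+sync0rw.h is assumed here.
+
+	rw_lock_s_lock(&btr_search_latch);
+
+	if (btr_search_enabled) {
+		(consult the adaptive hash index)
+	}
+
+	rw_lock_s_unlock(&btr_search_latch);
+*/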
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS 17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT 3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT 3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+release it and do not try to keep it again until this many searches have
+passed. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT 10000
+
+#ifndef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0sea.ic b/storage/xtradb/include/btr0sea.ic
new file mode 100644
index 00000000000..beadeeb8d02
--- /dev/null
+++ b/storage/xtradb/include/btr0sea.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+ btr_search_t* info, /*!< in/out: search info */
+ btr_cur_t* cursor);/*!< in: cursor which was just positioned */
+
+/********************************************************************//**
+Returns search info for an index.
+@return search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+
+ return(index->search_info);
+}
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+ dict_index_t* index, /*!< in: index of the cursor */
+ btr_cur_t* cursor) /*!< in: cursor which was just positioned */
+{
+ btr_search_t* info;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+ ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ info = btr_search_get_info(index);
+
+ info->hash_analysis++;
+
+ if (info->hash_analysis < BTR_SEARCH_HASH_ANALYSIS) {
+
+ /* Do nothing */
+
+ return;
+
+ }
+
+ ut_ad(cursor->flag != BTR_CUR_HASH);
+
+ btr_search_info_update_slow(info, cursor);
+}
diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h
new file mode 100644
index 00000000000..ef4a6b04b34
--- /dev/null
+++ b/storage/xtradb/include/btr0types.h
@@ -0,0 +1,51 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0types_h
+#define btr0types_h
+
+#include "univ.i"
+
+#include "rem0types.h"
+#include "page0types.h"
+
+/** Persistent cursor */
+typedef struct btr_pcur_struct btr_pcur_t;
+/** B-tree cursor */
+typedef struct btr_cur_struct btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+typedef struct btr_search_struct btr_search_t;
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define BTR_EXTERN_FIELD_REF_SIZE 20
+
+/** A BLOB field reference full of zero, for use in assertions and tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
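+
+/* Illustrative sketch (not part of the original InnoDB sources): a typical
+use of field_ref_zero is to test whether the BLOB reference stored in the
+last BTR_EXTERN_FIELD_REF_SIZE bytes of a field prefix has been written yet,
+assuming memcmp() from <string.h> is available as it is throughout these
+sources.
+
+	if (!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)) {
+		(the reference is still all-zero, i.e. not yet written)
+	}
+*/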
+
+#endif
diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h
new file mode 100644
index 00000000000..3a35f8e46e9
--- /dev/null
+++ b/storage/xtradb/include/buf0buddy.h
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "univ.i"
+#include "buf0types.h"
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any
+block->mutex. The buf_pool_mutex may only be released and reacquired
+if lru != NULL. This function should only be used for allocating
+compressed page frames or control blocks (buf_page_t). Allocated
+control blocks must be properly initialized immediately after
+buf_buddy_alloc() has returned the memory, before releasing
+buf_pool_mutex.
+@return allocated block, possibly NULL if lru == NULL */
+UNIV_INLINE
+void*
+buf_buddy_alloc(
+/*============*/
+ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ibool* lru, /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+ ibool have_page_hash_mutex)
+ __attribute__((malloc));
+
+/**********************************************************************//**
+Release a block. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ibool have_page_hash_mutex)
+ __attribute__((nonnull));
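/* Editorial usage sketch, not part of this patch: a hypothetical caller
that allocates a compressed page frame with the API declared above,
following the locking rules stated in its comment (buf_pool_mutex held, no
buf_pool_zip_mutex or block->mutex).  The buf_pool_mutex_enter()/exit()
accessors are assumed from buf0buf.h; zip_size is assumed to be a valid
compressed page size. */
static void*
example_alloc_zip_frame(ulint zip_size)	/*!< in: compressed page size */
{
	ibool	lru	= FALSE;
	void*	frame;

	buf_pool_mutex_enter();
	/* Allow allocation from the LRU list; if that happens, lru is set
	to TRUE and buf_pool_mutex was released and reacquired inside
	buf_buddy_alloc().  The frame would later be returned with
	buf_buddy_free(frame, zip_size, FALSE). */
	frame = buf_buddy_alloc(zip_size, &lru, FALSE);
	buf_pool_mutex_exit();

	return(frame);
}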
+
+/** Statistics of buddy blocks of a given size. */
+struct buf_buddy_stat_struct {
+ /** Number of blocks allocated from the buddy system. */
+ ulint used;
+ /** Number of blocks relocated by the buddy system. */
+ ib_uint64_t relocated;
+ /** Total duration of block relocations, in microseconds. */
+ ib_uint64_t relocated_usec;
+};
+
+/** Statistics of buddy blocks of a given size. */
+typedef struct buf_buddy_stat_struct buf_buddy_stat_t;
+
+/** Statistics of the buddy system, indexed by block size.
+Protected by buf_pool_mutex. */
+extern buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+
+#ifndef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+
+#endif /* buf0buddy_h */
diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic
new file mode 100644
index 00000000000..69659fb69d6
--- /dev/null
+++ b/storage/xtradb/include/buf0buddy.ic
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.ic
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "buf0buf.h"
+#include "buf0buddy.h"
+#include "ut0ut.h"
+#include "sync0sync.h"
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
+The buf_pool_mutex may only be released and reacquired if lru != NULL.
+@return allocated block, possibly NULL if lru==NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool* lru, /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+ ibool have_page_hash_mutex)
+ __attribute__((malloc));
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint i, /*!< in: index of buf_pool->zip_free[],
+ or BUF_BUDDY_SIZES */
+ ibool have_page_hash_mutex)
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Get the index of buf_pool->zip_free[] for a given block size.
+@return index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */
+UNIV_INLINE
+ulint
+buf_buddy_get_slot(
+/*===============*/
+ ulint size) /*!< in: block size */
+{
+ ulint i;
+ ulint s;
+
+ for (i = 0, s = BUF_BUDDY_LOW; s < size; i++, s <<= 1) {
+ }
+
+ ut_ad(i <= BUF_BUDDY_SIZES);
+ return(i);
+}
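/* Editorial worked example, not part of this patch: buf_buddy_get_slot()
returns the smallest i such that (BUF_BUDDY_LOW << i) >= size.  The checks
below illustrate this, assuming purely for illustration that
BUF_BUDDY_LOW == 1024 in this build; ut_ad() is a no-op unless UNIV_DEBUG
is defined. */
static void
example_check_buddy_slots(void)
{
	ut_ad(buf_buddy_get_slot(1024) == 0);	/* exactly BUF_BUDDY_LOW */
	ut_ad(buf_buddy_get_slot(1025) == 1);	/* rounded up to 2048 */
	ut_ad(buf_buddy_get_slot(2048) == 1);
	ut_ad(buf_buddy_get_slot(4096) == 2);
}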
+
+/**********************************************************************//**
+Allocate a block. The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any
+block->mutex. The buf_pool_mutex may only be released and reacquired
+if lru != NULL. This function should only be used for allocating
+compressed page frames or control blocks (buf_page_t). Allocated
+control blocks must be properly initialized immediately after
+buf_buddy_alloc() has returned the memory, before releasing
+buf_pool_mutex.
+@return allocated block, possibly NULL if lru == NULL */
+UNIV_INLINE
+void*
+buf_buddy_alloc(
+/*============*/
+ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ibool* lru, /*!< in: pointer to a variable that will be assigned
+ TRUE if storage was allocated from the LRU list
+ and buf_pool_mutex was temporarily released,
+ or NULL if the LRU list should not be used */
+ ibool have_page_hash_mutex)
+{
+ //ut_ad(buf_pool_mutex_own());
+
+ return(buf_buddy_alloc_low(buf_buddy_get_slot(size), lru, have_page_hash_mutex));
+}
+
+/**********************************************************************//**
+Deallocate a block. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+ void* buf, /*!< in: block to be freed, must not be
+ pointed to by the buffer pool */
+ ulint size, /*!< in: block size, up to UNIV_PAGE_SIZE */
+ ibool have_page_hash_mutex)
+{
+ //ut_ad(buf_pool_mutex_own());
+
+ if (!have_page_hash_mutex) {
+ mutex_enter(&LRU_list_mutex);
+ rw_lock_x_lock(&page_hash_latch);
+ }
+
+ mutex_enter(&zip_free_mutex);
+ buf_buddy_free_low(buf, buf_buddy_get_slot(size), TRUE);
+ mutex_exit(&zip_free_mutex);
+
+ if (!have_page_hash_mutex) {
+ mutex_exit(&LRU_list_mutex);
+ rw_lock_x_unlock(&page_hash_latch);
+ }
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
new file mode 100644
index 00000000000..e06927f42f0
--- /dev/null
+++ b/storage/xtradb/include/buf0buf.h
@@ -0,0 +1,1574 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0buf_h
+#define buf0buf_h
+
+#include "univ.i"
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "ut0rbt.h"
+#ifndef UNIV_HOTBACKUP
+#include "os0proc.h"
+#include "srv0srv.h"
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET 10 /*!< get always */
+#define BUF_GET_IF_IN_POOL 11 /*!< get if in pool */
+#define BUF_GET_NO_LATCH 14 /*!< get and bufferfix, but
+ set no latch; we have
+ separated this case, because
+ it is error-prone programming
+ not to set a latch, and it
+ should be used with care */
+/* @} */
+/** @name Modes for buf_page_get_known_nowait */
+/* @{ */
+#define BUF_MAKE_YOUNG 51 /*!< Move the block to the
+ start of the LRU list if there
+ is a danger that the block
+ would drift out of the buffer
+ pool*/
+#define BUF_KEEP_OLD 52 /*!< Preserve the current LRU
+ position of the block. */
+/* @} */
+
+extern buf_pool_t* buf_pool; /*!< The buffer pool of the database */
+#ifdef UNIV_DEBUG
+extern ibool buf_debug_prints;/*!< If this is set TRUE, the program
+ prints info whenever read or flush
+ occurs */
+#endif /* UNIV_DEBUG */
+extern ulint srv_buf_pool_write_requests; /*!< variable to count write request
+ issued */
+#else /* !UNIV_HOTBACKUP */
+extern buf_block_t* back_block1; /*!< first block, for --apply-log */
+extern buf_block_t* back_block2; /*!< second block, for page reorganize */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+/** @brief States of a control block
+@see buf_page_struct
+
+The enumeration values must be 0..7. */
+enum buf_page_state {
+ BUF_BLOCK_ZIP_FREE = 0, /*!< contains a free
+ compressed page */
+ BUF_BLOCK_ZIP_PAGE, /*!< contains a clean
+ compressed page */
+ BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed
+ page that is in the
+ buf_pool->flush_list */
+
+ BUF_BLOCK_NOT_USED, /*!< is in the free list;
+ must be after the BUF_BLOCK_ZIP_
+ constants for compressed-only pages
+ @see buf_block_state_valid() */
+ BUF_BLOCK_READY_FOR_USE, /*!< when buf_LRU_get_free_block
+ returns a block, it is in this state */
+ BUF_BLOCK_FILE_PAGE, /*!< contains a buffered file page */
+ BUF_BLOCK_MEMORY, /*!< contains some main memory
+ object */
+ BUF_BLOCK_REMOVE_HASH /*!< hash index should be removed
+ before putting to the free list */
+};
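/* Editorial sketch, not part of this patch: the ordering requirement in the
comment above (BUF_BLOCK_NOT_USED must come after the BUF_BLOCK_ZIP_*
constants) is what allows code to test "compressed-only page?" with a
single comparison, for example: */
static ibool
example_is_zip_only_state(enum buf_page_state state)
{
	/* TRUE for BUF_BLOCK_ZIP_FREE, BUF_BLOCK_ZIP_PAGE and
	BUF_BLOCK_ZIP_DIRTY; FALSE for every later state. */
	return((ibool) (state < BUF_BLOCK_NOT_USED));
}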
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates the buffer pool.
+@return own: buf_pool object, NULL if not enough memory or error */
+UNIV_INTERN
+buf_pool_t*
+buf_pool_init(void);
+/*===============*/
+/********************************************************************//**
+Frees the buffer pool at shutdown. This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(void);
+/*===============*/
+
+/********************************************************************//**
+Drops the adaptive hash index. To prevent a livelock, this function
+is only to be called while holding btr_search_latch and while
+btr_search_enabled == FALSE. */
+UNIV_INTERN
+void
+buf_pool_drop_hash_index(void);
+/*==========================*/
+
+/********************************************************************//**
+Relocate a buffer control block. Relocates the block on the LRU list
+and in buf_pool->page_hash. Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+ buf_page_t* bpage, /*!< in/out: control block being relocated;
+ buf_page_get_state(bpage) must be
+ BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+ buf_page_t* dpage) /*!< in/out: destination control block */
+ __attribute__((nonnull));
+/********************************************************************//**
+Resizes the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_resize(void);
+/*=================*/
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INLINE
+ib_uint64_t
+buf_pool_get_oldest_modification(void);
+/*==================================*/
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ ulint zip_size); /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block); /*!< in, own: block to be freed */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ byte* buf, /*!< in: buffer to copy to */
+ const buf_frame_t* frame); /*!< in: buffer frame */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+NOTE! The following macros should be used instead of buf_page_get_gen,
+to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
+in LA! */
+#define buf_page_get(SP, ZS, OF, LA, MTR) buf_page_get_gen(\
+ SP, ZS, OF, LA, NULL,\
+ BUF_GET, __FILE__, __LINE__, MTR)
+/**************************************************************//**
+Use these macros to bufferfix a page with no latching. Remember not to
+read the contents of the page unless you know it is safe. Do not modify
+the contents of the page! We have separated this case, because it is
+error-prone programming not to set a latch, and it should be used
+with care. */
+#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
+ SP, ZS, OF, RW_NO_LATCH, NULL,\
+ BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
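/* Editorial usage sketch, not part of this patch: buf_page_get() is meant
to be called inside a mini-transaction, which tracks the page latch and
releases it at commit.  This hypothetical reader assumes mtr0mtr.h is
included for mtr_start()/mtr_commit(); SYNC_NO_ORDER_CHECK is the generic
latching-order level from sync0sync.h. */
static void
example_read_page(ulint space, ulint zip_size, ulint page_no)
{
	mtr_t		mtr;
	buf_block_t*	block;

	mtr_start(&mtr);

	/* S-latch the page; the latch is registered in the mtr and
	released by mtr_commit(). */
	block = buf_page_get(space, zip_size, page_no, RW_S_LATCH, &mtr);
	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

	/* ... read data from buf_block_get_frame(block) ... */

	mtr_commit(&mtr);
}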
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: guessed block */
+ ib_uint64_t modify_clock,/*!< in: modify clock value if mode is
+ ..._GUESS_ON_CLOCK */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
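/* Editorial sketch, not part of this patch: the typical optimistic access
pattern.  A caller remembers a block pointer and its modify clock
(buf_block_get_modify_clock(), declared below) while holding a latch,
releases the latch, and later tries to reacquire the same page cheaply.
If the clock has changed, FALSE is returned and the caller must fall back
to a normal buf_page_get(). */
static ibool
example_optimistic_restore(
	buf_block_t*	block,		/*!< in: previously latched block */
	ib_uint64_t	saved_clock,	/*!< in: modify clock value saved
					while the block was latched */
	mtr_t*		mtr)		/*!< in: mini-transaction */
{
	return(buf_page_optimistic_get(RW_S_LATCH, block, saved_clock,
				       __FILE__, __LINE__, mtr));
}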
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+ buf_block_t* block, /*!< in: the known page */
+ ulint mode, /*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/*******************************************************************//**
+Given a tablespace id and page number, tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for use while holding the kernel mutex. */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+ ulint space_id,/*!< in: tablespace id */
+ ulint page_no,/*!< in: page number */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/** Tries to get a page. If the page is not in the buffer pool it is
+not loaded. Suitable for use while holding the kernel mutex.
+@param space_id in: tablespace id
+@param page_no in: page number
+@param mtr in: mini-transaction
+@return the page if in buffer pool, NULL if not */
+#define buf_page_try_get(space_id, page_no, mtr) \
+ buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr);
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch. Mutual exclusion has to
+be implemented at a higher level. In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return pointer to the block, or NULL if not compressed */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size */
+ ulint offset);/*!< in: page number */
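/* Editorial usage sketch, not part of this patch: buf_page_get_zip() pins
the compressed page, and the caller must release it with
buf_page_release_zip(), declared further below.  Higher-level mutual
exclusion is assumed, as required by the comment above; the parameters are
hypothetical. */
static void
example_read_zip_page(ulint space, ulint zip_size, ulint page_no)
{
	buf_page_t*	bpage = buf_page_get_zip(space, zip_size, page_no);

	if (bpage != NULL) {
		/* ... read the zip_size bytes at bpage->zip.data ... */

		buf_page_release_zip(bpage);
	}
}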
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint offset, /*!< in: page number */
+ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+ buf_block_t* guess, /*!< in: guessed block or NULL */
+ ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+ BUF_GET_NO_LATCH */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line where called */
+ mtr_t* mtr); /*!< in: mini-transaction */
+/********************************************************************//**
+Initializes a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer pool. This is one
+of the functions which perform a state transition NOT_USED =>
+FILE_PAGE on a block (the other is buf_page_get_gen).
+@return pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space in units of
+ a page */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Initializes a page in the buffer pool, for use in ibbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset, /*!< in: offset of the page within space
+ in units of a page */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ buf_block_t* block); /*!< in: block to init */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage); /*!< in: buffer block */
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+ buf_page_t* bpage); /*!< in: buffer block of a file page */
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page is not yet read from disk,
+though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+This function should be called when a file page is allocated again, so
+that the debug version no longer treats accesses to the page as accesses
+to a freed page.
+@return control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+ ulint space, /*!< in: space id */
+ ulint offset); /*!< in: page number */
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+ __attribute__((pure));
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+ __attribute__((pure));
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is a
+danger of the block dropping out of the buffer pool. NOTE: does not
+reserve the buffer pool mutex.
+@return TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+ const buf_page_t* bpage); /*!< in: block to make younger */
+/********************************************************************//**
+Returns the current state of is_hashed of a page. FALSE if the page is
+not in the pool. NOTE that this operation does not fix the page in the
+pool if it is found there.
+@return TRUE if page hash index is built in search system */
+UNIV_INTERN
+ibool
+buf_page_peek_if_search_hashed(
+/*===========================*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet.
+@return newest modification to page */
+UNIV_INLINE
+ib_uint64_t
+buf_page_get_newest_modification(
+/*=============================*/
+ const buf_page_t* bpage); /*!< in: block containing the
+ page frame */
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either (1) own
+the buf_pool mutex and the block bufferfix count must be zero, or (2) own an
+x-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block); /*!< in: block */
+#else /* !UNIV_HOTBACKUP */
+# define buf_block_modify_clock_inc(block) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value
+on 32-bit and 64-bit architectures.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+ const byte* page); /*!< in: buffer page */
+/********************************************************************//**
+Variant of buf_calc_page_new_checksum() that computes the checksum
+32 bits at a time; used when XtraDB fast checksums are enabled.
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum_32(
+/*==========================*/
+ const byte* page); /*!< in: buffer page */
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+ const byte* page); /*!< in: buffer page */
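/* Editorial sketch, not part of this patch: the ordering constraint stated
above, written out.  The old-formula checksum reads the first bytes of the
page, which include FIL_PAGE_SPACE_OR_CHKSUM, so the new-formula checksum
must be stored first.  mach_write_to_4() is assumed from mach0data.h and
the field offsets from fil0fil.h; a real writer would also stamp the LSN
fields. */
static void
example_stamp_checksums(byte* page)	/*!< in/out: uncompressed page */
{
	/* 1. new-formula checksum into the page header */
	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			buf_calc_page_new_checksum(page));

	/* 2. only then the old-formula checksum into the page trailer */
	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			buf_calc_page_old_checksum(page));
}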
+/********************************************************************//**
+Checks if a page is corrupt.
+@return TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size); /*!< in: size of compressed page;
+ 0 for uncompressed pages */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ const void* ptr, /*!< in: pointer to a buffer frame */
+ ulint* space, /*!< out: space id */
+ fil_addr_t* addr); /*!< out: page offset and byte offset */
+/**********************************************************************//**
+Gets the hash value of a block. This can be used in searches in the
+lock hash table.
+@return lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+ const buf_block_t* block) /*!< in: block */
+ __attribute__((pure));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+ const void* data); /*!< in: pointer to compressed page */
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates the buffer pool data structure.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void);
+/*==============*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer pool data structure. */
+UNIV_INTERN
+void
+buf_print(void);
+/*============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+ const byte* read_buf, /*!< in: a database page */
+ ulint zip_size); /*!< in: compressed page size, or
+ 0 for uncompressed pages */
+/********************************************************************//**
+Decompress a block.
+@return TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+ buf_block_t* block, /*!< in/out: block */
+ ibool check); /*!< in: TRUE=verify the page checksum */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in the buffer pool.
+@return number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void);
+/*==============================*/
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Returns the number of pending buf pool ios.
+@return number of pending I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_ios(void);
+/*=======================*/
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Returns the percentage of modified (dirty) pages among all database pages
+in the buffer pool.
+@return modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void);
+/*============================*/
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(void);
+/*======================*/
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void);
+/*===============*/
+/*********************************************************************//**
+Checks that there currently are no pending i/o-operations for the buffer
+pool.
+@return TRUE if there is no pending i/o */
+UNIV_INTERN
+ibool
+buf_pool_check_no_pending_io(void);
+/*==============================*/
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void);
+/*=====================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ ulint level); /*!< in: latching order level */
+#else /* UNIV_SYNC_DEBUG */
+# define buf_block_dbg_add_level(block, level) /* nothing */
+#endif /* UNIV_SYNC_DEBUG */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+ const buf_page_t* bpage); /*!< in: pointer to the control block */
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+ buf_page_t* bpage, /*!< in/out: pointer to control block */
+ enum buf_page_state state); /*!< in: state */
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ enum buf_page_state state); /*!< in: state */
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return pointer to mutex protecting bpage */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Gets the mutex of a block and enters it, re-checking after the acquisition
+that it is still the correct mutex for the block's current state. */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex_enter(
+/*=========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return flush type */
+UNIV_INLINE
+enum buf_flush
+buf_page_get_flush_type(
+/*====================*/
+ const buf_page_t* bpage) /*!< in: buffer page */
+ __attribute__((pure));
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+ buf_page_t* bpage, /*!< in: buffer page */
+ enum buf_flush flush_type); /*!< in: flush type */
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ ulint space, /*!< in: tablespace id */
+ ulint page_no);/*!< in: page number */
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ enum buf_io_fix io_fix);/*!< in: io_fix state */
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+ buf_block_t* block, /*!< in/out: control block */
+ enum buf_io_fix io_fix);/*!< in: io_fix state */
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory. The block
+can be dirty, but it must not be I/O-fixed or bufferfixed.
+@return TRUE if the block can be relocated */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+ const buf_page_t* bpage) /*!< control block being relocated */
+ __attribute__((pure));
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool old); /*!< in: old */
+/*********************************************************************//**
+Determine the time of first access of a block in the buffer pool.
+@return ut_time_ms() at the time of first access, 0 if not accessed */
+UNIV_INLINE
+unsigned
+buf_page_is_accessed(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: control block */
+ __attribute__((nonnull, pure));
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ulint time_ms) /*!< in: ut_time_ms() */
+ __attribute__((nonnull));
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL.
+@return control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+ buf_page_t* bpage) /*!< in: control block, or NULL */
+ __attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+#else /* UNIV_DEBUG */
+# define buf_block_get_frame(block) (block ? (block)->frame : 0)
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+ __attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+ (UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block to whose frame the pointer is pointing to.
+@return pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+ const byte* ptr); /*!< in: pointer to a frame */
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it
+@return TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+ const void* ptr); /*!< in: pointer not
+ dereferenced */
+/** Find out if a pointer corresponds to a buf_block_t::mutex.
+@param m in: mutex candidate
+@return TRUE if m is a buf_block_t::mutex */
+#define buf_pool_is_block_mutex(m) \
+ buf_pointer_is_block_field((const void*)(m))
+/** Find out if a pointer corresponds to a buf_block_t::lock.
+@param l in: rw-lock candidate
+@return TRUE if l is a buf_block_t::lock */
+#define buf_pool_is_block_lock(l) \
+ buf_pointer_is_block_field((const void*)(l))
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+ const byte* ptr); /*!< in: pointer to the page */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/********************************************************************//**
+Function which inits a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version,/*!< in: prevents reading from a wrong
+ version of the tablespace in case we have done
+ DISCARD + IMPORT */
+ ulint offset);/*!< in: page number */
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_io_complete(
+/*=================*/
+ buf_page_t* bpage, /*!< in: pointer to the block in question */
+	trx_t*		trx);	/*!< in: transaction on whose behalf the
+				I/O is done, or NULL */
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+ __attribute__((const));
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: offset of the page within space */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found
+or an uncompressed page frame does not exist.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset);/*!< in: offset of the page within space */
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void);
+/*=======================*/
+#endif /* !UNIV_HOTBACKUP */
+
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+struct buf_page_struct{
+ /** @name General fields
+ None of these bit-fields must be modified without holding
+ buf_page_get_mutex() [buf_block_struct::mutex or
+ buf_pool_zip_mutex], since they can be stored in the same
+ machine word. Some of these fields are additionally protected
+ by buf_pool_mutex. */
+ /* @{ */
+
+ unsigned space:32; /*!< tablespace id; also protected
+ by buf_pool_mutex. */
+ unsigned offset:32; /*!< page number; also protected
+ by buf_pool_mutex. */
+
+ unsigned state:3; /*!< state of the control block; also
+ protected by buf_pool_mutex.
+ State transitions from
+ BUF_BLOCK_READY_FOR_USE to
+ BUF_BLOCK_MEMORY need not be
+ protected by buf_page_get_mutex().
+ @see enum buf_page_state */
+#ifndef UNIV_HOTBACKUP
+ unsigned flush_type:2; /*!< if this block is currently being
+ flushed to disk, this tells the
+ flush_type.
+ @see enum buf_flush */
+ unsigned io_fix:2; /*!< type of pending I/O operation;
+ also protected by buf_pool_mutex
+ @see enum buf_io_fix */
+	unsigned	buf_fix_count:25;/*!< count of how many times this
+					block is currently bufferfixed */
+ /* @} */
+#endif /* !UNIV_HOTBACKUP */
+ page_zip_des_t zip; /*!< compressed page; zip.data
+ (but not the data it points to) is
+ also protected by buf_pool_mutex */
+#ifndef UNIV_HOTBACKUP
+ buf_page_t* hash; /*!< node used in chaining to
+ buf_pool->page_hash or
+ buf_pool->zip_hash */
+#ifdef UNIV_DEBUG
+ ibool in_page_hash; /*!< TRUE if in buf_pool->page_hash */
+ ibool in_zip_hash; /*!< TRUE if in buf_pool->zip_hash */
+#endif /* UNIV_DEBUG */
+
+ /** @name Page flushing fields
+ All these are protected by buf_pool_mutex. */
+ /* @{ */
+
+ /* UT_LIST_NODE_T(buf_page_t) list; */
+ /*!< based on state, this is a
+ list node, protected only by
+ buf_pool_mutex, in one of the
+ following lists in buf_pool:
+
+ - BUF_BLOCK_NOT_USED: free
+ - BUF_BLOCK_FILE_PAGE: flush_list
+ - BUF_BLOCK_ZIP_DIRTY: flush_list
+ - BUF_BLOCK_ZIP_PAGE: zip_clean
+ - BUF_BLOCK_ZIP_FREE: zip_free[]
+
+ The contents of the list node
+ is undefined if !in_flush_list
+ && state == BUF_BLOCK_FILE_PAGE,
+ or if state is one of
+ BUF_BLOCK_MEMORY,
+ BUF_BLOCK_REMOVE_HASH or
+			BUF_BLOCK_READY_FOR_USE. */
+
+ /* resplit for optimistic use */
+ UT_LIST_NODE_T(buf_page_t) free;
+ UT_LIST_NODE_T(buf_page_t) flush_list;
+ UT_LIST_NODE_T(buf_page_t) zip_list; /* zip_clean or zip_free[] */
+#ifdef UNIV_DEBUG
+ ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list;
+ when buf_pool_mutex is free, the
+ following should hold: in_flush_list
+ == (state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_ZIP_DIRTY) */
+ ibool in_free_list; /*!< TRUE if in buf_pool->free; when
+ buf_pool_mutex is free, the following
+ should hold: in_free_list
+ == (state == BUF_BLOCK_NOT_USED) */
+#endif /* UNIV_DEBUG */
+ ib_uint64_t newest_modification;
+ /*!< log sequence number of
+ the youngest modification to
+ this block, zero if not
+ modified */
+ ib_uint64_t oldest_modification;
+ /*!< log sequence number of
+ the START of the log entry
+ written of the oldest
+ modification to this block
+ which has not yet been flushed
+ on disk; zero if all
+ modifications are on disk */
+ /* @} */
+ /** @name LRU replacement algorithm fields
+ These fields are protected by buf_pool_mutex only (not
+ buf_pool_zip_mutex or buf_block_struct::mutex). */
+ /* @{ */
+
+ UT_LIST_NODE_T(buf_page_t) LRU;
+ /*!< node of the LRU list */
+//#ifdef UNIV_DEBUG
+ ibool in_LRU_list; /*!< TRUE if the page is in
+ the LRU list; used in
+ debugging */
+//#endif /* UNIV_DEBUG */
+ unsigned old:1; /*!< TRUE if the block is in the old
+ blocks in buf_pool->LRU_old */
+ unsigned freed_page_clock:31;/*!< the value of
+ buf_pool->freed_page_clock
+ when this block was the last
+ time put to the head of the
+ LRU list; a thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ unsigned access_time:32; /*!< time of first access, or
+ 0 if the block was never accessed
+ in the buffer pool */
+ /* @} */
+ ibool is_corrupt;
+# ifdef UNIV_DEBUG_FILE_ACCESSES
+ ibool file_page_was_freed;
+ /*!< this is set to TRUE when fsp
+ frees a page in buffer pool */
+# endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** The buffer control block structure */
+
+struct buf_block_struct{
+
+ /** @name General fields */
+ /* @{ */
+
+ buf_page_t page; /*!< page information; this must
+ be the first field, so that
+ buf_pool->page_hash can point
+ to buf_page_t or buf_block_t */
+ byte* frame; /*!< pointer to buffer frame which
+ is of size UNIV_PAGE_SIZE, and
+ aligned to an address divisible by
+ UNIV_PAGE_SIZE */
+#ifndef UNIV_HOTBACKUP
+ UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+ /*!< node of the decompressed LRU list;
+ a block is in the unzip_LRU list
+ if page.state == BUF_BLOCK_FILE_PAGE
+ and page.zip.data != NULL */
+//#ifdef UNIV_DEBUG
+ ibool in_unzip_LRU_list;/*!< TRUE if the page is in the
+ decompressed LRU list;
+ used in debugging */
+//#endif /* UNIV_DEBUG */
+ mutex_t mutex; /*!< mutex protecting this block:
+ state (also protected by the buffer
+ pool mutex), io_fix, buf_fix_count,
+ and accessed; we introduce this new
+ mutex in InnoDB-5.1 to relieve
+ contention on the buffer pool mutex */
+ rw_lock_t lock; /*!< read-write lock of the buffer
+ frame */
+ unsigned lock_hash_val:32;/*!< hashed value of the page address
+ in the record lock hash table;
+ protected by buf_block_t::lock
+ (or buf_block_t::mutex, buf_pool_mutex
+ in buf_page_get_gen(),
+ buf_page_init_for_read()
+ and buf_page_create()) */
+ ibool check_index_page_at_flush;
+ /*!< TRUE if we know that this is
+ an index page, and want the database
+ to check its consistency before flush;
+ note that there may be pages in the
+ buffer pool which are index pages,
+ but this flag is not set because
+ we do not keep track of all pages;
+ NOT protected by any mutex */
+ /* @} */
+ /** @name Optimistic search field */
+ /* @{ */
+
+ ib_uint64_t modify_clock; /*!< this clock is incremented every
+ time a pointer to a record on the
+ page may become obsolete; this is
+ used in the optimistic cursor
+ positioning: if the modify clock has
+ not changed, we know that the pointer
+ is still valid; this field may be
+ changed if the thread (1) owns the
+ pool mutex and the page is not
+ bufferfixed, or (2) the thread has an
+ x-latch on the block */
+ /* @} */
+ /** @name Hash search fields (unprotected)
+ NOTE that these fields are NOT protected by any semaphore! */
+ /* @{ */
+
+ ulint n_hash_helps; /*!< counter which controls building
+ of a new hash index for the page */
+ ulint n_fields; /*!< recommended prefix length for hash
+ search: number of full fields */
+ ulint n_bytes; /*!< recommended prefix: number of bytes
+ in an incomplete field */
+ ibool left_side; /*!< TRUE or FALSE, depending on
+ whether the leftmost record of several
+ records with the same prefix should be
+ indexed in the hash index */
+ /* @} */
+
+ /** @name Hash search fields
+ These 6 fields may only be modified when we have
+ an x-latch on btr_search_latch AND
+ - we are holding an s-latch or x-latch on buf_block_struct::lock or
+ - we know that buf_block_struct::buf_fix_count == 0.
+
+ An exception to this is when we init or create a page
+ in the buffer pool in buf0buf.c. */
+
+ /* @{ */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ ulint n_pointers; /*!< used in debugging: the number of
+ pointers in the adaptive hash index
+ pointing to this frame */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ unsigned is_hashed:1; /*!< TRUE if hash index has
+ already been built on this
+ page; note that it does not
+ guarantee that the index is
+ complete, though: there may
+ have been hash collisions,
+ record deletions, etc. */
+ unsigned curr_n_fields:10;/*!< prefix length for hash indexing:
+ number of full fields */
+ unsigned curr_n_bytes:15;/*!< number of bytes in hash
+ indexing */
+ unsigned curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+ dict_index_t* index; /*!< Index for which the adaptive
+ hash index has been created. */
+ /* @} */
+# ifdef UNIV_SYNC_DEBUG
+ /** @name Debug fields */
+ /* @{ */
+ rw_lock_t debug_latch; /*!< in the debug version, each thread
+ which bufferfixes the block acquires
+ an s-latch here; so we can use the
+ debug utilities in sync0rw */
+ /* @} */
+# endif
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** Check if a buf_block_t object is in a valid state
+@param block buffer block
+@return TRUE if valid */
+#define buf_block_state_valid(block) \
+(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \
+ && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH))
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool->zip_hash. */
+/* @{ */
+/* The fold must be relative to the start of the buffer pool when
+srv_buffer_pool_shm_key is enabled, so that it does not depend on the
+address at which the shared memory segment happens to be attached. */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (!srv_buffer_pool_shm_key\
+ ?((ulint) (ptr) / UNIV_PAGE_SIZE)\
+ :((ulint) ((byte*)ptr - (byte*)(buf_pool->chunks->blocks->frame)) / UNIV_PAGE_SIZE))
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A chunk of buffers. The buffer pool is allocated in chunks. */
+struct buf_chunk_struct{
+ ulint mem_size; /*!< allocated size of the chunk */
+ ulint size; /*!< size of frames[] and blocks[] */
+ void* mem; /*!< pointer to the memory area which
+ was allocated for the frames */
+ buf_block_t* blocks; /*!< array of buffer control blocks */
+};
+
+/** @brief The buffer pool statistics structure. */
+struct buf_pool_stat_struct{
+ ulint n_page_gets; /*!< number of page gets performed;
+ also successful searches through
+ the adaptive hash index are
+ counted as page gets; this field
+ is NOT protected by the buffer
+ pool mutex */
+	ulint	n_pages_read;	/*!< number of read operations */
+	ulint	n_pages_written;/*!< number of write operations */
+ ulint n_pages_created;/*!< number of pages created
+ in the pool with no read */
+ ulint n_ra_pages_read;/*!< number of pages read in
+ as part of read ahead */
+ ulint n_ra_pages_evicted;/*!< number of read ahead
+ pages that are evicted without
+ being accessed */
+ ulint n_pages_made_young; /*!< number of pages made young, in
+ calls to buf_LRU_make_block_young() */
+ ulint n_pages_not_made_young; /*!< number of pages not made
+ young because the first access
+ was not long enough ago, in
+ buf_page_peek_if_too_old() */
+};
+
+/** @brief The buffer pool structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_struct{
+
+ /** @name General fields */
+ /* @{ */
+
+ ulint n_chunks; /*!< number of buffer pool chunks */
+ buf_chunk_t* chunks; /*!< buffer pool chunks */
+ ulint curr_size; /*!< current pool size in pages */
+ hash_table_t* page_hash; /*!< hash table of buf_page_t or
+ buf_block_t file pages,
+ buf_page_in_file() == TRUE,
+ indexed by (space_id, offset) */
+ hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks
+ whose frames are allocated to the
+ zip buddy system,
+ indexed by block->frame */
+ ulint n_pend_reads; /*!< number of pending read operations */
+ ulint n_pend_unzip; /*!< number of pending decompressions */
+
+ time_t last_printout_time;
+ /*!< when buf_print_io was last time
+ called */
+ buf_pool_stat_t stat; /*!< current statistics */
+ buf_pool_stat_t old_stat; /*!< old statistics */
+
+ /* @} */
+
+ /** @name Page flushing algorithm fields */
+
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+ /*!< base node of the modified block
+ list */
+ ibool init_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is TRUE when a flush of the
+ given type is being initialized */
+ ulint n_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is the number of pending
+ writes in the given flush type */
+ os_event_t no_flush[BUF_FLUSH_N_TYPES];
+ /*!< this is in the set state
+ when there is no flush batch
+ of the given type running */
+	ib_rbt_t*	flush_rbt;	/*!< a red-black tree is used
+ exclusively during recovery to
+ speed up insertions in the
+ flush_list. This tree contains
+ blocks in order of
+ oldest_modification LSN and is
+ kept in sync with the
+ flush_list.
+ Each member of the tree MUST
+ also be on the flush_list.
+ This tree is relevant only in
+ recovery and is set to NULL
+ once the recovery is over. */
+ ulint freed_page_clock;/*!< a sequence number used
+ to count the number of buffer
+ blocks removed from the end of
+ the LRU list; NOTE that this
+ counter may wrap around at 4
+ billion! A thread is allowed
+ to read this for heuristic
+ purposes without holding any
+ mutex or latch */
+ ulint LRU_flush_ended;/*!< when an LRU flush ends for a page,
+ this is incremented by one; this is
+ set to zero when a buffer block is
+ allocated */
+
+ /* @} */
+ /** @name LRU replacement algorithm fields */
+ /* @{ */
+
+ UT_LIST_BASE_NODE_T(buf_page_t) free;
+ /*!< base node of the free
+ block list */
+ UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+ /*!< base node of the LRU list */
+	buf_page_t*	LRU_old;	/*!< pointer to the approximately
+					buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+					oldest blocks in the LRU list;
+					NULL if the LRU list length is less
+					than BUF_LRU_OLD_MIN_LEN;
+					NOTE: when LRU_old != NULL, the length
+					of the list from LRU_old onward should
+					always equal LRU_old_len */
+ ulint LRU_old_len; /*!< length of the LRU list from
+ the block to which LRU_old points
+ onward, including that block;
+ see buf0lru.c for the restrictions
+ on this value; 0 if LRU_old == NULL;
+ NOTE: LRU_old_len must be adjusted
+ whenever LRU_old shrinks or grows! */
+
+ UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+ /*!< base node of the
+ unzip_LRU list */
+
+ /* @} */
+ /** @name Buddy allocator fields
+ The buddy allocator is used for allocating compressed page
+ frames and buf_page_t descriptors of blocks that exist
+ in the buffer pool only in compressed form. */
+ /* @{ */
+ UT_LIST_BASE_NODE_T(buf_page_t) zip_clean;
+ /*!< unmodified compressed pages */
+ UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX];
+ /*!< buddy free lists */
+//#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
+//# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
+//#endif
+#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
+# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
+#endif
+ /* @} */
+};
+
+/** mutex protecting the buffer pool struct and control blocks, except the
+read-write lock in them */
+extern mutex_t buf_pool_mutex;
+extern mutex_t LRU_list_mutex;
+extern mutex_t flush_list_mutex;
+extern rw_lock_t page_hash_latch;
+extern mutex_t free_list_mutex;
+extern mutex_t zip_free_mutex;
+extern mutex_t zip_hash_mutex;
+/** mutex protecting the control blocks of compressed-only pages
+(of type buf_page_t, not buf_block_t) */
+extern mutex_t buf_pool_zip_mutex;
+
+/** @name Accessors for buf_pool_mutex.
+Use these instead of accessing buf_pool_mutex directly. */
+/* @{ */
+
+/** Test if buf_pool_mutex is owned. */
+#define buf_pool_mutex_own() mutex_own(&buf_pool_mutex)
+/** Acquire the buffer pool mutex. */
+#define buf_pool_mutex_enter() do { \
+ ut_ad(!mutex_own(&buf_pool_zip_mutex)); \
+ mutex_enter(&buf_pool_mutex); \
+} while (0)
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Flag to forbid the release of the buffer pool mutex.
+Protected by buf_pool_mutex. */
+extern ulint buf_pool_mutex_exit_forbidden;
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do { \
+ ut_ad(buf_pool_mutex_own()); \
+ buf_pool_mutex_exit_forbidden++; \
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do { \
+ ut_ad(buf_pool_mutex_own()); \
+ ut_a(buf_pool_mutex_exit_forbidden); \
+ buf_pool_mutex_exit_forbidden--; \
+} while (0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit() do { \
+ ut_a(!buf_pool_mutex_exit_forbidden); \
+ mutex_exit(&buf_pool_mutex); \
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit() mutex_exit(&buf_pool_mutex)
+#endif
+#endif /* !UNIV_HOTBACKUP */
+/* @} */
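+/* Usage sketch (illustrative, not part of the original header): the
+forbid/allow pair brackets code that must not drop the buffer pool mutex:
+
+	buf_pool_mutex_enter();
+	buf_pool_mutex_exit_forbid();
+	(work that relies on the mutex being held throughout)
+	buf_pool_mutex_exit_allow();
+	buf_pool_mutex_exit();
+
+In UNIV_DEBUG/UNIV_BUF_DEBUG builds an early buf_pool_mutex_exit() inside the
+bracket trips the ut_a() assertion; in release builds the pair compiles to
+nothing. */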
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED: is in free list, not in LRU list, not in flush list, nor
+ page hash table
+READY_FOR_USE: is not in free list, LRU list, or flush list, nor page
+ hash table
+MEMORY: is not in free list, LRU list, or flush list, nor page
+ hash table
+FILE_PAGE: space and offset are defined, is in page hash table
+ if io_fix == BUF_IO_WRITE,
+ pool: no_flush[flush_type] is in reset state,
+ pool: n_flush[flush_type] > 0
+
+ (1) if buf_fix_count == 0, then
+ is in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ is x-locked,
+ if and only if io_fix == BUF_IO_READ
+ is s-locked,
+ if and only if io_fix == BUF_IO_WRITE
+
+ (2) if buf_fix_count > 0, then
+ is not in LRU list, not in free list
+ is in flush list,
+ if and only if oldest_modification > 0
+ if io_fix == BUF_IO_READ,
+ is x-locked
+ if io_fix == BUF_IO_WRITE,
+ is s-locked
+
+State transitions:
+
+NOT_USED => READY_FOR_USE
+READY_FOR_USE => MEMORY
+READY_FOR_USE => FILE_PAGE
+MEMORY => NOT_USED
+FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if
+ (1) buf_fix_count == 0,
+ (2) oldest_modification == 0, and
+ (3) io_fix == 0.
+*/
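+/* Illustrative note (not part of the original header): the precondition of
+the FILE_PAGE => NOT_USED transition above is, in essence, what
+buf_flush_ready_for_replace() checks before a block may be evicted:
+
+	ready = bpage->oldest_modification == 0
+		&& bpage->buf_fix_count == 0
+		&& buf_page_get_io_fix(bpage) == BUF_IO_NONE;
+*/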
+
+#ifndef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic
new file mode 100644
index 00000000000..93cc68e7fc9
--- /dev/null
+++ b/storage/xtradb/include/buf0buf.ic
@@ -0,0 +1,1126 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.ic
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "srv0srv.h"
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: block */
+{
+ /* This is sometimes read without holding buf_pool_mutex. */
+ return(bpage->freed_page_clock);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ return(buf_page_get_freed_page_clock(&block->page));
+}
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of it being dropped from the buffer pool. NOTE: does not reserve the buffer
+pool mutex.
+@return TRUE if the block should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+ const buf_page_t* bpage) /*!< in: block to make younger */
+{
+ if (UNIV_UNLIKELY(buf_pool->freed_page_clock == 0)) {
+ /* If eviction has not started yet, do not update the
+ statistics or move blocks in the LRU list. This is
+ either the warm-up phase or an in-memory workload. */
+ return(FALSE);
+ } else if (buf_LRU_old_threshold_ms && bpage->old) {
+ unsigned access_time = buf_page_is_accessed(bpage);
+
+ if (access_time > 0
+ && ((ib_uint32_t) (ut_time_ms() - access_time))
+ >= buf_LRU_old_threshold_ms) {
+ return(TRUE);
+ }
+
+ buf_pool->stat.n_pages_not_made_young++;
+ return(FALSE);
+ } else {
+ /* FIXME: bpage->freed_page_clock is 31 bits */
+ return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
+ > ((ulint) bpage->freed_page_clock
+ + (buf_pool->curr_size
+ * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio)
+ / (BUF_LRU_OLD_RATIO_DIV * 4))));
+ }
+}
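+/* Worked example (illustrative; assumes the default innodb_old_blocks_pct of
+37, i.e. buf_LRU_old_ratio ~= 37 * BUF_LRU_OLD_RATIO_DIV / 100 ~= 379): the
+threshold in the last branch becomes
+
+	curr_size * (1024 - 379) / (1024 * 4) ~= 0.16 * curr_size
+
+so a move to the head of the LRU list is recommended once the global
+freed_page_clock has advanced by roughly 16% of the pool size since the block
+was last made young. */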
+
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void)
+/*========================*/
+{
+ return(buf_pool->curr_size * UNIV_PAGE_SIZE);
+}
+
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return oldest modification in pool, zero if none */
+UNIV_INLINE
+ib_uint64_t
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+{
+ buf_page_t* bpage;
+ ib_uint64_t lsn;
+
+try_again:
+ //buf_pool_mutex_enter();
+ mutex_enter(&flush_list_mutex);
+
+ bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+ if (bpage == NULL) {
+ lsn = 0;
+ } else {
+ ut_ad(bpage->in_flush_list);
+ lsn = bpage->oldest_modification;
+ if (lsn == 0) {
+ mutex_exit(&flush_list_mutex);
+ goto try_again;
+ }
+ }
+
+ //buf_pool_mutex_exit();
+ mutex_exit(&flush_list_mutex);
+
+ /* The returned answer may be out of date: the flush_list can
+ change after the mutex has been released. */
+
+ return(lsn);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ enum buf_page_state state = (enum buf_page_state) bpage->state;
+
+#ifdef UNIV_DEBUG
+ switch (state) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_FILE_PAGE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ default:
+ ut_error;
+ }
+#endif /* UNIV_DEBUG */
+
+ return(state);
+}
+/*********************************************************************//**
+Gets the state of a block.
+@return state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(buf_page_get_state(&block->page));
+}
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+ buf_page_t* bpage, /*!< in/out: pointer to control block */
+ enum buf_page_state state) /*!< in: state */
+{
+#ifdef UNIV_DEBUG
+ enum buf_page_state old_state = buf_page_get_state(bpage);
+
+ switch (old_state) {
+ case BUF_BLOCK_ZIP_FREE:
+ ut_error;
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ ut_a(state == BUF_BLOCK_ZIP_DIRTY);
+ break;
+ case BUF_BLOCK_ZIP_DIRTY:
+ ut_a(state == BUF_BLOCK_ZIP_PAGE);
+ break;
+ case BUF_BLOCK_NOT_USED:
+ ut_a(state == BUF_BLOCK_READY_FOR_USE);
+ break;
+ case BUF_BLOCK_READY_FOR_USE:
+ ut_a(state == BUF_BLOCK_MEMORY
+ || state == BUF_BLOCK_FILE_PAGE
+ || state == BUF_BLOCK_NOT_USED);
+ break;
+ case BUF_BLOCK_MEMORY:
+ ut_a(state == BUF_BLOCK_NOT_USED);
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+ ut_a(state == BUF_BLOCK_NOT_USED
+ || state == BUF_BLOCK_REMOVE_HASH);
+ break;
+ case BUF_BLOCK_REMOVE_HASH:
+ ut_a(state == BUF_BLOCK_MEMORY);
+ break;
+ }
+#endif /* UNIV_DEBUG */
+ bpage->state = state;
+ ut_ad(buf_page_get_state(bpage) == state);
+}
+
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ enum buf_page_state state) /*!< in: state */
+{
+ buf_page_set_state(&block->page, state);
+}
+
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ /* This is a free page in buf_pool->zip_free[].
+ Such pages should only be accessed by the buddy allocator. */
+ /* ut_error; */ /* optimistic */
+ break;
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_FILE_PAGE:
+ return(TRUE);
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->zip.data
+ && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+}
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return pointer to mutex protecting bpage */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_FREE:
+ /* ut_error; */ /* optimistic */
+ return(NULL);
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ return(&buf_pool_zip_mutex);
+ default:
+ return(&((buf_block_t*) bpage)->mutex);
+ }
+}
+
+/*********************************************************************//**
+Gets the mutex of a block and enters it, re-checking which mutex protects
+the block so that the correct one is acquired even if the block state
+changes concurrently.
+@return pointer to mutex protecting bpage, or NULL for BUF_BLOCK_ZIP_FREE */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex_enter(
+/*=========================*/
+ const buf_page_t* bpage) /*!< in: pointer to control block */
+{
+ mutex_t* block_mutex;
+
+	while (1) {
+		block_mutex = buf_page_get_mutex(bpage);
+
+		if (!block_mutex) {
+			return(block_mutex);
+		}
+
+		mutex_enter(block_mutex);
+
+		if (block_mutex == buf_page_get_mutex(bpage)) {
+			return(block_mutex);
+		}
+
+		mutex_exit(block_mutex);
+	}
+}
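+/* Usage sketch (illustrative, not part of the original header): the second
+buf_page_get_mutex() call after mutex_enter() is needed because the block
+state, and therefore the mutex that protects it, may change concurrently
+between the two calls.  A caller typically does
+
+	mutex_t*	block_mutex = buf_page_get_mutex_enter(bpage);
+
+	if (block_mutex) {
+		(operate on bpage under its block mutex)
+		mutex_exit(block_mutex);
+	}
+*/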
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return flush type */
+UNIV_INLINE
+enum buf_flush
+buf_page_get_flush_type(
+/*====================*/
+ const buf_page_t* bpage) /*!< in: buffer page */
+{
+ enum buf_flush flush_type = (enum buf_flush) bpage->flush_type;
+
+#ifdef UNIV_DEBUG
+ switch (flush_type) {
+ case BUF_FLUSH_LRU:
+ case BUF_FLUSH_SINGLE_PAGE:
+ case BUF_FLUSH_LIST:
+ return(flush_type);
+ case BUF_FLUSH_N_TYPES:
+ break;
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(flush_type);
+}
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+ buf_page_t* bpage, /*!< in: buffer page */
+ enum buf_flush flush_type) /*!< in: flush type */
+{
+ bpage->flush_type = flush_type;
+ ut_ad(buf_page_get_flush_type(bpage) == flush_type);
+}
+
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+ buf_block_t* block, /*!< in/out: pointer to control block */
+ ulint space, /*!< in: tablespace id */
+ ulint page_no)/*!< in: page number */
+{
+ buf_block_set_state(block, BUF_BLOCK_FILE_PAGE);
+ block->page.space = space;
+ block->page.offset = page_no;
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ enum buf_io_fix io_fix = (enum buf_io_fix) bpage->io_fix;
+#ifdef UNIV_DEBUG
+ switch (io_fix) {
+ case BUF_IO_NONE:
+ case BUF_IO_READ:
+ case BUF_IO_WRITE:
+ return(io_fix);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return(io_fix);
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(buf_page_get_io_fix(&block->page));
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ enum buf_io_fix io_fix) /*!< in: io_fix state */
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ bpage->io_fix = io_fix;
+ ut_ad(buf_page_get_io_fix(bpage) == io_fix);
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+ buf_block_t* block, /*!< in/out: control block */
+ enum buf_io_fix io_fix) /*!< in: io_fix state */
+{
+ buf_page_set_io_fix(&block->page, io_fix);
+}
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory. The block
+can be dirty, but it must not be I/O-fixed or bufferfixed. */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+ const buf_page_t* bpage) /*!< control block being relocated */
+{
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ ut_ad(buf_page_in_file(bpage));
+ /* optimistic */
+ //ut_ad(bpage->in_LRU_list);
+
+ return(bpage->in_LRU_list && bpage->io_fix == BUF_IO_NONE
+ && bpage->buf_fix_count == 0);
+}
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+ //ut_ad(buf_pool_mutex_own()); /* This is used in optimistic */
+
+ return(bpage->old);
+}
+
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ibool old) /*!< in: old */
+{
+ ut_a(buf_page_in_file(bpage));
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(&LRU_list_mutex));
+ ut_ad(bpage->in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+ ut_a((buf_pool->LRU_old_len == 0) == (buf_pool->LRU_old == NULL));
+ /* If a block is flagged "old", the LRU_old list must exist. */
+ ut_a(!old || buf_pool->LRU_old);
+
+ if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) {
+ const buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
+ const buf_page_t* next = UT_LIST_GET_NEXT(LRU, bpage);
+ if (prev->old == next->old) {
+ ut_a(prev->old == old);
+ } else {
+ ut_a(!prev->old);
+ ut_a(buf_pool->LRU_old == (old ? bpage : next));
+ }
+ }
+#endif /* UNIV_LRU_DEBUG */
+
+ bpage->old = old;
+}
+
+/*********************************************************************//**
+Determine the time of first access of a block in the buffer pool.
+@return ut_time_ms() at the time of first access, 0 if not accessed */
+UNIV_INLINE
+unsigned
+buf_page_is_accessed(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: control block */
+{
+ ut_ad(buf_page_in_file(bpage));
+
+ return(bpage->access_time);
+}
+
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+ buf_page_t* bpage, /*!< in/out: control block */
+ ulint time_ms) /*!< in: ut_time_ms() */
+{
+ ut_a(buf_page_in_file(bpage));
+ //ut_ad(buf_pool_mutex_own());
+ ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+ if (!bpage->access_time) {
+ /* Make this the time of the first access. */
+ bpage->access_time = time_ms;
+ }
+}
+
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL.
+@return control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+ buf_page_t* bpage) /*!< in: control block, or NULL */
+{
+ if (UNIV_LIKELY(bpage != NULL)) {
+ ut_ad(buf_page_in_file(bpage));
+
+ if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+ return((buf_block_t*) bpage);
+ }
+ }
+
+ return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_a(srv_pass_corrupt_table || block);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(0);
+ }
+
+ ut_ad(block);
+
+ switch (buf_block_get_state(block)) {
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ case BUF_BLOCK_NOT_USED:
+ ut_error;
+ break;
+ case BUF_BLOCK_FILE_PAGE:
+# ifndef UNIV_HOTBACKUP
+ ut_a(block->page.buf_fix_count > 0);
+# endif /* !UNIV_HOTBACKUP */
+ /* fall through */
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ goto ok;
+ }
+ ut_error;
+ok:
+ return((buf_frame_t*) block->frame);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ return(bpage->space);
+}
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ return(block->page.space);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ ut_ad(bpage);
+ ut_a(buf_page_in_file(bpage));
+
+ return(bpage->offset);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ ut_ad(block);
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+ return(block->page.offset);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+ const buf_page_t* bpage) /*!< in: pointer to the control block */
+{
+ return(bpage->zip.ssize ? 512 << bpage->zip.ssize : 0);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+ const buf_block_t* block) /*!< in: pointer to the control block */
+{
+ return(block->page.zip.ssize ? 512 << block->page.zip.ssize : 0);
+}
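+/* Illustrative mapping (not part of the original header): zip.ssize encodes
+the compressed page size as 512 << ssize, so ssize values 1..5 correspond to
+1024, 2048, 4096, 8192 and 16384 bytes, while ssize == 0 means the block has
+no compressed page and the functions above return 0. */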
+
+#ifndef UNIV_HOTBACKUP
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+ const byte* ptr) /*!< in: pointer to the page */
+{
+ return(buf_block_get_page_zip(buf_block_align(ptr)));
+}
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+ const void* ptr, /*!< in: pointer to a buffer frame */
+ ulint* space, /*!< out: space id */
+ fil_addr_t* addr) /*!< out: page offset and byte offset */
+{
+ const page_t* page = (const page_t*) ut_align_down(ptr,
+ UNIV_PAGE_SIZE);
+
+ *space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET);
+ addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the hash value of the page the pointer is pointing to. This can be used
+in searches in the lock hash table.
+@return lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+ const buf_block_t* block) /*!< in: block */
+{
+ ut_ad(block);
+ ut_ad(buf_page_in_file(&block->page));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE)
+ || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ return(block->lock_hash_val);
+}
+
+/********************************************************************//**
+Allocates a buffer block.
+@return own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+ ulint zip_size) /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+{
+ buf_block_t* block;
+
+ block = buf_LRU_get_free_block(zip_size);
+
+ buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+ return(block);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+ buf_block_t* block) /*!< in, own: block to be freed */
+{
+ //buf_pool_mutex_enter();
+
+ mutex_enter(&block->mutex);
+
+ ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+ buf_LRU_block_free_non_file_page(block, FALSE);
+
+ mutex_exit(&block->mutex);
+
+ //buf_pool_mutex_exit();
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+ byte* buf, /*!< in: buffer to copy to */
+ const buf_frame_t* frame) /*!< in: buffer frame */
+{
+ ut_ad(buf && frame);
+
+ ut_memcpy(buf, frame, UNIV_PAGE_SIZE);
+
+ return(buf);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ return((space << 20) + space + offset);
+}
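+/* Worked example (illustrative only): for space = 5, offset = 100 the fold
+is (5 << 20) + 5 + 100 = 5242985.  Shifting the space id left by 20 bits
+separates tablespaces for page offsets below 2^20, and the additional
+"+ space" term further perturbs the value to reduce collisions for larger
+offsets. */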
+
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if the block is not a file page or no modification has
+occurred yet.
+@return newest modification to page */
+UNIV_INLINE
+ib_uint64_t
+buf_page_get_newest_modification(
+/*=============================*/
+ const buf_page_t* bpage) /*!< in: block containing the
+ page frame */
+{
+ ib_uint64_t lsn;
+ mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+
+ if (block_mutex && buf_page_in_file(bpage)) {
+ lsn = bpage->newest_modification;
+ } else {
+ lsn = 0;
+ }
+
+ if (block_mutex) {
+ mutex_exit(block_mutex);
+ }
+
+ return(lsn);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either (1) own
+the LRU list mutex while the block bufferfix count is zero, or (2) own an
+x-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad((mutex_own(&LRU_list_mutex)
+ && (block->page.buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+ buf_block_t* block) /*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ return(block->modify_clock);
+}
+
+/*******************************************************************//**
+Increments the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+#ifdef UNIV_SYNC_DEBUG
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line */
+#endif /* UNIV_SYNC_DEBUG */
+ buf_block_t* block) /*!< in/out: block to bufferfix */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ibool ret;
+
+ ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line);
+ ut_a(ret);
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&block->mutex));
+
+ block->page.buf_fix_count++;
+}
+#ifdef UNIV_SYNC_DEBUG
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+#else /* UNIV_SYNC_DEBUG */
+/** Increments the bufferfix count.
+@param b in/out: block to bufferfix
+@param f in: file name where requested
+@param l in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*******************************************************************//**
+Decrements the bufferfix count. */
+UNIV_INLINE
+void
+buf_block_buf_fix_dec(
+/*==================*/
+ buf_block_t* block) /*!< in/out: block to bufferunfix */
+{
+ ut_ad(mutex_own(&block->mutex));
+
+ block->page.buf_fix_count--;
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&block->debug_latch);
+#endif
+}
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ buf_page_t* bpage;
+ ulint fold;
+
+ ut_ad(buf_pool);
+ //ut_ad(buf_pool_mutex_own());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)
+ || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED));
+#endif
+
+ /* Look for the page in the hash table */
+
+ fold = buf_page_address_fold(space, offset);
+
+ HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
+ ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
+ && buf_page_in_file(bpage)),
+ bpage->space == space && bpage->offset == offset);
+ if (bpage) {
+ ut_a(buf_page_in_file(bpage));
+ ut_ad(bpage->in_page_hash);
+ ut_ad(!bpage->in_zip_hash);
+#if UNIV_WORD_SIZE == 4
+ /* On 32-bit systems, there is no padding in
+ buf_page_t. On other systems, Valgrind could complain
+ about uninitialized pad bytes. */
+ UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+ }
+
+ return(bpage);
+}
+
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found
+or an uncompressed page frame does not exist.
+@return block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: offset of the page within space */
+{
+ return(buf_page_get_block(buf_page_hash_get(space, offset)));
+}
+
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page has not yet been read from disk,
+though.
+
+@return TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint offset) /*!< in: page number */
+{
+ const buf_page_t* bpage;
+
+ //buf_pool_mutex_enter();
+ rw_lock_s_lock(&page_hash_latch);
+
+ bpage = buf_page_hash_get(space, offset);
+
+ //buf_pool_mutex_exit();
+ rw_lock_s_unlock(&page_hash_latch);
+
+ return(bpage != NULL);
+}
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+ buf_page_t* bpage) /*!< in: buffer block */
+{
+ buf_block_t* block;
+
+ ut_ad(bpage);
+ ut_a(bpage->buf_fix_count > 0);
+
+ switch (buf_page_get_state(bpage)) {
+ case BUF_BLOCK_ZIP_PAGE:
+ case BUF_BLOCK_ZIP_DIRTY:
+ mutex_enter(&buf_pool_zip_mutex);
+ bpage->buf_fix_count--;
+ mutex_exit(&buf_pool_zip_mutex);
+ return;
+ case BUF_BLOCK_FILE_PAGE:
+ block = (buf_block_t*) bpage;
+ mutex_enter(&block->mutex);
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&block->debug_latch);
+#endif
+ bpage->buf_fix_count--;
+ mutex_exit(&block->mutex);
+ return;
+ case BUF_BLOCK_ZIP_FREE:
+ case BUF_BLOCK_NOT_USED:
+ case BUF_BLOCK_READY_FOR_USE:
+ case BUF_BLOCK_MEMORY:
+ case BUF_BLOCK_REMOVE_HASH:
+ break;
+ }
+
+ ut_error;
+}
+
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+ buf_block_t* block, /*!< in: buffer block */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
+ RW_NO_LATCH */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mtr */
+{
+ ut_ad(block);
+
+ ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_a(block->page.buf_fix_count > 0);
+
+ /* buf_flush_note_modification() should be called before this function. */
+/*
+ if (rw_latch == RW_X_LATCH && mtr->modifications) {
+ buf_pool_mutex_enter();
+ buf_flush_note_modification(block, mtr);
+ buf_pool_mutex_exit();
+ }
+*/
+
+ mutex_enter(&block->mutex);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_s_unlock(&(block->debug_latch));
+#endif
+ block->page.buf_fix_count--;
+
+ mutex_exit(&block->mutex);
+
+ if (rw_latch == RW_S_LATCH) {
+ rw_lock_s_unlock(&(block->lock));
+ } else if (rw_latch == RW_X_LATCH) {
+ rw_lock_x_unlock(&(block->lock));
+ }
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+ buf_block_t* block, /*!< in: buffer page
+ where we have acquired latch */
+ ulint level) /*!< in: latching order level */
+{
+ sync_thread_add_level(&block->lock, level);
+}
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h
new file mode 100644
index 00000000000..2f7108fda1b
--- /dev/null
+++ b/storage/xtradb/include/buf0flu.h
@@ -0,0 +1,218 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0flu_h
+#define buf0flu_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0types.h"
+#include "buf0types.h"
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/********************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage have already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+ buf_page_t* bpage, /*!< in/out: control block being moved */
+ buf_page_t* dpage); /*!< in/out: destination block */
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+/*********************************************************************//**
+Flushes pages from the end of the LRU list if there is too small
+a margin of replaceable pages there. */
+UNIV_INTERN
+void
+buf_flush_free_margin(
+/*=======================*/
+ ibool wait);
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+ byte* page, /*!< in/out: page */
+ void* page_zip_, /*!< in/out: compressed page, or NULL */
+ ib_uint64_t newest_lsn); /*!< in: newest modification lsn
+ to the page */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+ enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU or
+ BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+ then the caller must not own any
+ latches on pages */
+	ulint		min_n,		/*!< in: desired minimum number of blocks
+ flushed (it is not guaranteed that the
+ actual number is that big, though) */
+	ib_uint64_t	lsn_limit);	/*!< in: in the case of BUF_FLUSH_LIST,
+					all blocks whose oldest_modification is
+ smaller than this should be flushed
+ (if their number does not exceed
+ min_n), otherwise ignored */
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+ enum buf_flush type); /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ ib_uint64_t end_lsn); /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED is allowed.
+@return TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+ buf_page_t* bpage); /*!< in: buffer control block, must be
+ buf_page_in_file(bpage) and in the LRU list */
+
+/** @brief Statistics for selecting flush rate based on redo log
+generation speed.
+
+These statistics are generated for heuristics used in estimating the
+rate at which we should flush the dirty blocks to avoid bursty IO
+activity. Note that the rate of flushing not only depends on how many
+dirty pages we have in the buffer pool but is also a function of
+how much redo the workload is generating and at what rate. */
+
+struct buf_flush_stat_struct
+{
+ ib_uint64_t redo; /*!< amount of redo generated. */
+ ulint n_flushed; /*!< number of pages flushed. */
+};
+
+/** Statistics for selecting flush rate of dirty pages. */
+typedef struct buf_flush_stat_struct buf_flush_stat_t;
+/*********************************************************************
+Update the historical stats that we are collecting for flush rate
+heuristics at the end of each interval. */
+UNIV_INTERN
+void
+buf_flush_stat_update(void);
+/*=======================*/
+/*********************************************************************
+Determines the fraction of dirty pages that need to be flushed based
+on the speed at which we generate redo log. Note that if redo log
+is generated at significant rate without a corresponding increase
+in the number of dirty pages (for example, an in-memory workload)
+it can cause IO bursts of flushing. This function implements heuristics
+to avoid this burstiness.
+@return number of dirty pages to be flushed / second */
+UNIV_INTERN
+ulint
+buf_flush_get_desired_flush_rate(void);
+/*==================================*/
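+/* Minimal sketch (illustrative; NOT the actual buf0flu.c algorithm): given
+two successive buf_flush_stat_t samples taken one interval apart, a flush
+rate could be derived along the lines of
+
+	redo_per_sec    = (cur.redo - prev.redo) / interval;
+	flushed_per_sec = (cur.n_flushed - prev.n_flushed) / interval;
+	desired_rate    = a smoothed combination of the two, capped so that an
+			  in-memory workload does not trigger IO bursts;
+
+the declarations above suggest the real implementation aggregates such
+per-interval history before deciding how much to flush. */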
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(void);
+/*====================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/******************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/******************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void);
+/*==========================*/
+
+/** When buf_flush_free_margin is called, it tries to make this many blocks
+available for replacement in the free list and at the end of the LRU list (to
+make sure that a read-ahead batch can be read efficiently in a single
+sweep). */
+#define BUF_FLUSH_FREE_BLOCK_MARGIN (5 + BUF_READ_AHEAD_AREA)
+/** Extra margin to apply above BUF_FLUSH_FREE_BLOCK_MARGIN */
+#define BUF_FLUSH_EXTRA_MARGIN (BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/buf0flu.ic b/storage/xtradb/include/buf0flu.ic
new file mode 100644
index 00000000000..4ad0814f344
--- /dev/null
+++ b/storage/xtradb/include/buf0flu.ic
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.ic
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+ buf_block_t* block); /*!< in/out: block which is modified */
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+ buf_block_t* block); /*!< in/out: block which is modified */
+
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool use_LRU_mutex = FALSE;
+
+ if (UT_LIST_GET_LEN(buf_pool->unzip_LRU))
+ use_LRU_mutex = TRUE;
+
+ if (use_LRU_mutex)
+ mutex_enter(&LRU_list_mutex);
+
+ mutex_enter(&block->mutex);
+
+ ut_ad(block);
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ //ut_ad(buf_pool_mutex_own());
+
+ ut_ad(mtr->start_lsn != 0);
+ ut_ad(mtr->modifications);
+ ut_ad(block->page.newest_modification <= mtr->end_lsn);
+
+ block->page.newest_modification = mtr->end_lsn;
+
+ if (!block->page.oldest_modification) {
+ mutex_enter(&flush_list_mutex);
+
+ block->page.oldest_modification = mtr->start_lsn;
+ ut_ad(block->page.oldest_modification != 0);
+
+ buf_flush_insert_into_flush_list(block);
+ mutex_exit(&flush_list_mutex);
+ } else {
+ ut_ad(block->page.oldest_modification <= mtr->start_lsn);
+ }
+
+ mutex_exit(&block->mutex);
+
+ ++srv_buf_pool_write_requests;
+
+ if (use_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+}
+
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+ buf_block_t* block, /*!< in: block which is modified */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the first mtr in a
+ set of mtr's */
+ ib_uint64_t end_lsn) /*!< in: end lsn of the last mtr in the
+ set of mtr's */
+{
+ ibool use_LRU_mutex = FALSE;
+
+	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU))
+ use_LRU_mutex = TRUE;
+
+ if (use_LRU_mutex)
+ mutex_enter(&LRU_list_mutex);
+
+ mutex_enter(&(block->mutex));
+
+ ut_ad(block);
+ ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ //buf_pool_mutex_enter();
+
+ ut_ad(block->page.newest_modification <= end_lsn);
+
+ block->page.newest_modification = end_lsn;
+
+ if (!block->page.oldest_modification) {
+ mutex_enter(&flush_list_mutex);
+
+ block->page.oldest_modification = start_lsn;
+
+ ut_ad(block->page.oldest_modification != 0);
+
+ buf_flush_insert_sorted_into_flush_list(block);
+ mutex_exit(&flush_list_mutex);
+ } else {
+ ut_ad(block->page.oldest_modification <= start_lsn);
+ }
+
+ //buf_pool_mutex_exit();
+ if (use_LRU_mutex)
+ mutex_exit(&LRU_list_mutex);
+ mutex_exit(&(block->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h
new file mode 100644
index 00000000000..d3b59e8b579
--- /dev/null
+++ b/storage/xtradb/include/buf0lru.h
@@ -0,0 +1,309 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0lru_h
+#define buf0lru_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+
+/** The return type of buf_LRU_free_block() */
+enum buf_lru_free_block_status {
+ /** freed */
+ BUF_LRU_FREED = 0,
+ /** not freed because the caller asked to remove the
+ uncompressed frame but the control block cannot be
+ relocated */
+ BUF_LRU_CANNOT_RELOCATE,
+ /** not freed because of some other reason */
+ BUF_LRU_NOT_FREED
+};
+
+/******************************************************************//**
+Tries to remove LRU flushed blocks from the end of the LRU list and put them
+to the free list. This is beneficial for the efficiency of the insert buffer
+operation, as flushed pages from non-unique non-clustered indexes are here
+taken out of the buffer pool, and their inserts redirected to the insert
+buffer. Otherwise, the flushed blocks could get modified again before read
+operations need new buffer blocks, and the i/o work done in flushing would be
+wasted. */
+UNIV_INTERN
+void
+buf_LRU_try_free_flushed_blocks(void);
+/*==================================*/
+/******************************************************************//**
+Returns TRUE if less than 25 % of the buffer pool is available. This can be
+used in heuristics to prevent huge transactions eating up the whole buffer
+pool for their locks.
+@return TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void);
+/*==============================*/
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN 512 /* 8 megabytes of 16k pages */
+
+/** Maximum LRU list search length in buf_flush_LRU_recommendation() */
+#define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA)
+
+/******************************************************************//**
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. A PROBLEM: if readahead is being started,
+what guarantees that it will not try to read in pages after this operation has
+completed? */
+UNIV_INTERN
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+ ulint id); /*!< in: space id */
+/********************************************************************//**
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: pointer to the block in question */
+
+/******************************************************************//**
+Try to free a block. If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+release buf_pool_mutex. Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
+release these two mutexes after the call. No other
+buf_page_get_mutex() may be held when calling this function.
+@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
+BUF_LRU_NOT_FREED otherwise. */
+UNIV_INTERN
+enum buf_lru_free_block_status
+buf_LRU_free_block(
+/*===============*/
+ buf_page_t* bpage, /*!< in: block to be freed */
+ ibool zip, /*!< in: TRUE if should remove also the
+ compressed page of an uncompressed page */
+ ibool* buf_pool_mutex_released,
+ /*!< in: pointer to a variable that will
+ be assigned TRUE if buf_pool_mutex
+ was temporarily released, or NULL */
+ ibool have_LRU_mutex);
+/******************************************************************//**
+Try to free a replaceable block.
+@return TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_search_and_free_block(
+/*==========================*/
+ ulint n_iterations); /*!< in: how many times this has been called
+ repeatedly without result: a high value means
+ that we should search farther; if
+ n_iterations < 10, then we search
+ n_iterations / 10 * buf_pool->curr_size
+ pages from the end of the LRU list; if
+ n_iterations < 5, then we will also search
+ n_iterations / 5 of the unzip_LRU list. */
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, returns NULL.
+@return a free control block, or NULL if the buf_block->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(void);
+/*=======================*/
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, blocks are moved from the end of the
+LRU list to the free list.
+@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+ ulint zip_size); /*!< in: compressed page size in bytes,
+ or 0 if uncompressed tablespace */
+
+/******************************************************************//**
+Puts a block back to the free list. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+ buf_block_t* block, /*!< in: block, must not contain a file page */
+ ibool have_page_hash_mutex);
+/******************************************************************//**
+Adds a block to the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+ buf_page_t* bpage, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the old
+ blocks in the LRU list, else put to the
+ start; if the LRU list is very short, added to
+ the start regardless of this parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+ buf_block_t* block, /*!< in: control block */
+ ibool old); /*!< in: TRUE if should be put to the end
+ of the list, else put to the start */
+/******************************************************************//**
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+ buf_page_t* bpage); /*!< in: control block */
+/******************************************************************//**
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+ buf_page_t* bpage); /*!< in: control block */
+/**********************************************************************//**
+Updates buf_LRU_old_ratio.
+@return updated old_pct */
+UNIV_INTERN
+uint
+buf_LRU_old_ratio_update(
+/*=====================*/
+ uint old_pct,/*!< in: Reserve this percentage of
+ the buffer pool for "old" blocks. */
+ ibool adjust);/*!< in: TRUE=adjust the LRU list;
+ FALSE=just assign buf_LRU_old_ratio
+ during the initialization of InnoDB */
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void);
+/*=====================*/
+/********************************************************************//**
+Dumps the LRU page list to a dump file. */
+UNIV_INTERN
+ibool
+buf_LRU_file_dump(void);
+/*===================*/
+/********************************************************************//**
+Reads pages back into the buffer pool based on the dump file. */
+UNIV_INTERN
+ibool
+buf_LRU_file_restore(void);
+/*======================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU list.
+@return TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void);
+/*==================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_print(void);
+/*===============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Reserve this fraction, buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV, of
+the buffer pool for "old" blocks. Protected by buf_pool_mutex. */
+extern uint buf_LRU_old_ratio;
+/** The denominator of buf_LRU_old_ratio. */
+#define BUF_LRU_OLD_RATIO_DIV 1024
+/** Maximum value of buf_LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_LRU_old_ratio_update */
+#define BUF_LRU_OLD_RATIO_MAX BUF_LRU_OLD_RATIO_DIV
+/** Minimum value of buf_LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_LRU_old_ratio_update
+The minimum must exceed
+(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
+#define BUF_LRU_OLD_RATIO_MIN 51
+
+#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX
+# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX"
+#endif
+#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV
+# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV"
+#endif
+
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago. Not protected by any mutex or latch. */
+extern uint buf_LRU_old_threshold_ms;
+/* @} */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
+and page_zip_decompress() operations. Based on the statistics we decide
+if we want to evict from buf_pool->unzip_LRU or buf_pool->LRU. */
+struct buf_LRU_stat_struct
+{
+ ulint io; /**< Counter of buffer pool I/O operations. */
+ ulint unzip; /**< Counter of page_zip_decompress operations. */
+};
+
+/** Statistics for selecting the LRU list for eviction. */
+typedef struct buf_LRU_stat_struct buf_LRU_stat_t;
+
+/** Current operation counters. Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update(). Protected by buf_pool_mutex. */
+extern buf_LRU_stat_t buf_LRU_stat_sum;
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
+
+#ifndef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#endif
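The buf_LRU_old_ratio variable declared above is expressed in units of
1/BUF_LRU_OLD_RATIO_DIV (1024ths) of the LRU list length. The standalone
sketch below illustrates the conversion from a user-visible percentage to
that ratio and the resulting target length of the "old" sublist, clamped to
the MIN/MAX limits defined above. It is an illustration of the constants'
meaning only, not the buf0lru.c implementation of buf_LRU_old_ratio_update().

/* Standalone sketch: percentage of the buffer pool reserved for "old"
blocks -> /1024 ratio -> target length of the "old" sublist. */
#include <stdio.h>

#define BUF_LRU_OLD_RATIO_DIV   1024
#define BUF_LRU_OLD_RATIO_MIN   51
#define BUF_LRU_OLD_RATIO_MAX   BUF_LRU_OLD_RATIO_DIV

static unsigned
old_pct_to_ratio(unsigned old_pct)
{
        unsigned ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;

        if (ratio < BUF_LRU_OLD_RATIO_MIN) {
                ratio = BUF_LRU_OLD_RATIO_MIN;
        } else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
                ratio = BUF_LRU_OLD_RATIO_MAX;
        }

        return(ratio);
}

int
main(void)
{
        unsigned ratio = old_pct_to_ratio(37);  /* 37% of the pool */
        unsigned lru_len = 8192;                /* LRU list length, in pages */
        unsigned old_len = lru_len * ratio / BUF_LRU_OLD_RATIO_DIV;

        printf("ratio = %u/%u, old sublist target = %u of %u pages\n",
               ratio, BUF_LRU_OLD_RATIO_DIV, old_len, lru_len);
        return(0);
}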
diff --git a/storage/xtradb/include/buf0lru.ic b/storage/xtradb/include/buf0lru.ic
new file mode 100644
index 00000000000..556f45d987f
--- /dev/null
+++ b/storage/xtradb/include/buf0lru.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.ic
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/buf0rea.h b/storage/xtradb/include/buf0rea.h
new file mode 100644
index 00000000000..56d3d24a3b7
--- /dev/null
+++ b/storage/xtradb/include/buf0rea.h
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "buf0types.h"
+
+/********************************************************************//**
+Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+@return 1 if a read request was queued, 0 if the page already resided
+in buf_pool, or if the page is in the doublewrite buffer blocks in
+which case it is never read into the pool, or if the tablespace does
+not exist or is being dropped */
+UNIV_INTERN
+ulint
+buf_read_page_low(
+/*==============*/
+ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+ trying to read from a non-existent tablespace, or a
+ tablespace which is just now being dropped */
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
+ ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
+ at read-ahead functions) */
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or 0 */
+ ibool unzip, /*!< in: TRUE=request uncompressed page */
+ ib_int64_t tablespace_version, /*!< in: if the space memory object has
+ this timestamp different from what we are giving here,
+ treat the tablespace as dropped; this is a timestamp we
+ use to stop dangling page reads from a tablespace
+ which we have DISCARDed + IMPORTed back */
+ ulint offset, /*!< in: page number */
+ trx_t* trx);
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number */
+ trx_t* trx);
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens if these are not initialized to any
+sensible value? No problem: before applying read-ahead we check that the
+area to read is within the span of the space; if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint offset, /*!< in: page number of a page; NOTE: the current thread
+ must want access to this page (see NOTE 3 above) */
+ trx_t* trx);
+/********************************************************************//**
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract the insert buffer tree. Technically, this function is like
+a read-ahead function. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ const ulint* space_ids, /*!< in: array of space ids */
+ const ib_int64_t* space_versions,/*!< in: the spaces must have
+ this version number
+ (timestamp), otherwise we
+ discard the read; we use this
+ to cancel reads if DISCARD +
+ IMPORT may have changed the
+ tablespace size */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored); /*!< in: number of elements
+ in the arrays */
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+ ibool sync, /*!< in: TRUE if the caller
+ wants this function to wait
+ for the highest address page
+ to get read in, before this
+ function returns */
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in
+ bytes, or 0 */
+ const ulint* page_nos, /*!< in: array of page numbers
+ to read, with the highest page
+ number the last in the
+ array */
+ ulint n_stored); /*!< in: number of page numbers
+ in the array */
+
+/** The size in pages of the area which the read-ahead algorithms read if
+invoked */
+#define BUF_READ_AHEAD_AREA 64
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY 131
+/** read any page */
+#define BUF_READ_ANY_PAGE 132
+/* @} */
+
+#endif
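buf_read_ahead_linear() above operates on aligned areas of
BUF_READ_AHEAD_AREA pages and only considers read-ahead when the accessed
page is a border page of its area. The standalone sketch below shows those
area boundaries under the assumption that areas are aligned 64-page blocks;
it illustrates the "border page" notion from the comment above and is not
the real read-ahead logic.

/* Standalone sketch: the read-ahead area containing a page, and whether
that page is a border page of the area. */
#include <stdio.h>

#define BUF_READ_AHEAD_AREA     64

int
main(void)
{
        unsigned long offset = 1087;    /* page number being accessed */
        unsigned long low;              /* first page of the area */
        unsigned long high;             /* one past the last page */

        low = (offset / BUF_READ_AHEAD_AREA) * BUF_READ_AHEAD_AREA;
        high = low + BUF_READ_AHEAD_AREA;

        printf("area of page %lu: [%lu, %lu)\n", offset, low, high);
        printf("border page: %s\n",
               (offset == low || offset == high - 1) ? "yes" : "no");
        return(0);
}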
diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h
new file mode 100644
index 00000000000..507f1543bbb
--- /dev/null
+++ b/storage/xtradb/include/buf0types.h
@@ -0,0 +1,83 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+/** Buffer page (uncompressed or compressed) */
+typedef struct buf_page_struct buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+typedef struct buf_block_struct buf_block_t;
+/** Buffer pool chunk comprising buf_block_t */
+typedef struct buf_chunk_struct buf_chunk_t;
+/** Buffer pool comprising buf_chunk_t */
+typedef struct buf_pool_struct buf_pool_t;
+/** Buffer pool statistics struct */
+typedef struct buf_pool_stat_struct buf_pool_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef byte buf_frame_t;
+
+/** Flags for flush types */
+enum buf_flush {
+ BUF_FLUSH_LRU = 0, /*!< flush via the LRU list */
+ BUF_FLUSH_SINGLE_PAGE, /*!< flush a single page */
+ BUF_FLUSH_LIST, /*!< flush via the flush list
+ of dirty blocks */
+ BUF_FLUSH_N_TYPES /*!< index of last element + 1 */
+};
+
+/** Flags for io_fix types */
+enum buf_io_fix {
+ BUF_IO_NONE = 0, /**< no pending I/O */
+ BUF_IO_READ, /**< read pending */
+ BUF_IO_WRITE /**< write pending */
+};
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+#if UNIV_WORD_SIZE <= 4 /* 32-bit system */
+/** Base-2 logarithm of the smallest buddy block size */
+# define BUF_BUDDY_LOW_SHIFT 6
+#else /* 64-bit system */
+/** Base-2 logarithm of the smallest buddy block size */
+# define BUF_BUDDY_LOW_SHIFT 7
+#endif
+#define BUF_BUDDY_LOW (1 << BUF_BUDDY_LOW_SHIFT)
+ /*!< minimum block size in the binary
+ buddy system; must be at least
+ sizeof(buf_page_t) */
+#define BUF_BUDDY_SIZES (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
+#define BUF_BUDDY_SIZES_MAX (UNIV_PAGE_SIZE_SHIFT_MAX - BUF_BUDDY_LOW_SHIFT)
+ /*!< number of buddy sizes */
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to UNIV_PAGE_SIZE */
+#define BUF_BUDDY_HIGH (BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+#endif
+
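The buddy-system macros above define a ladder of block sizes from
BUF_BUDDY_LOW up to half of BUF_BUDDY_HIGH. The sketch below simply
evaluates that ladder for a 64-bit system, assuming the traditional 16 KiB
page (UNIV_PAGE_SIZE_SHIFT = 14); it prints what the macros expand to and
nothing more.

/* Standalone sketch: the binary-buddy block sizes implied by the macros
above, for an assumed 16 KiB page on a 64-bit system. */
#include <stdio.h>

#define UNIV_PAGE_SIZE_SHIFT    14      /* assumed: 16 KiB pages */
#define BUF_BUDDY_LOW_SHIFT     7       /* 64-bit system */
#define BUF_BUDDY_LOW           (1 << BUF_BUDDY_LOW_SHIFT)
#define BUF_BUDDY_SIZES         (UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
#define BUF_BUDDY_HIGH          (BUF_BUDDY_LOW << BUF_BUDDY_SIZES)

int
main(void)
{
        int i;

        for (i = 0; i < BUF_BUDDY_SIZES; i++) {
                printf("buddy class %d: %d bytes\n", i, BUF_BUDDY_LOW << i);
        }

        /* Twice the largest buddy block equals the page size (16384 here). */
        printf("BUF_BUDDY_HIGH = %d\n", BUF_BUDDY_HIGH);
        return(0);
}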
diff --git a/storage/xtradb/include/data0data.h b/storage/xtradb/include/data0data.h
new file mode 100644
index 00000000000..f9fce3f3657
--- /dev/null
+++ b/storage/xtradb/include/data0data.h
@@ -0,0 +1,483 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "univ.i"
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+typedef struct big_rec_struct big_rec_t;
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ const dfield_t* field); /*!< in: SQL data field */
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ const dfield_t* field); /*!< in: field */
+#else /* UNIV_DEBUG */
+# define dfield_get_type(field) (&(field)->type)
+# define dfield_get_data(field) ((field)->data)
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ dtype_t* type); /*!< in: pointer to data type struct */
+/*********************************************************************//**
+Gets length of field data.
+@return length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ const dfield_t* field); /*!< in: field */
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len); /*!< in: length or UNIV_SQL_NULL */
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+ const dfield_t* field); /*!< in: field */
+/*********************************************************************//**
+Determines if a field is externally stored
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+ const dfield_t* field); /*!< in: field */
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+ dfield_t* field); /*!< in/out: field */
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len); /*!< in: length or UNIV_SQL_NULL */
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field); /*!< in/out: field */
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len); /*!< in: SQL null size in bytes */
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2);/*!< in: field to copy from */
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2);/*!< in: field to copy from */
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap); /*!< in: memory heap where allocated */
+/*********************************************************************//**
+Tests if data length and content are equal for two dfields.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2);/*!< in: field */
+/*********************************************************************//**
+Tests if dfield data length and content are equal to the given ones.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+ const dfield_t* field, /*!< in: field */
+ ulint len, /*!< in: data length or UNIV_SQL_NULL */
+ const byte* data); /*!< in: data */
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: tuple */
+ ulint n); /*!< in: index of field */
+#else /* UNIV_DEBUG */
+# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n))
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits); /*!< in: info bits */
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/*********************************************************************//**
+Gets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp); /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+/**********************************************************//**
+Creates a data tuple in a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created */
+ ulint n_fields); /*!< in: number of fields */
+
+/**********************************************************//**
+Wrap data fields in a tuple. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return data tuple */
+UNIV_INLINE
+const dtuple_t*
+dtuple_from_fields(
+/*===============*/
+ dtuple_t* tuple, /*!< in: storage for data tuple */
+ const dfield_t* fields, /*!< in: fields */
+ ulint n_fields); /*!< in: number of fields */
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you later want to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields); /*!< in: number of fields */
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap); /*!< in: memory heap
+ where the tuple is created */
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0, -1 if tuple1 is greater than, equal to, or less than tuple2,
+respectively */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+ const dtuple_t* tuple1, /*!< in: tuple 1 */
+ const dtuple_t* tuple2);/*!< in: tuple 2 */
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ const dtuple_t* tuple, /*!< in: the tuple */
+ ulint n_fields,/*!< in: number of complete fields to fold */
+ ulint n_bytes,/*!< in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id)/*!< in: index tree id */
+ __attribute__((pure));
+/*******************************************************************//**
+Sets the types of the fields in a tuple to binary. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n); /*!< in: number of fields to set */
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple); /*!< in: dtuple */
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+ const dfield_t* field); /*!< in: data field */
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+ const dtuple_t* tuple); /*!< in: tuple */
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+ const dtuple_t* tuple); /*!< in: tuple */
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+ const dtuple_t* tuple); /*!< in: tuple */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+ const dfield_t* dfield);/*!< in: dfield */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+ const dfield_t* dfield); /*!< in: dfield */
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+ FILE* f, /*!< in: output stream */
+ const dtuple_t* tuple); /*!< in: tuple */
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of the tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in/out: index entry */
+ ulint* n_ext); /*!< in/out: number of
+ externally stored columns */
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: entry whose data was put to vector */
+ big_rec_t* vector);/*!< in, own: big rec vector; it is
+ freed in this function */
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector); /*!< in, own: big rec vector; it is
+ freed in this function */
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_struct{
+ void* data; /*!< pointer to data */
+ unsigned ext:1; /*!< TRUE=externally stored, FALSE=local */
+ unsigned len:32; /*!< data length; UNIV_SQL_NULL if SQL null */
+ dtype_t type; /*!< type of data */
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_struct {
+ ulint info_bits; /*!< info bits of an index record:
+ the default is 0; this field is used
+ if an index record is built from
+ a data tuple */
+ ulint n_fields; /*!< number of fields in dtuple */
+ ulint n_fields_cmp; /*!< number of fields which should
+ be used in comparison services
+ of rem0cmp.*; the index search
+ is performed by comparing only these
+ fields, others are ignored; the
+ default value in dtuple creation is
+ the same value as n_fields */
+ dfield_t* fields; /*!< fields */
+ UT_LIST_NODE_T(dtuple_t) tuple_list;
+ /*!< data tuples can be linked into a
+ list using this field */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< magic number, used in
+ debug assertions */
+/** Value of dtuple_struct::magic_n */
+# define DATA_TUPLE_MAGIC_N 65478679
+#endif /* UNIV_DEBUG */
+};
+
+/** A slot for a field in a big rec vector */
+typedef struct big_rec_field_struct big_rec_field_t;
+/** A slot for a field in a big rec vector */
+struct big_rec_field_struct {
+ ulint field_no; /*!< field number in record */
+ ulint len; /*!< stored data length, in bytes */
+ const void* data; /*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_struct {
+ mem_heap_t* heap; /*!< memory heap from which
+ allocated */
+ ulint n_fields; /*!< number of stored fields */
+ big_rec_field_t*fields; /*!< stored fields */
+};
+
+#ifndef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#endif
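dtuple_convert_big_rec() above shrinks an index entry by moving long field
values into a big record vector for external storage. The toy sketch below
illustrates only the basic idea: keep moving the longest still-local field
out until the local record size fits under a limit. The real function
applies further rules about which fields may be moved and what stays stored
locally, none of which is modelled here; the size limit and the external
reference size used below are made-up numbers.

/* Toy sketch of the big-record idea: move the longest fields to external
storage until the local record size fits under a limit. */
#include <stdio.h>

#define N_FIELDS        4
#define LOCAL_LIMIT     8000    /* assumed maximum local record size, bytes */
#define EXT_REF_SIZE    20      /* assumed size of an external reference */

int
main(void)
{
        unsigned long len[N_FIELDS] = {8, 3000, 6000, 120};
        int ext[N_FIELDS] = {0, 0, 0, 0};
        unsigned long size = 0;
        int i;

        for (i = 0; i < N_FIELDS; i++) {
                size += len[i];
        }

        while (size > LOCAL_LIMIT) {
                int longest = -1;

                for (i = 0; i < N_FIELDS; i++) {
                        if (!ext[i]
                            && (longest < 0 || len[i] > len[longest])) {
                                longest = i;
                        }
                }

                if (longest < 0) {
                        break;  /* nothing left to move out */
                }

                ext[longest] = 1;
                size = size - len[longest] + EXT_REF_SIZE;
        }

        for (i = 0; i < N_FIELDS; i++) {
                printf("field %d: %lu bytes, %s\n", i, len[i],
                       ext[i] ? "stored externally" : "stored locally");
        }

        printf("local record size = %lu bytes\n", size);
        return(0);
}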
diff --git a/storage/xtradb/include/data0data.ic b/storage/xtradb/include/data0data.ic
new file mode 100644
index 00000000000..da79aa33702
--- /dev/null
+++ b/storage/xtradb/include/data0data.ic
@@ -0,0 +1,612 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.ic
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields. In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+extern byte data_error;
+
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+ const dfield_t* field) /*!< in: SQL data field */
+{
+ ut_ad(field);
+
+ return((dtype_t*) &(field->type));
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+ dfield_t* field, /*!< in: SQL data field */
+ dtype_t* type) /*!< in: pointer to data type struct */
+{
+ ut_ad(field && type);
+
+ field->type = *type;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return((void*) field->data);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets length of field data.
+@return length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+ ut_ad((field->len == UNIV_SQL_NULL)
+ || (field->data != &data_error));
+
+ return(field->len);
+}
+
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+ dfield_t* field, /*!< in: field */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+#ifdef UNIV_VALGRIND_DEBUG
+ if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+
+ field->ext = 0;
+ field->len = len;
+}
+
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+
+ return(field->len == UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Determines if a field is externally stored
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+ const dfield_t* field) /*!< in: field */
+{
+ ut_ad(field);
+
+ return(UNIV_UNLIKELY(field->ext));
+}
+
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+ dfield_t* field) /*!< in/out: field */
+{
+ ut_ad(field);
+
+ field->ext = 1;
+}
+
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+ dfield_t* field, /*!< in: field */
+ const void* data, /*!< in: data */
+ ulint len) /*!< in: length or UNIV_SQL_NULL */
+{
+ ut_ad(field);
+
+#ifdef UNIV_VALGRIND_DEBUG
+ if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+ field->data = (void*) data;
+ field->ext = 0;
+ field->len = len;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+ dfield_t* field) /*!< in/out: field */
+{
+ dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ ut_ad(field1 && field2);
+
+ field1->data = field2->data;
+ field1->len = field2->len;
+ field1->ext = field2->ext;
+}
+
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+ dfield_t* field1, /*!< out: field to copy to */
+ const dfield_t* field2) /*!< in: field to copy from */
+{
+ *field1 = *field2;
+}
+
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+ dfield_t* field, /*!< in/out: data field */
+ mem_heap_t* heap) /*!< in: memory heap where allocated */
+{
+ if (!dfield_is_null(field)) {
+ UNIV_MEM_ASSERT_RW(field->data, field->len);
+ field->data = mem_heap_dup(heap, field->data, field->len);
+ }
+}
+
+/*********************************************************************//**
+Tests if data length and content are equal for two dfields.
+@return TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+ const dfield_t* field1, /*!< in: field */
+ const dfield_t* field2) /*!< in: field */
+{
+ ulint len;
+
+ len = field1->len;
+
+ return(len == field2->len
+ && (len == UNIV_SQL_NULL
+ || !memcmp(field1->data, field2->data, len)));
+}
+
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->info_bits);
+}
+
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint info_bits) /*!< in: info bits */
+{
+ ut_ad(tuple);
+
+ tuple->info_bits = info_bits;
+}
+
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields_cmp);
+}
+
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+ dtuple_t* tuple, /*!< in: tuple */
+ ulint n_fields_cmp) /*!< in: number of fields used in
+ comparisons in rem0cmp.* */
+{
+ ut_ad(tuple);
+ ut_ad(n_fields_cmp <= tuple->n_fields);
+
+ tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ut_ad(tuple);
+
+ return(tuple->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: tuple */
+ ulint n) /*!< in: index of field */
+{
+ ut_ad(tuple);
+ ut_ad(n < tuple->n_fields);
+
+ return((dfield_t*) tuple->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************//**
+Creates a data tuple in a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+ mem_heap_t* heap, /*!< in: memory heap where the tuple
+ is created */
+ ulint n_fields) /*!< in: number of fields */
+{
+ dtuple_t* tuple;
+
+ ut_ad(heap);
+
+ tuple = (dtuple_t*) mem_heap_alloc(heap, sizeof(dtuple_t)
+ + n_fields * sizeof(dfield_t));
+ tuple->info_bits = 0;
+ tuple->n_fields = n_fields;
+ tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) &tuple[1];
+
+#ifdef UNIV_DEBUG
+ tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+ { /* In the debug version, initialize fields to an error value */
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_t* field;
+
+ field = dtuple_get_nth_field(tuple, i);
+
+ dfield_set_len(field, UNIV_SQL_NULL);
+ field->data = &data_error;
+ dfield_get_type(field)->mtype = DATA_ERROR;
+ }
+ }
+
+ UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields);
+#endif
+ return(tuple);
+}
+
+/**********************************************************//**
+Wrap data fields in a tuple. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return data tuple */
+UNIV_INLINE
+const dtuple_t*
+dtuple_from_fields(
+/*===============*/
+ dtuple_t* tuple, /*!< in: storage for data tuple */
+ const dfield_t* fields, /*!< in: fields */
+ ulint n_fields) /*!< in: number of fields */
+{
+ tuple->info_bits = 0;
+ tuple->n_fields = tuple->n_fields_cmp = n_fields;
+ tuple->fields = (dfield_t*) fields;
+ ut_d(tuple->magic_n = DATA_TUPLE_MAGIC_N);
+
+ return(tuple);
+}
+
+/*********************************************************************//**
+Copies a data tuple to another. This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+ const dtuple_t* tuple, /*!< in: tuple to copy from */
+ mem_heap_t* heap) /*!< in: memory heap
+ where the tuple is created */
+{
+ ulint n_fields = dtuple_get_n_fields(tuple);
+ dtuple_t* new_tuple = dtuple_create(heap, n_fields);
+ ulint i;
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_copy(dtuple_get_nth_field(new_tuple, i),
+ dtuple_get_nth_field(tuple, i));
+ }
+
+ return(new_tuple);
+}
+
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted. Neither
+is the space taken by externally stored parts of the field.
+@return sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+ const dtuple_t* tuple, /*!< in: typed data tuple */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint len;
+ ulint i;
+ ulint sum = 0;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ n_fields = tuple->n_fields;
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+ len = dfield_get_len(field);
+
+ if (len == UNIV_SQL_NULL) {
+ len = dtype_get_sql_null_size(dfield_get_type(field),
+ comp);
+ }
+
+ sum += len;
+ }
+
+ return(sum);
+}
+
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+ const dtuple_t* tuple) /*!< in: tuple */
+{
+ ulint n_ext = 0;
+ ulint n_fields = tuple->n_fields;
+ ulint i;
+
+ ut_ad(tuple);
+ ut_ad(dtuple_check_typed(tuple));
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+ for (i = 0; i < n_fields; i++) {
+ n_ext += dtuple_get_nth_field(tuple, i)->ext;
+ }
+
+ return(n_ext);
+}
+
+/*******************************************************************//**
+Sets the types of the fields in a tuple to binary. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+ dtuple_t* tuple, /*!< in: data tuple */
+ ulint n) /*!< in: number of fields to set */
+{
+ dtype_t* dfield_type;
+ ulint i;
+
+ for (i = 0; i < n; i++) {
+ dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+ dtype_set(dfield_type, DATA_BINARY, 0, 0);
+ }
+}
+
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+ const dtuple_t* tuple, /*!< in: the tuple */
+ ulint n_fields,/*!< in: number of complete fields to fold */
+ ulint n_bytes,/*!< in: number of bytes to fold in an
+ incomplete last field */
+ dulint tree_id)/*!< in: index tree id */
+{
+ const dfield_t* field;
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+
+ ut_ad(tuple);
+ ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+ ut_ad(dtuple_check_typed(tuple));
+
+ fold = ut_fold_dulint(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ field = dtuple_get_nth_field(tuple, i);
+
+ data = (const byte*) dfield_get_data(field);
+ len = dfield_get_len(field);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+ byte* data, /*!< in: pointer to a buffer of size len */
+ ulint len) /*!< in: SQL null size in bytes */
+{
+ memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+ const dtuple_t* tuple) /*!< in: dtuple */
+{
+ ulint n;
+ ulint i;
+
+ n = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n; i++) {
+ if (dfield_is_null(dtuple_get_nth_field(tuple, i))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+ big_rec_t* vector) /*!< in, own: big rec vector; it is
+ freed in this function */
+{
+ mem_heap_free(vector->heap);
+}
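dtuple_fold() above hashes the index tree id, then each complete field, and
finally the first n_bytes of the next field. The standalone sketch below
mirrors that control flow with trivial stand-in hash helpers; the real
ut_fold_dulint(), ut_fold_binary() and ut_fold_ulint_pair() from ut0rnd are
not reproduced here, so the numeric result is not the one InnoDB computes.

/* Standalone sketch of the prefix-folding pattern in dtuple_fold().
The fold helpers are simplified stand-ins, not the ut_fold_* functions. */
#include <stdio.h>

#define UNIV_SQL_NULL   ((unsigned long) -1)

struct field {
        const unsigned char*    data;
        unsigned long           len;    /* UNIV_SQL_NULL if SQL NULL */
};

static unsigned long
fold_pair(unsigned long n1, unsigned long n2)   /* stand-in hash combiner */
{
        return(n1 * 1000003UL + n2);
}

static unsigned long
fold_bytes(const unsigned char* data, unsigned long len)
{
        unsigned long fold = 0;
        unsigned long i;

        for (i = 0; i < len; i++) {
                fold = fold_pair(fold, data[i]);
        }

        return(fold);
}

int
main(void)
{
        struct field fields[3] = {
                {(const unsigned char*) "abc", 3},
                {NULL, UNIV_SQL_NULL},
                {(const unsigned char*) "longvalue", 9},
        };
        unsigned long fold = 42;        /* stand-in for the folded tree id */
        unsigned long n_fields = 2;     /* complete fields to fold */
        unsigned long n_bytes = 4;      /* prefix of the next field */
        unsigned long i;
        unsigned long len;

        for (i = 0; i < n_fields; i++) {
                if (fields[i].len != UNIV_SQL_NULL) {
                        fold = fold_pair(fold, fold_bytes(fields[i].data,
                                                          fields[i].len));
                }
        }

        len = fields[i].len;            /* the incomplete last field */
        if (n_bytes > 0 && len != UNIV_SQL_NULL) {
                if (len > n_bytes) {
                        len = n_bytes;
                }
                fold = fold_pair(fold, fold_bytes(fields[i].data, len));
        }

        printf("fold = %lu\n", fold);
        return(0);
}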
diff --git a/storage/xtradb/include/data0type.h b/storage/xtradb/include/data0type.h
new file mode 100644
index 00000000000..a73bed3a9f5
--- /dev/null
+++ b/storage/xtradb/include/data0type.h
@@ -0,0 +1,486 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef data0type_h
+#define data0type_h
+
+#include "univ.i"
+
+extern ulint data_mysql_default_charset_coll;
+#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+typedef struct dtype_struct dtype_t;
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define DATA_VARCHAR 1 /* character varying of the
+ latin1_swedish_ci charset-collation; note
+ that the MySQL format for this, DATA_BINARY,
+ DATA_VARMYSQL, is also affected by whether the
+ 'precise type' contains
+ DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR 2 /* fixed length character of the
+ latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY 3 /* binary string of fixed length */
+#define DATA_BINARY 4 /* binary string */
+#define DATA_BLOB 5 /* binary large object, or a TEXT type;
+ if prtype & DATA_BINARY_TYPE == 0, then this is
+ actually a TEXT column (or a BLOB created
+ with < 4.0.14; since column prefix indexes
+ came only in 4.0.14, the missing flag in BLOBs
+ created before that does not cause any harm) */
+#define DATA_INT 6 /* integer: can be any size 1 - 8 bytes */
+#define DATA_SYS_CHILD 7 /* address of the child page in node pointer */
+#define DATA_SYS 8 /* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT 9
+#define DATA_DOUBLE 10
+#define DATA_DECIMAL 11 /* decimal number stored as an ASCII string */
+#define DATA_VARMYSQL 12 /* any charset varying length char */
+#define DATA_MYSQL 13 /* any charset fixed length char */
+ /* NOTE that 4.1.1 used DATA_MYSQL and
+ DATA_VARMYSQL for all character sets, and the
+ charset-collation for tables created with it
+ can also be latin1_swedish_ci */
+#define DATA_MTYPE_MAX 63 /* dtype_store_for_order_and_null_size()
+ requires the values are <= 63 */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH 4 /* English language character string: this
+ is a relic from pre-MySQL time and only used
+ for InnoDB's own system tables */
+#define DATA_ERROR 111 /* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL
+ type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+ format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define DATA_ROW_ID 0 /* row id: a dulint */
+#define DATA_ROW_ID_LEN 6 /* stored length for row id */
+
+#define DATA_TRX_ID 1 /* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN 6
+
+#define DATA_ROLL_PTR 2 /* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define DATA_N_SYS_COLS 3 /* number of system columns defined above */
+
+#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL 256 /* this is ORed to the precise type when
+ the column is declared as NOT NULL */
+#define DATA_UNSIGNED 512 /* this is ORed to the precise type when
+ we have an unsigned integer type */
+#define DATA_BINARY_TYPE 1024 /* if the data type is a binary character
+ string, this is ORed to the precise type:
+ this only holds for tables created with
+ >= MySQL-4.0.14 */
+/* #define DATA_NONLATIN1 2048 This is a relic from < 4.1.2 and < 5.0.1.
+ In earlier versions this was set for some
+ BLOB columns.
+*/
+#define DATA_LONG_TRUE_VARCHAR 4096 /* this is ORed to the precise data
+ type when the column is true VARCHAR where
+ MySQL uses 2 bytes to store the data len;
+ for shorter VARCHARs MySQL uses only 1 byte */
+/*-------------------------------------------*/
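As an illustration of the layout documented above, the following minimal standalone sketch (not part of this patch) composes a precise type for a hypothetical NOT NULL true VARCHAR column with charset-collation 33 (utf8_general_ci) and decomposes it again; the constants repeat the values defined in this header, and the shift by 16 matches dtype_get_charset_coll() in data0type.ic.

#include <stdio.h>

/* Values repeated from data0type.h so the sketch is self-contained. */
#define DATA_MYSQL_TYPE_MASK	255
#define DATA_MYSQL_TRUE_VARCHAR	15
#define DATA_NOT_NULL		256
#define DATA_LONG_TRUE_VARCHAR	4096

int main(void)
{
	unsigned long	charset_coll = 33;	/* utf8_general_ci */
	unsigned long	prtype = DATA_MYSQL_TRUE_VARCHAR
		| DATA_NOT_NULL
		| DATA_LONG_TRUE_VARCHAR
		| (charset_coll << 16);

	/* Least significant byte: the MySQL type code. */
	printf("mysql type code = %lu\n", prtype & DATA_MYSQL_TYPE_MASK);
	/* Second byte: DATA_NOT_NULL / DATA_UNSIGNED / DATA_BINARY_TYPE flags. */
	printf("NOT NULL        = %d\n", (prtype & DATA_NOT_NULL) != 0);
	/* Third byte: the charset-collation code. */
	printf("charset-coll    = %lu\n", (prtype >> 16) & 0xFF);

	return(0);
}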
+
+/* This many bytes we need to store the type information affecting the
+alphabetical order of a single field and to decide the storage size of an
+SQL NULL value */
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE 4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type); /*!< in: type struct */
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of a
+ multi-byte character */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multi-byte character */
+ ulint prefix_len, /*!< in: length of the requested
+ prefix, in characters, multiplied by
+ dtype_get_mbmaxlen(dtype) */
+ ulint data_len, /*!< in: length of str (in bytes) */
+ const char* str); /*!< in: the string whose prefix
+ length is being determined */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type.
+@return TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+ ulint mtype); /*!< in: InnoDB main data type code: DATA_CHAR, ... */
+/*********************************************************************//**
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE.
+@return TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype);/*!< in: precise type */
+/*********************************************************************//**
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE.
+@return TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+ ulint mtype, /*!< in: main data type */
+ ulint prtype);/*!< in: precise type */
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision of type */
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2); /*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type); /*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type); /*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ ulint* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ ulint* mbmaxlen); /*!< out: maximum length of a
+ multi-byte character */
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+ ulint old_prtype, /*!< in: the MySQL type code and the flags
+ DATA_BINARY_TYPE etc. */
+ ulint charset_coll); /*!< in: MySQL charset-collation code */
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype);/*!< in: precise data type */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type); /*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type); /*!< in: type */
+/*********************************************************************//**
+Gets the padding character code for the type.
+@return padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype); /*!< in: precise type */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen, /*!< in: maximum length of a multibyte char */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen); /*!< in: maximum length of a multibyte char */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len); /*!< in: length */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len);/*!< in: prefix length to
+ replace type->len, or 0 */
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf); /*!< in: buffer for stored type order info */
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+ const dtype_t* type); /*!< in: type struct to validate */
+/*********************************************************************//**
+Prints a data type structure. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+ const dtype_t* type); /*!< in: type */
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_struct{
+ unsigned mtype:8; /*!< main data type */
+ unsigned prtype:24; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+#ifndef UNIV_HOTBACKUP
+ unsigned mbminlen:2; /*!< minimum length of a
+ character, in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a
+ character, in bytes */
+#endif /* !UNIV_HOTBACKUP */
+};
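The bit-field widths above fix the value ranges that other code depends on: mtype must stay within DATA_MTYPE_MAX (63), mbminlen within 0..3 and mbmaxlen within 0..7, as asserted in dtype_get_mblen() in data0type.ic. A minimal sketch (not part of this patch) using the same widths, purely to illustrate the packing; the total struct size is compiler-dependent.

#include <stdio.h>

/* Bit-field widths copied from dtype_struct above; the field meanings are
simplified and the struct is for illustration only. */
struct dtype_like {
	unsigned	mtype:8;	/* main type; DATA_MTYPE_MAX limits it to 63 */
	unsigned	prtype:24;	/* MySQL type, flags, charset-collation */
	unsigned	len:16;		/* length, 0..65535 */
	unsigned	mbminlen:2;	/* 0..3 */
	unsigned	mbmaxlen:3;	/* 0..7 */
};

int main(void)
{
	struct dtype_like	t;

	t.mtype = 63;				/* DATA_MTYPE_MAX */
	t.prtype = 15 | 256 | (33UL << 16);	/* true VARCHAR, NOT NULL,
						charset-collation 33 */
	t.len = 30;
	t.mbminlen = 1;
	t.mbmaxlen = 3;

	printf("sizeof(struct dtype_like) = %u\n",
	       (unsigned) sizeof(struct dtype_like));
	printf("charset-coll              = %lu\n",
	       ((unsigned long) t.prtype >> 16) & 0xFF);

	return(0);
}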
+
+#ifndef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic
new file mode 100644
index 00000000000..2bf67a941bd
--- /dev/null
+++ b/storage/xtradb/include/data0type.ic
@@ -0,0 +1,603 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+ ulint prtype) /*!< in: precise data type */
+{
+ return((prtype >> 16) & 0xFFUL);
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8. This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+ ulint prtype) /*!< in: precise data type */
+{
+ /* These codes have been copied from strings/ctype-extra.c
+ and strings/ctype-utf8.c. */
+ switch (dtype_get_charset_coll(prtype)) {
+ case 11: /* ascii_general_ci */
+ case 65: /* ascii_bin */
+ case 33: /* utf8_general_ci */
+ case 83: /* utf8_bin */
+ case 254: /* utf8_general_cs */
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+ const dtype_t* type) /*!< in: type struct */
+{
+ return(type->prtype & 0xFFUL);
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type (and collation) */
+ ulint* mbminlen, /*!< out: minimum length of a
+ multi-byte character */
+ ulint* mbmaxlen) /*!< out: maximum length of a
+ multi-byte character */
+{
+ if (dtype_is_string_type(mtype)) {
+ innobase_get_cset_width(dtype_get_charset_coll(prtype),
+ mbminlen, mbmaxlen);
+ ut_ad(*mbminlen <= *mbmaxlen);
+ ut_ad(*mbminlen <= 2); /* mbminlen in dtype_t is 0..3 */
+ ut_ad(*mbmaxlen < 1 << 3); /* mbmaxlen in dtype_t is 0..7 */
+ } else {
+ *mbminlen = *mbmaxlen = 0;
+ }
+}
+
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+ dtype_t* type) /*!< in/out: type */
+{
+ ulint mbminlen;
+ ulint mbmaxlen;
+
+ dtype_get_mblen(type->mtype, type->prtype, &mbminlen, &mbmaxlen);
+ type->mbminlen = mbminlen;
+ type->mbmaxlen = mbmaxlen;
+
+ ut_ad(dtype_validate(type));
+}
+#else /* !UNIV_HOTBACKUP */
+# define dtype_set_mblen(type) (void) 0
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+ dtype_t* type, /*!< in: type struct to init */
+ ulint mtype, /*!< in: main data type */
+ ulint prtype, /*!< in: precise type */
+ ulint len) /*!< in: precision of type */
+{
+ ut_ad(type);
+ ut_ad(mtype <= DATA_MTYPE_MAX);
+
+ type->mtype = mtype;
+ type->prtype = prtype;
+ type->len = len;
+
+ dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+ dtype_t* type1, /*!< in: type struct to copy to */
+ const dtype_t* type2) /*!< in: type struct to copy from */
+{
+ *type1 = *type2;
+
+ ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(type);
+
+ return(type->len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ ut_ad(type);
+ return(type->mbminlen);
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+ const dtype_t* type) /*!< in: type */
+{
+ ut_ad(type);
+ return(type->mbmaxlen);
+}
+
+/*********************************************************************//**
+Gets the padding character code for a type.
+@return padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype) /*!< in: precise type */
+{
+ switch (mtype) {
+ case DATA_FIXBINARY:
+ case DATA_BINARY:
+ if (UNIV_UNLIKELY(dtype_get_charset_coll(prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+ /* Starting from 5.0.18, do not pad
+ VARBINARY or BINARY columns. */
+ return(ULINT_UNDEFINED);
+ }
+ /* Fall through */
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+ /* Space is the padding character for all char and binary
+ strings, and starting from 5.0.3, also for TEXT strings. */
+
+ return(0x20);
+ case DATA_BLOB:
+ if (!(prtype & DATA_BINARY_TYPE)) {
+ return(0x20);
+ }
+ /* Fall through */
+ default:
+ /* No padding specified */
+ return(ULINT_UNDEFINED);
+ }
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+ byte* buf, /*!< in: buffer for
+ DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ bytes where we store the info */
+ const dtype_t* type, /*!< in: type struct */
+ ulint prefix_len)/*!< in: prefix length to
+ replace type->len, or 0 */
+{
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+ ulint len;
+
+ ut_ad(type);
+ ut_ad(type->mtype >= DATA_VARCHAR);
+ ut_ad(type->mtype <= DATA_MYSQL);
+
+ buf[0] = (byte)(type->mtype & 0xFFUL);
+
+ if (type->prtype & DATA_BINARY_TYPE) {
+ buf[0] = buf[0] | 128;
+ }
+
+ /* In versions < 4.1.2 we had: if (type->prtype & DATA_NONLATIN1) {
+ buf[0] = buf[0] | 64;
+ }
+ */
+
+ buf[1] = (byte)(type->prtype & 0xFFUL);
+
+ len = prefix_len ? prefix_len : type->len;
+
+ mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+ ut_ad(dtype_get_charset_coll(type->prtype) < 256);
+ mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+ if (type->prtype & DATA_NOT_NULL) {
+ buf[4] |= 128;
+ }
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE
+# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype = type->prtype | DATA_BINARY_TYPE;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ type->prtype = dtype_form_prtype(type->prtype,
+ data_mysql_default_charset_coll);
+ dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+ dtype_t* type, /*!< in: type struct */
+ const byte* buf) /*!< in: buffer for stored type order info */
+{
+ ulint charset_coll;
+
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+ type->mtype = buf[0] & 63;
+ type->prtype = buf[1];
+
+ if (buf[0] & 128) {
+ type->prtype |= DATA_BINARY_TYPE;
+ }
+
+ if (buf[4] & 128) {
+ type->prtype |= DATA_NOT_NULL;
+ }
+
+ type->len = mach_read_from_2(buf + 2);
+
+ charset_coll = mach_read_from_2(buf + 4) & 0x7fff;
+
+ if (dtype_is_string_type(type->mtype)) {
+ ut_a(charset_coll < 256);
+
+ if (charset_coll == 0) {
+ /* This insert buffer record was inserted with MySQL
+ version < 4.1.2, and the charset-collation code was not
+ explicitly stored to dtype->prtype at that time. It
+ must be the default charset-collation of this MySQL
+ installation. */
+
+ charset_coll = data_mysql_default_charset_coll;
+ }
+
+ type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+ }
+ dtype_set_mblen(type);
+}
+#endif /* !UNIV_HOTBACKUP */
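To make the 6-byte >= 4.1.x buffer concrete, here is a standalone sketch (not part of this patch) that encodes a hypothetical NOT NULL CHAR column of 30 bytes with charset-collation 33, following dtype_new_store_for_order_and_null_size() above. The numeric value used for DATA_CHAR and the MySQL type code 254 are illustrative assumptions, and the helper write_2() assumes the big-endian byte order used by the InnoDB mach_write_to_2() routine.

#include <stdio.h>

typedef unsigned char byte;

/* Assumed/illustrative main type value; the flag values match data0type.h. */
#define DATA_CHAR		2	/* assumed InnoDB main type code */
#define DATA_NOT_NULL		256
#define DATA_BINARY_TYPE	1024

/* 2-byte big-endian store, mirroring mach_write_to_2() (assumption). */
static void
write_2(byte* b, unsigned long n)
{
	b[0] = (byte)(n >> 8);
	b[1] = (byte)(n & 0xFF);
}

int main(void)
{
	byte		buf[6];
	unsigned long	mtype = DATA_CHAR;
	unsigned long	prtype = 254		/* MySQL type code (illustrative) */
		| DATA_NOT_NULL
		| (33UL << 16);			/* charset-collation 33 */
	unsigned long	len = 30;
	int		i;

	buf[0] = (byte)(mtype & 0xFF);
	if (prtype & DATA_BINARY_TYPE) {
		buf[0] |= 128;			/* binary flag in the top bit */
	}
	buf[1] = (byte)(prtype & 0xFF);		/* low byte of the precise type */
	write_2(buf + 2, len & 0xFFFF);		/* length (or prefix length) */
	write_2(buf + 4, (prtype >> 16) & 0xFF);/* charset-collation code */
	if (prtype & DATA_NOT_NULL) {
		buf[4] |= 128;			/* NOT NULL flag in the top bit */
	}

	for (i = 0; i < 6; i++) {
		printf("buf[%d] = 0x%02X\n", i, buf[i]);
	}

	return(0);
}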
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen, /*!< in: maximum length of a multibyte char */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return(len);
+ case DATA_MYSQL:
+#ifndef UNIV_HOTBACKUP
+ if (prtype & DATA_BINARY_TYPE) {
+ return(len);
+ } else if (!comp) {
+ return(len);
+ } else {
+ /* We play it safe here and ask MySQL for
+ mbminlen and mbmaxlen. Although mbminlen
+ and mbmaxlen are initialized if and only if
+ prtype is set (in one of the three functions
+ in this file), it could be that none of these
+ functions has been called. */
+
+ ulint i_mbminlen, i_mbmaxlen;
+
+ innobase_get_cset_width(
+ dtype_get_charset_coll(prtype),
+ &i_mbminlen, &i_mbmaxlen);
+
+ if (UNIV_UNLIKELY(mbminlen != i_mbminlen)
+ || UNIV_UNLIKELY(mbmaxlen != i_mbmaxlen)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: "
+ "mbminlen=%lu, "
+ "mbmaxlen=%lu, "
+ "type->mbminlen=%lu, "
+ "type->mbmaxlen=%lu\n",
+ (ulong) i_mbminlen,
+ (ulong) i_mbmaxlen,
+ (ulong) mbminlen,
+ (ulong) mbmaxlen);
+ }
+ if (mbminlen == mbmaxlen) {
+ return(len);
+ }
+ }
+#else /* !UNIV_HOTBACKUP */
+ return(len);
+#endif /* !UNIV_HOTBACKUP */
+ /* fall through for variable-length charsets */
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint len, /*!< in: length */
+ ulint mbminlen, /*!< in: minimum length of a multibyte char */
+ ulint mbmaxlen) /*!< in: maximum length of a multibyte char */
+{
+ switch (mtype) {
+ case DATA_SYS:
+#ifdef UNIV_DEBUG
+ switch (prtype & DATA_MYSQL_TYPE_MASK) {
+ case DATA_ROW_ID:
+ ut_ad(len == DATA_ROW_ID_LEN);
+ break;
+ case DATA_TRX_ID:
+ ut_ad(len == DATA_TRX_ID_LEN);
+ break;
+ case DATA_ROLL_PTR:
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ break;
+ default:
+ ut_ad(0);
+ return(0);
+ }
+#endif /* UNIV_DEBUG */
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ return(len);
+ case DATA_MYSQL:
+ if ((prtype & DATA_BINARY_TYPE) || mbminlen == mbmaxlen) {
+ return(len);
+ }
+ /* this is a variable-length character set */
+ ut_a(mbminlen > 0);
+ ut_a(mbmaxlen > mbminlen);
+ ut_a(len % mbmaxlen == 0);
+ return(len * mbminlen / mbmaxlen);
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ case DATA_BLOB:
+ return(0);
+ default:
+ ut_error;
+ }
+
+ return(0);
+}
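A worked example of the variable-length branch above (not part of this patch): a MySQL CHAR(10) column in a character set with mbminlen = 1 and mbmaxlen = 3 has len = 30 bytes, so the minimum size is 30 * 1 / 3 = 10 bytes.

#include <stdio.h>

int main(void)
{
	unsigned long	len = 30;	/* CHAR(10) in a 3-byte-max charset */
	unsigned long	mbminlen = 1;
	unsigned long	mbmaxlen = 3;

	/* Same arithmetic as the DATA_MYSQL branch above. */
	printf("min size = %lu\n", len * mbminlen / mbmaxlen);	/* 10 */

	return(0);
}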
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+ ulint mtype, /*!< in: main type */
+ ulint len) /*!< in: length */
+{
+ switch (mtype) {
+ case DATA_SYS:
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_INT:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_MYSQL:
+ case DATA_VARCHAR:
+ case DATA_BINARY:
+ case DATA_DECIMAL:
+ case DATA_VARMYSQL:
+ return(len);
+ case DATA_BLOB:
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ULINT_MAX);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+ const dtype_t* type, /*!< in: type */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+#ifndef UNIV_HOTBACKUP
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ type->mbminlen, type->mbmaxlen, comp));
+#else /* !UNIV_HOTBACKUP */
+ return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+ 0, 0, 0));
+#endif /* !UNIV_HOTBACKUP */
+}
diff --git a/storage/xtradb/include/data0types.h b/storage/xtradb/include/data0types.h
new file mode 100644
index 00000000000..04e835bc401
--- /dev/null
+++ b/storage/xtradb/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+typedef struct dfield_struct dfield_t;
+
+/* SQL data tuple struct */
+typedef struct dtuple_struct dtuple_t;
+
+#endif
+
diff --git a/storage/xtradb/include/db0err.h b/storage/xtradb/include/db0err.h
new file mode 100644
index 00000000000..c7fa6d2a444
--- /dev/null
+++ b/storage/xtradb/include/db0err.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+enum db_err {
+ DB_SUCCESS_LOCKED_REC = 9, /*!< like DB_SUCCESS, but a new
+ explicit record lock was created */
+ DB_SUCCESS = 10,
+
+ /* The following are error codes */
+ DB_ERROR,
+ DB_INTERRUPTED,
+ DB_OUT_OF_MEMORY,
+ DB_OUT_OF_FILE_SPACE,
+ DB_LOCK_WAIT,
+ DB_DEADLOCK,
+ DB_ROLLBACK,
+ DB_DUPLICATE_KEY,
+ DB_QUE_THR_SUSPENDED,
+ DB_MISSING_HISTORY, /* required history data has been
+ deleted due to lack of space in
+ rollback segment */
+ DB_CLUSTER_NOT_FOUND = 30,
+ DB_TABLE_NOT_FOUND,
+ DB_MUST_GET_MORE_FILE_SPACE, /* the database has to be stopped
+ and restarted with more file space */
+ DB_TABLE_IS_BEING_USED,
+ DB_TOO_BIG_RECORD, /* a record in an index would not fit
+ on a compressed page, or it would
+ become bigger than 1/2 free space in
+ an uncompressed page frame */
+ DB_LOCK_WAIT_TIMEOUT, /* lock wait lasted too long */
+ DB_NO_REFERENCED_ROW, /* referenced key value not found
+ for a foreign key in an insert or
+ update of a row */
+ DB_ROW_IS_REFERENCED, /* cannot delete or update a row
+ because it contains a key value
+ which is referenced */
+ DB_CANNOT_ADD_CONSTRAINT, /* adding a foreign key constraint
+ to a table failed */
+ DB_CORRUPTION, /* data structure corruption noticed */
+ DB_COL_APPEARS_TWICE_IN_INDEX, /* InnoDB cannot handle an index
+ where the same column appears twice */
+ DB_CANNOT_DROP_CONSTRAINT, /* dropping a foreign key constraint
+ from a table failed */
+ DB_NO_SAVEPOINT, /* no savepoint exists with the given
+ name */
+ DB_TABLESPACE_ALREADY_EXISTS, /* we cannot create a new single-table
+ tablespace because a file of the same
+ name already exists */
+ DB_TABLESPACE_DELETED, /* tablespace does not exist or is
+ being dropped right now */
+ DB_LOCK_TABLE_FULL, /* lock structs have exhausted the
+ buffer pool (for big transactions,
+ InnoDB stores the lock structs in the
+ buffer pool) */
+ DB_FOREIGN_DUPLICATE_KEY, /* foreign key constraints
+ activated by the operation would
+ lead to a duplicate key in some
+ table */
+ DB_TOO_MANY_CONCURRENT_TRXS, /* when InnoDB runs out of the
+ preconfigured undo slots, this can
+ only happen when there are too many
+ concurrent transactions */
+ DB_UNSUPPORTED, /* when InnoDB sees any artefact or
+ a feature that it can't recognize or
+ work with, e.g. FT indexes created by
+ a later version of the engine. */
+
+ DB_PRIMARY_KEY_IS_NULL, /* a column in the PRIMARY KEY
+ was found to be NULL */
+ DB_FOREIGN_EXCEED_MAX_CASCADE, /* Foreign key constraint related
+ cascading delete/update exceeds
+ maximum allowed depth */
+
+ /* The following are partial failure codes */
+ DB_FAIL = 1000,
+ DB_OVERFLOW,
+ DB_UNDERFLOW,
+ DB_STRONG_FAIL,
+ DB_ZIP_OVERFLOW,
+ DB_RECORD_NOT_FOUND = 1500,
+ DB_END_OF_INDEX
+};
+
+#endif
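Callers typically branch on individual db_err values; a hypothetical caller-side sketch (not part of this patch), with the helper name invented for illustration:

#include <stdio.h>

#include "db0err.h"	/* the enum defined above */

/* Map a few db_err values to human-readable strings (illustrative only). */
static const char*
db_err_to_string(enum db_err err)
{
	switch (err) {
	case DB_SUCCESS:
		return("success");
	case DB_SUCCESS_LOCKED_REC:
		return("success; a new explicit record lock was created");
	case DB_DUPLICATE_KEY:
		return("duplicate key");
	case DB_LOCK_WAIT_TIMEOUT:
		return("lock wait timed out");
	case DB_RECORD_NOT_FOUND:
		return("record not found");
	default:
		return("other error");
	}
}

int main(void)
{
	printf("%d: %s\n", DB_DEADLOCK, db_err_to_string(DB_DEADLOCK));
	printf("%d: %s\n", DB_SUCCESS, db_err_to_string(DB_SUCCESS));

	return(0);
}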
diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h
new file mode 100644
index 00000000000..9239e031a7f
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.h
@@ -0,0 +1,161 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "fsp0fsp.h"
+#include "dict0dict.h"
+
+typedef byte dict_hdr_t;
+
+/**********************************************************************//**
+Gets a pointer to the dictionary header and x-latches its page.
+@return pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+UNIV_INTERN
+void
+dict_hdr_get_new_id(
+/*================*/
+ dulint* table_id, /*!< out: table id (not assigned if NULL) */
+ dulint* index_id, /*!< out: index id (not assigned if NULL) */
+ ulint* space_id); /*!< out: space id (not assigned if NULL) */
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void);
+/*=========================*/
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+ byte* field); /*!< in: record field */
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ dulint row_id);/*!< in: row id */
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created. */
+UNIV_INTERN
+void
+dict_boot(void);
+/*===========*/
+/*****************************************************************//**
+Creates and initializes the data dictionary at the database creation. */
+UNIV_INTERN
+void
+dict_create(void);
+/*=============*/
+
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID ut_dulint_create(0, 1)
+#define DICT_COLUMNS_ID ut_dulint_create(0, 2)
+#define DICT_INDEXES_ID ut_dulint_create(0, 3)
+#define DICT_FIELDS_ID ut_dulint_create(0, 4)
+#define DICT_STATS_ID ut_dulint_create(0, 6)
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID ut_dulint_create(0, 5)
+
+#define DICT_HDR_FIRST_ID 10 /* the ids for tables etc. start
+ from this number, except for basic
+ system tables and their above defined
+ indexes; ibuf tables and indexes are
+ assigned the id DICT_IBUF_ID_MIN
+ plus the space id */
+#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFFUL, 0)
+
+/* The offset of the dictionary header on the page */
+#define DICT_HDR FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID 0 /* The latest assigned row id */
+#define DICT_HDR_TABLE_ID 8 /* The latest assigned table id */
+#define DICT_HDR_INDEX_ID 16 /* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID 24 /* The latest assigned space id, or 0*/
+#define DICT_HDR_MIX_ID_LOW 28 /* Obsolete, always DICT_HDR_FIRST_ID */
+#define DICT_HDR_TABLES 32 /* Root of the table index tree */
+#define DICT_HDR_TABLE_IDS 36 /* Root of the table id index tree */
+#define DICT_HDR_COLUMNS 40 /* Root of the column index tree */
+#define DICT_HDR_INDEXES 44 /* Root of the index index tree */
+#define DICT_HDR_FIELDS 48 /* Root of the index field
+ index tree */
+#define DICT_HDR_STATS 52 /* Root of the stats tree */
+
+#define DICT_HDR_FSEG_HEADER 56 /* Segment header for the tablespace
+ segment into which the dictionary
+ header is created */
+
+#define DICT_HDR_XTRADB_MARK 256 /* Flag identifying the XtraDB expansion of the dictionary header */
+/*-------------------------------------------------------------*/
+
+/* The field numbers of some fields in the clustered index records of the
+SYS_INDEXES table */
+#define DICT_SYS_INDEXES_PAGE_NO_FIELD 8
+#define DICT_SYS_INDEXES_SPACE_NO_FIELD 7
+#define DICT_SYS_INDEXES_TYPE_FIELD 6
+#define DICT_SYS_INDEXES_NAME_FIELD 4
+
+#define DICT_SYS_STATS_DIFF_VALS_FIELD 4
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN 256
+
+#define DICT_HDR_XTRADB_FLAG ut_dulint_create(0x58545241UL,0x44425F31UL) /* "XTRADB_1" */
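The two 32-bit halves of DICT_HDR_XTRADB_FLAG are simply the ASCII bytes of the string "XTRADB_1"; a quick standalone check (not part of this patch):

#include <stdio.h>

int main(void)
{
	unsigned long	high = 0x58545241UL;	/* "XTRA" */
	unsigned long	low = 0x44425F31UL;	/* "DB_1" */
	int		shift;

	for (shift = 24; shift >= 0; shift -= 8) {
		putchar((int)((high >> shift) & 0xFF));
	}
	for (shift = 24; shift >= 0; shift -= 8) {
		putchar((int)((low >> shift) & 0xFF));
	}
	putchar('\n');		/* prints XTRADB_1 */

	return(0);
}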
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0boot.ic b/storage/xtradb/include/dict0boot.ic
new file mode 100644
index 00000000000..d5f372e38c4
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.ic
@@ -0,0 +1,93 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.ic
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+
+
+/**********************************************************************//**
+Returns a new row id.
+@return the new id */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void)
+/*=========================*/
+{
+ dulint id;
+
+ mutex_enter(&(dict_sys->mutex));
+
+ id = dict_sys->row_id;
+
+ if (0 == (ut_dulint_get_low(id) % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+ dict_hdr_flush_row_id();
+ }
+
+ UT_DULINT_INC(dict_sys->row_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ return(id);
+}
+
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return row id */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+ byte* field) /*!< in: record field */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+ return(mach_read_from_6(field));
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+ byte* field, /*!< in: record field */
+ dulint row_id) /*!< in: row id */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+ mach_write_to_6(field, row_id);
+}
+
+
diff --git a/storage/xtradb/include/dict0crea.h b/storage/xtradb/include/dict0crea.h
new file mode 100644
index 00000000000..0249091a195
--- /dev/null
+++ b/storage/xtradb/include/dict0crea.h
@@ -0,0 +1,215 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+ dict_table_t* table, /*!< in: table to create, built as a memory data
+ structure */
+ mem_heap_t* heap); /*!< in: heap where created */
+/*********************************************************************//**
+Creates an index create graph.
+@return own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+ dict_index_t* index, /*!< in: index to create, built as a memory data
+ structure */
+ mem_heap_t* heap); /*!< in: heap where created */
+/*********************************************************************//**
+*/
+UNIV_INTERN
+ind_node_t*
+ind_insert_stats_graph_create(
+/*==========================*/
+ dict_index_t* index,
+ mem_heap_t* heap);
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+*/
+UNIV_INTERN
+que_thr_t*
+dict_insert_stats_step(
+/*===================*/
+ que_thr_t* thr);
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+@return new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+ dict_table_t* table, /*!< in: the table the index belongs to */
+ ulint space, /*!< in: 0=truncate,
+ nonzero=create the index tree in the
+ given tablespace */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to
+ record in the clustered index of
+ SYS_INDEXES table. The cursor may be
+ repositioned in this call. */
+ mtr_t* mtr); /*!< in: mtr having the latch
+ on the record page. The mtr may be
+ committed and restarted in this call. */
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+ rec_t* rec, /*!< in/out: record in the clustered index
+ of SYS_INDEXES table */
+ mtr_t* mtr); /*!< in: mtr having the latch on the record page */
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at database creation or database start if they are not found or are
+not of the right form.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_create_or_check_foreign_constraint_tables(void);
+/*================================================*/
+/********************************************************************//**
+Adds foreign key definitions to data dictionary tables in the database. We
+look at table->foreign_list, and also generate names to constraints that were
+not named by the user. A generated constraint has a name of the format
+databasename/tablename_ibfk_NUMBER, where the numbers start from 1 and are
+given locally for this table; that is, the number is not global, as it used
+to be in the old (< 4.0.18) format constraints.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+ ulint start_id,/*!< in: if we are actually doing ALTER TABLE
+ ADD CONSTRAINT, we want to generate constraint
+ numbers which are bigger than in the table so
+ far; we number the constraints from
+ start_id + 1 up; start_id should be set to 0 if
+ we are creating a new table, or if the table
+ so far has no constraints for which the name
+ was generated here */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx); /*!< in: transaction */
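So a generated constraint on a hypothetical table test/child would be named test/child_ibfk_1, test/child_ibfk_2, and so on. A small sketch of the naming shape only (not part of this patch; InnoDB builds the name internally):

#include <stdio.h>

/* Print a name of the form databasename/tablename_ibfk_NUMBER
(illustrative helper, not an InnoDB function). */
static void
print_generated_fk_name(const char* table_name, unsigned long number)
{
	/* table_name is in the form dbname '/' tablename */
	printf("%s_ibfk_%lu\n", table_name, number);
}

int main(void)
{
	print_generated_fk_name("test/child", 1);	/* test/child_ibfk_1 */
	print_generated_fk_name("test/child", 2);	/* test/child_ibfk_2 */

	return(0);
}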
+
+/* Table create node structure */
+
+struct tab_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_TABLE_CREATE */
+ dict_table_t* table; /*!< table to create, built as a memory data
+ structure with dict_mem_... functions */
+ ins_node_t* tab_def; /* child node which does the insert of
+ the table definition; the row to be inserted
+ is built by the parent node */
+ ins_node_t* col_def; /* child node which does the inserts of
+ the column definitions; the row to be inserted
+ is built by the parent node */
+ commit_node_t* commit_node;
+ /* child node which performs a commit after
+ a successful table creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint col_no; /*!< next column definition to insert */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage */
+};
+
+/* Table create node states */
+#define TABLE_BUILD_TABLE_DEF 1
+#define TABLE_BUILD_COL_DEF 2
+#define TABLE_COMMIT_WORK 3
+#define TABLE_ADD_TO_CACHE 4
+#define TABLE_COMPLETED 5
+
+/* Index create node struct */
+
+struct ind_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_INDEX_CREATE */
+ dict_index_t* index; /*!< index to create, built as a memory data
+ structure with dict_mem_... functions */
+ ins_node_t* ind_def; /* child node which does the insert of
+ the index definition; the row to be inserted
+ is built by the parent node */
+ ins_node_t* field_def; /* child node which does the inserts of
+ the field definitions; the row to be inserted
+ is built by the parent node */
+ ins_node_t* stats_def;
+ commit_node_t* commit_node;
+ /* child node which performs a commit after
+ a successful index creation */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ ulint page_no;/* root page number of the index */
+ dict_table_t* table; /*!< table which owns the index */
+ dtuple_t* ind_row;/* index definition row built */
+ ulint field_no;/* next field definition to insert */
+ ulint stats_no;
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage */
+};
+
+/* Index create node states */
+#define INDEX_BUILD_INDEX_DEF 1
+#define INDEX_BUILD_FIELD_DEF 2
+#define INDEX_CREATE_INDEX_TREE 3
+#define INDEX_COMMIT_WORK 4
+#define INDEX_ADD_TO_CACHE 5
+#define INDEX_BUILD_STATS_COLS 6
+
+#ifndef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0crea.ic b/storage/xtradb/include/dict0crea.ic
new file mode 100644
index 00000000000..c5365ce7489
--- /dev/null
+++ b/storage/xtradb/include/dict0crea.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.ic
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
new file mode 100644
index 00000000000..d18b3ecb1b0
--- /dev/null
+++ b/storage/xtradb/include/dict0dict.h
@@ -0,0 +1,1217 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "hash0hash.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+ char* a); /*!< in/out: string to put in lower case */
+/********************************************************************//**
+Get the database name length in a table name.
+@return database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+ const char* name); /*!< in: table name in the form
+ dbname '/' tablename */
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return table name */
+
+const char*
+dict_remove_db_name(
+/*================*/
+ const char* name); /*!< in: table name in the form
+ dbname '/' tablename */
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get_on_id(
+/*=================*/
+ dulint table_id, /*!< in: table id */
+ trx_t* trx); /*!< in: transaction handle */
+/********************************************************************//**
+Decrements the count of open MySQL handles to a table. */
+UNIV_INTERN
+void
+dict_table_decrement_handle_count(
+/*==============================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/**********************************************************************//**
+Inits the data dictionary module. */
+UNIV_INTERN
+void
+dict_init(void);
+/*===========*/
+/********************************************************************//**
+Gets the space id of every table of the data dictionary and makes a linear
+list and a hash table of them to the data dictionary cache. This function
+can be called at database startup if we did not need to do a crash recovery.
+In crash recovery we must scan the space id's from the .ibd files in MySQL
+database directories. */
+UNIV_INTERN
+void
+dict_load_space_id_list(void);
+/*=========================*/
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type); /*!< out: data type */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type); /*!< in: data type */
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col); /*!< in: column */
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col); /*!< in: column */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp); /*!< in: nonzero=ROW_FORMAT=COMPACT */
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col); /*!< in: column */
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index); /*!< in: clustered index */
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+ const char* name); /*!< in: column name */
+/********************************************************************//**
+Acquire the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+ dict_table_t* table); /*!< in/out: table */
+/********************************************************************//**
+Unconditionally set the autoinc counter. */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value); /*!< in: next value to assign to a row */
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+@return value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Updates the autoinc counter if the value supplied is greater than the
+current value. */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+ dict_table_t* table, /*!< in/out: table */
+ ib_uint64_t value); /*!< in: value which was assigned to a row */
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+ dict_table_t* table); /*!< in/out: table */
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table */
+ mem_heap_t* heap); /*!< in: temporary heap */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap); /*!< in: temporary heap */
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+ dict_table_t* table); /*!< in, own: table */
+/**********************************************************************//**
+Renames a table object.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+dict_table_rename_in_cache(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ const char* new_name, /*!< in: new name */
+ ibool rename_also_foreigns);/*!< in: in ALTER TABLE we want
+ to preserve the original table name
+ in constraints which reference it */
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in, own: index */
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+ dict_table_t* table, /*!< in/out: table object already in cache */
+ dulint new_id);/*!< in: new id to set */
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if there already is an object with the same identifier in the cache.
+At least one of foreign table or referenced table must already be in
+the dictionary cache!
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_foreign_add_to_cache(
+/*======================*/
+ dict_foreign_t* foreign, /*!< in, own: foreign key constraint */
+ ibool check_charsets);/*!< in: TRUE=check charset
+ compatibility */
+/*********************************************************************//**
+Check if the index is referenced by a foreign key; if so, return the
+matching instance, otherwise NULL.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index); /*!< in: InnoDB index */
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+ const dict_table_t* table); /*!< in: InnoDB table */
+/**********************************************************************//**
+Replace the index in the foreign key list that matches this index's
+definition with an equivalent index. */
+UNIV_INTERN
+void
+dict_table_replace_index_in_foreign_list(
+/*=====================================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in: index to be replaced */
+/*********************************************************************//**
+Checks if an index is defined for a foreign key constraint. An index is part
+of a foreign key constraint if it is referenced by a foreign key or it is
+a foreign key index.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+ dict_table_t* table, /*!< in: InnoDB table */
+ dict_index_t* index); /*!< in: InnoDB index */
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied by indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_foreign_constraints(
+/*============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES
+ table2(c, d), table2 can be written
+ also with the database
+ name before it: test.table2; the
+					default database is the database of
+ parameter name */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks); /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
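+/* Illustrative sketch only (hypothetical table names, in the normalized
+database_name/table_name form described above): the sql_string scanned here
+could look like
+
+	CREATE TABLE child (a INT, b INT,
+		FOREIGN KEY (a, b) REFERENCES parent(c, d))
+
+and the constraint can only be added if both test/child and test/parent
+already have indexes covering (a, b) and (c, d), respectively. */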
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+ulint
+dict_foreign_parse_drop_constraints(
+/*================================*/
+ mem_heap_t* heap, /*!< in: heap from which we can
+ allocate memory */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ ulint* n, /*!< out: number of constraints
+ to drop */
+ const char*** constraints_to_drop); /*!< out: id's of the
+ constraints to drop */
+/**********************************************************************//**
+Returns a table object and optionally increments its MySQL open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low is usually the
+appropriate function.
+@return table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get(
+/*===========*/
+ const char* table_name, /*!< in: table name */
+ ibool inc_mysql_count);
+ /*!< in: whether to increment the open
+ handle count on the table */
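+/* Usage sketch (hypothetical table name, for illustration only): callers
+outside the 'dict' directory typically pair this call with the handle-count
+decrement, assuming inc_mysql_count was passed as TRUE:
+
+	dict_table_t*	table = dict_table_get("test/t1", TRUE);
+
+	if (table != NULL) {
+		...
+		dict_table_decrement_handle_count(table, FALSE);
+	}
+*/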
+/**********************************************************************//**
+Returns an index object, based on table and index id, and memoryfixes it.
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_on_id_low(
+/*=====================*/
+ dict_table_t* table, /*!< in: table */
+ dulint index_id); /*!< in: index id */
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name); /*!< in: table name */
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name); /*!< in: table name */
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+ dulint table_id); /*!< in: table id */
+/**********************************************************************//**
+Find an index that is equivalent to the one passed in and is not marked
+for deletion.
+@return index equivalent to foreign->foreign_index, or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_equiv_index(
+/*==========================*/
+ dict_foreign_t* foreign);/*!< in: foreign key */
+/**********************************************************************//**
+Returns an index object by matching on the name and column names; if
+more than one index matches, returns the index with the max id.
+@return matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_by_max_id(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name, /*!< in: the index name to find */
+ const char** columns,/*!< in: array of column names */
+ ulint n_cols);/*!< in: number of columns */
+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint col_nr);/*!< in: column number */
+
+/**********************************************************************//**
+Prints a table definition. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Prints table data. */
+UNIV_INTERN
+void
+dict_table_print_low(
+/*=================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Prints table data when we know the table name. */
+UNIV_INTERN
+void
+dict_table_print_by_name(
+/*=====================*/
+ const char* name); /*!< in: table name */
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+ ibool create_table_format, /*!< in: if TRUE then print in
+ a format suitable to be inserted into
+ a CREATE TABLE, otherwise in the format
+ of SHOW TABLE STATUS */
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+ FILE* file, /*!< in: file where to print */
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ ibool add_newline); /*!< in: whether to add a newline */
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+ FILE* file, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index); /*!< in: index to print */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index); /*!< in: index */
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for a secondary index or the insert buffer tree, zero for
+the clustered index */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+ __attribute__((pure));
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table); /*!< in: table */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos); /*!< in: position of column */
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys); /*!< in: DATA_ROW_ID, ... */
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) \
+((table)->cols + (pos))
+#define dict_table_get_sys_col(table, sys) \
+((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS)
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys); /*!< in: DATA_ROW_ID, ... */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index); /*!< in: index */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+ const dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Set the file format of a table. */
+UNIV_INLINE
+void
+dict_table_set_format(
+/*==================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint format);/*!< in: file format version */
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_flags_to_zip_size(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+ const dict_table_t* table); /*!< in: table */
+/*********************************************************************//**
+Obtain exclusive locks on all index trees of the table. This is to prevent
+accessing index trees while InnoDB is updating internal metadata for
+operations such as truncate tables. */
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+ dict_table_t* table); /*!< in: table */
+/*********************************************************************//**
+Release the exclusive locks on all index trees. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+ dict_table_t* table); /*!< in: table */
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n); /*!< in: column number */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value. This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+ dulint id); /*!< in: index id */
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+ulint
+dict_index_add_to_cache(
+/*====================*/
+ dict_table_t* table, /*!< in: table on which the index is */
+ dict_index_t* index, /*!< in, own: index; NOTE! The index memory
+ object is freed in this function! */
+ ulint page_no,/*!< in: root page number of the index */
+ ibool strict);/*!< in: TRUE=refuse to create the index
+ if records could be too big to fit in
+					a B-tree page */
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in, own: index */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index); /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index); /*!< in: an internal representation
+ of index (in the dictionary cache) */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index); /*!< in: an internal representation
+ of index (in the dictionary cache) */
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index); /*!< in: an internal representation
+ of index (in the dictionary cache) */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos); /*!< in: position of field */
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos); /*!< in: position of the field */
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos); /*!< in: position of the field */
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n); /*!< in: column number */
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+@return TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint n); /*!< in: column number */
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+ const dict_index_t* index, /*!< in: index from which to search */
+ const dict_index_t* index2, /*!< in: index */
+ ulint n); /*!< in: field number in index2 */
+/********************************************************************//**
+Looks for the position of column n in the clustered index.
+@return position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint n); /*!< in: column number */
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint type); /*!< in: DATA_ROW_ID, ... */
+/*******************************************************************//**
+Adds a column to index. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+ dict_index_t* index, /*!< in/out: index */
+ const dict_table_t* table, /*!< in: table */
+ dict_col_t* col, /*!< in: column */
+ ulint prefix_len); /*!< in: column prefix length */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ const dict_index_t* index, /*!< in: index */
+ ulint n_fields); /*!< in: number of
+ field types to copy */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field); /*!< in: index field */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys->mutex is already being held.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+ dulint index_id); /*!< in: index id */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+ dulint index_id); /*!< in: index id */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+ const dict_index_t* index, /*!< in: index tree */
+ const dtuple_t* tuple); /*!< in: tuple used in a search */
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+ const dict_table_t* table, /*!< in: Check for dup indexes
+ in this table */
+ ibool tmp_ok);/*!< in: TRUE=allow temporary
+ index names */
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to build node
+ pointer */
+ ulint page_no,/*!< in: page number to put in node
+ pointer */
+ mem_heap_t* heap, /*!< in: memory heap where pointer
+ created */
+ ulint level); /*!< in: level of rec in tree:
+ 0 means leaf level */
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely.
+@return pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record for which to
+ copy prefix */
+ ulint* n_fields,/*!< out: number of fields copied */
+ byte** buf, /*!< in/out: memory buffer for the
+ copied prefix, or NULL */
+ ulint* buf_size);/*!< in/out: buffer size */
+/**********************************************************************//**
+Builds a typed data tuple out of a physical record.
+@return own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ rec_t* rec, /*!< in: record for which to build data tuple */
+ ulint n_fields,/*!< in: number of data fields */
+ mem_heap_t* heap); /*!< in: memory heap where tuple created */
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+ const dict_index_t* index); /*!< in: index */
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint space); /*!< in: space id */
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+ const dict_index_t* tree); /*!< in: index */
+/*********************************************************************//**
+Sets the page number of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_page(
+/*================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint page); /*!< in: page number */
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+ const dict_index_t* index); /*!< in: index */
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics_low(
+/*=======================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool has_dict_mutex, /*!< in: TRUE if the caller has the
+ dictionary mutex */
+ ibool sync);
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics(
+/*===================*/
+ dict_table_t* table, /*!< in/out: table */
+ ibool sync);
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void);
+/*============================*/
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void);
+/*===========================*/
+/**********************************************************************//**
+Lock the appropriate mutex to protect index->stat_n_diff_key_vals[].
+index->id is used to pick the right mutex and it should not change
+before dict_index_stat_mutex_exit() is called on this index. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_enter(
+/*========================*/
+ const dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Unlock the appropriate mutex that protects index->stat_n_diff_key_vals[]. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_exit(
+/*=======================*/
+ const dict_index_t* index); /*!< in: index */
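+/* Usage sketch (illustration only; n_diff and i are hypothetical local
+variables): reads of index->stat_n_diff_key_vals[] are bracketed by this
+enter/exit pair so that the value read is consistent:
+
+	dict_index_stat_mutex_enter(index);
+	n_diff = index->stat_n_diff_key_vals[i];
+	dict_index_stat_mutex_exit(index);
+*/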
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+ const char* name1, /*!< in: table name in the form
+ dbname '/' tablename */
+ const char* name2); /*!< in: table name in the form
+ dbname '/' tablename */
+/*********************************************************************//**
+Removes an index from the cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+ dict_table_t* table, /*!< in/out: table */
+ dict_index_t* index); /*!< in, own: index */
+/**********************************************************************//**
+Get index by name.
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name); /*!< in: name of the index to find */
+/**********************************************************************//**
+In case there is more than one index with the same name, return the index
+with the min(id).
+@return index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*====================================*/
+ dict_table_t* table, /*!< in: table */
+ const char* name); /*!< in: name of the index to find */
+
+UNIV_INTERN
+void
+dict_table_LRU_trim(
+/*================*/
+ dict_table_t* self);
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE* dict_foreign_err_file;
+extern mutex_t dict_foreign_err_mutex; /* mutex protecting the buffers */
+
+/** the dictionary system */
+extern dict_sys_t* dict_sys;
+/** the data dictionary rw-latch protecting dict_sys */
+extern rw_lock_t dict_operation_lock;
+
+/* Dictionary system struct */
+struct dict_sys_struct{
+ mutex_t mutex; /*!< mutex protecting the data
+ dictionary; protects also the
+ disk-based dictionary system tables;
+ this mutex serializes CREATE TABLE
+ and DROP TABLE, as well as reading
+ the dictionary data for a table from
+ system tables */
+ dulint row_id; /*!< the next row id to assign;
+ NOTE that at a checkpoint this
+ must be written to the dict system
+ header and flushed to a file; in
+ recovery this must be derived from
+ the log records */
+ hash_table_t* table_hash; /*!< hash table of the tables, based
+ on name */
+ hash_table_t* table_id_hash; /*!< hash table of the tables, based
+ on id */
+ UT_LIST_BASE_NODE_T(dict_table_t)
+ table_LRU; /*!< LRU list of tables */
+ ulint size; /*!< varying space in bytes occupied
+ by the data dictionary table and
+ index objects */
+ dict_table_t* sys_tables; /*!< SYS_TABLES table */
+ dict_table_t* sys_columns; /*!< SYS_COLUMNS table */
+ dict_table_t* sys_indexes; /*!< SYS_INDEXES table */
+ dict_table_t* sys_fields; /*!< SYS_FIELDS table */
+ dict_table_t* sys_stats; /*!< SYS_STATS table */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+extern dict_index_t* dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+extern dict_index_t* dict_ind_compact;
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void);
+/*===============*/
+
+/**********************************************************************//**
+Closes the data dictionary module. */
+UNIV_INTERN
+void
+dict_close(void);
+/*============*/
+
+/*************************************************************************
+Sets the is_corrupt flag of the table(s) belonging to the given space id. */
+
+void
+dict_table_set_corrupt_by_space(
+/*============================*/
+ ulint space_id,
+ ibool need_mutex);
+
+#ifndef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic
new file mode 100644
index 00000000000..bd7534dc7e2
--- /dev/null
+++ b/storage/xtradb/include/dict0dict.ic
@@ -0,0 +1,861 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.ic
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0load.h"
+#include "rem0types.h"
+
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+ const dict_col_t* col, /*!< in: column */
+ dtype_t* type) /*!< out: data type */
+{
+ ut_ad(col && type);
+
+ type->mtype = col->mtype;
+ type->prtype = col->prtype;
+ type->len = col->len;
+ type->mbminlen = col->mbminlen;
+ type->mbmaxlen = col->mbmaxlen;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ const dtype_t* type) /*!< in: data type */
+{
+ ut_ad(col);
+ ut_ad(type);
+
+ ut_ad(col->mtype == type->mtype);
+ ut_ad(col->prtype == type->prtype);
+ ut_ad(col->len == type->len);
+# ifndef UNIV_HOTBACKUP
+ ut_ad(col->mbminlen == type->mbminlen);
+ ut_ad(col->mbmaxlen == type->mbmaxlen);
+# endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen));
+}
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+ const dict_col_t* col) /*!< in: column */
+{
+ return(dtype_get_max_size_low(col->mtype, col->len));
+}
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dtype_get_fixed_size_low(col->mtype, col->prtype, col->len,
+ col->mbminlen, col->mbmaxlen, comp));
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+ const dict_col_t* col, /*!< in: column */
+ ulint comp) /*!< in: nonzero=ROW_FORMAT=COMPACT */
+{
+ return(dict_col_get_fixed_size(col, comp));
+}
+
+/*********************************************************************//**
+Gets the column number.
+@return col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+ const dict_col_t* col) /*!< in: column */
+{
+ ut_ad(col);
+
+ return(col->ind);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+ const dict_col_t* col, /*!< in: table column */
+ const dict_index_t* clust_index) /*!< in: clustered index */
+{
+ ulint i;
+
+ ut_ad(col);
+ ut_ad(clust_index);
+ ut_ad(dict_index_is_clust(clust_index));
+
+ for (i = 0; i < clust_index->n_def; i++) {
+ const dict_field_t* field = &clust_index->fields[i];
+
+ if (!field->prefix_len && field->col == col) {
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(UT_LIST_GET_FIRST(((dict_table_t*) table)->indexes));
+}
+
+/********************************************************************//**
+Gets the next index on the table.
+@return index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UT_LIST_GET_NEXT(indexes, (dict_index_t*) index));
+}
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UNIV_UNLIKELY(index->type & DICT_CLUSTERED));
+}
+/********************************************************************//**
+Check whether the index is unique.
+@return nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UNIV_UNLIKELY(index->type & DICT_UNIQUE));
+}
+
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(UNIV_UNLIKELY(index->type & DICT_IBUF));
+}
+
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return nonzero for a secondary index or the insert buffer tree, zero for
+the clustered index */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint type;
+
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ type = index->type;
+
+ return(UNIV_LIKELY(!(type & DICT_CLUSTERED) || (type & DICT_IBUF)));
+}
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+ const dict_table_t* table __attribute__((unused))) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+ ut_ad(table->cached);
+
+ return(DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint pos) /*!< in: position of column */
+{
+ ut_ad(table);
+ ut_ad(pos < table->n_def);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return((dict_col_t*) (table->cols) + pos);
+}
+
+/********************************************************************//**
+Gets the given system column of a table.
+@return pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+{
+ dict_col_t* col;
+
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ col = dict_table_get_nth_col(table, table->n_cols
+ - DATA_N_SYS_COLS + sys);
+ ut_ad(col->mtype == DATA_SYS);
+ ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+ return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+ const dict_table_t* table, /*!< in: table */
+ ulint sys) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(table);
+ ut_ad(sys < DATA_N_SYS_COLS);
+ ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+ return(table->n_cols - DATA_N_SYS_COLS + sys);
+}
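+
+/* Worked example (illustration only, assuming DATA_N_SYS_COLS == 3 and
+DATA_ROW_ID == 0): a table created with three user columns (a, b, c) has
+n_cols == 6, the system columns being stored last, so
+dict_table_get_sys_col_no(table, DATA_ROW_ID) == 3 while the user columns
+keep positions 0..2. */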
+
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+#if DICT_TF_COMPACT != TRUE
+#error
+#endif
+
+ return(UNIV_LIKELY(table->flags & DICT_TF_COMPACT));
+}
+
+/********************************************************************//**
+Determine the file format of a table.
+@return file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return((table->flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT);
+}
+
+/********************************************************************//**
+Set the file format of a table. */
+UNIV_INLINE
+void
+dict_table_set_format(
+/*==================*/
+ dict_table_t* table, /*!< in/out: table */
+ ulint format) /*!< in: file format version */
+{
+ ut_ad(table);
+
+ table->flags = (table->flags & ~DICT_TF_FORMAT_MASK)
+ | (format << DICT_TF_FORMAT_SHIFT);
+}
+
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_flags_to_zip_size(
+/*=========================*/
+ ulint flags) /*!< in: flags */
+{
+ ulint zip_size = flags & DICT_TF_ZSSIZE_MASK;
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
+ << (zip_size >> DICT_TF_ZSSIZE_SHIFT));
+
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+ }
+
+ return(zip_size);
+}
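+
+/* Worked example (illustration only; the constants are assumptions of this
+sketch): with PAGE_ZIP_MIN_SIZE == 1024 and DICT_TF_ZSSIZE_SHIFT == 1, a
+ZSSIZE field of n in the flags yields 512 << n bytes, i.e. 1 -> 1024,
+2 -> 2048, 3 -> 4096, 4 -> 8192, 5 -> 16384; a field of 0 means the table is
+not compressed and 0 is returned. */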
+
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+
+ return(dict_table_flags_to_zip_size(table->flags));
+}
+
+/*********************************************************************//**
+Obtain exclusive locks on all index trees of the table. This is to prevent
+accessing index trees while InnoDB is updating internal metadata for
+operations such as truncate tables. */
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+
+ ut_a(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Loop through each index of the table and lock them */
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_lock(dict_index_get_lock(index));
+ }
+}
+
+/*********************************************************************//**
+Release the exclusive locks on all index trees. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+ dict_table_t* table) /*!< in: table */
+{
+ dict_index_t* index;
+
+ ut_a(table);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ for (index = dict_table_get_first_index(table);
+ index != NULL;
+ index = dict_table_get_next_index(index)) {
+ rw_lock_x_unlock(dict_index_get_lock(index));
+ }
+}
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal
+ representation of index (in
+ the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ return(index->n_uniq);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(index->cached);
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_index_get_n_unique(index));
+ }
+
+ return(dict_index_get_n_fields(index));
+}
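+
+/* A worked example (hypothetical table): for CREATE TABLE t(a INT PRIMARY
+KEY, b INT, KEY(b)), the clustered index is stored internally as
+(a, DB_TRX_ID, DB_ROLL_PTR, b) with n_uniq == 1, so "a" alone positions an
+entry in its B-tree; the secondary index on b is stored internally as
+(b, a), and because b alone need not be unique, all of its fields
+(n_fields == 2) are used. */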
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+ const dict_index_t* index) /*!< in: an internal representation
+ of index (in the dictionary cache) */
+{
+ return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of field */
+{
+ ut_ad(index);
+ ut_ad(pos < index->n_def);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint type) /*!< in: DATA_ROW_ID, ... */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+ ut_ad(!(index->type & DICT_UNIVERSAL));
+
+ if (dict_index_is_clust(index)) {
+
+ return(dict_col_get_clust_pos(
+ dict_table_get_sys_col(index->table, type),
+ index));
+ }
+
+ return(dict_index_get_nth_col_pos(
+ index, dict_table_get_sys_col_no(index->table, type)));
+}
+
+/*********************************************************************//**
+Gets the field column.
+@return field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+ const dict_field_t* field) /*!< in: index field */
+{
+ ut_ad(field);
+
+ return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+ const dict_index_t* index, /*!< in: index */
+ ulint pos) /*!< in: position of the field */
+{
+ return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ulint n = dict_index_get_n_fields(index);
+ ulint size = 0;
+
+ while (n--) {
+ size += dict_col_get_min_size(dict_index_get_nth_col(index,
+ n));
+ }
+
+ return(size);
+}
+
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->space);
+}
+
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint space) /*!< in: space id */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->space = space;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+ const dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(index->page);
+}
+
+/*********************************************************************//**
+Sets the page number of the root of index tree. */
+UNIV_INLINE
+void
+dict_index_set_page(
+/*================*/
+ dict_index_t* index, /*!< in/out: index */
+ ulint page) /*!< in: page number */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ index->page = page;
+}
+
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+ dict_index_t* index) /*!< in: index */
+{
+ ut_ad(index);
+ ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+ return(&(index->lock));
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+ return(UNIV_PAGE_SIZE / 16);
+}
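+
+/* With the default 16 KiB UNIV_PAGE_SIZE (a build-time assumption), this
+reserve amounts to 1024 bytes per page. */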
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+ ulint table_fold;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ table_fold = ut_fold_string(table_name);
+
+ HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ !strcmp(table->name, table_name));
+
+ /* make young in table_LRU */
+ if (table) {
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+ }
+
+ return(table);
+}
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+ const char* table_name) /*!< in: table name */
+{
+ dict_table_t* table;
+
+ ut_ad(table_name);
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ table = dict_table_check_if_in_cache_low(table_name);
+
+ if (table == NULL) {
+ table = dict_load_table(table_name);
+ }
+
+ ut_ad(!table || table->cached);
+
+ return(table);
+}
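+
+/* A minimal lookup sketch (hypothetical, for illustration only); the
+low-level lookup functions above assume that the caller already holds
+dict_sys->mutex:
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+ table = dict_table_get_low("test/t1");
+ mutex_exit(&dict_sys->mutex);
+
+ if (table == NULL) {
+ ... the table does not exist or could not be loaded ...
+ }
+*/
+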
+
+/**********************************************************************//**
+Returns a table object based on table id.
+@return table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+ dulint table_id) /*!< in: table id */
+{
+ dict_table_t* table;
+ ulint fold;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ /* Look for the table name in the hash table */
+ fold = ut_fold_dulint(table_id);
+
+ HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
+ dict_table_t*, table, ut_ad(table->cached),
+ !ut_dulint_cmp(table->id, table_id));
+ if (table == NULL) {
+ table = dict_load_table_on_id(table_id);
+ }
+
+ /* make young in table_LRU */
+ if (table) {
+ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+ }
+
+ ut_ad(!table || table->cached);
+
+ /* TODO: should get the type information from MySQL */
+
+ return(table);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/dict0load.h b/storage/xtradb/include/dict0load.h
new file mode 100644
index 00000000000..f41882019d5
--- /dev/null
+++ b/storage/xtradb/include/dict0load.h
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+
+/********************************************************************//**
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan the biggest space id, and store it to fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+ ibool in_crash_recovery); /*!< in: are we doing a crash recovery */
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if does not exist; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+ const char* name); /*!< in: database name which ends in '/' */
+/********************************************************************//**
+Loads a table definition together with all of its index definitions, and
+the cluster definition if the table is a member of a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table.
+@return table, NULL if does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the
+ibd_file_missing flag TRUE in the table object we return */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+ const char* name); /*!< in: table name in the
+ databasename/tablename format */
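+/* A hedged usage sketch (illustrative only; the table name is arbitrary):
+callers are expected to hold the dictionary mutex and to check the
+ibd_file_missing flag of the returned table for file-per-table tablespaces.
+
+ dict_table_t* table;
+
+ mutex_enter(&dict_sys->mutex);
+ table = dict_load_table("test/t1");
+ mutex_exit(&dict_sys->mutex);
+
+ if (table != NULL && table->ibd_file_missing) {
+ ... report the orphaned table to the user ...
+ }
+*/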
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+ dulint table_id); /*!< in: table id */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+ dict_table_t* table); /*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_load_foreigns(
+/*===============*/
+ const char* table_name, /*!< in: table name */
+ ibool check_recursive,/*!< in: Whether to check recursive
+ load of tables chained by FK */
+ ibool check_charsets);/*!< in: TRUE=check charsets
+ compatibility */
+/********************************************************************//**
+Prints to the standard output information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void);
+/*============*/
+
+
+#ifndef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0load.ic b/storage/xtradb/include/dict0load.ic
new file mode 100644
index 00000000000..ccc16db165b
--- /dev/null
+++ b/storage/xtradb/include/dict0load.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.ic
+Loads database object definitions from the dictionary tables
+into the memory cache
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
new file mode 100644
index 00000000000..6736c2a3a36
--- /dev/null
+++ b/storage/xtradb/include/dict0mem.h
@@ -0,0 +1,577 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#ifndef UNIV_HOTBACKUP
+# include "lock0types.h"
+# include "que0types.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED 1 /*!< clustered index */
+#define DICT_UNIQUE 2 /*!< unique index */
+#define DICT_UNIVERSAL 4 /*!< index which can contain records from any
+ other index */
+#define DICT_IBUF 8 /*!< insert buffer tree */
+/* @} */
+
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY 1 /*!< ordinary table */
+#if 0 /* not implemented */
+#define DICT_TABLE_CLUSTER_MEMBER 2
+#define DICT_TABLE_CLUSTER 3 /* this means that the table is
+ really a cluster definition */
+#endif
+
+/** Table flags. All unused bits must be 0. */
+/* @{ */
+#define DICT_TF_COMPACT 1 /* Compact page format.
+ This must be set for
+ new file formats
+ (later than
+ DICT_TF_FORMAT_51). */
+
+/** Compressed page size (0=uncompressed, up to 15 compressed sizes) */
+/* @{ */
+#define DICT_TF_ZSSIZE_SHIFT 1
+#define DICT_TF_ZSSIZE_MASK (15 << DICT_TF_ZSSIZE_SHIFT)
+#define DICT_TF_ZSSIZE_MAX (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 1)
+/* @} */
+
+/** File format */
+/* @{ */
+#define DICT_TF_FORMAT_SHIFT 5 /* file format */
+#define DICT_TF_FORMAT_MASK \
+((~(~0 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT))) << DICT_TF_FORMAT_SHIFT)
+#define DICT_TF_FORMAT_51 0 /*!< InnoDB/MySQL up to 5.1 */
+#define DICT_TF_FORMAT_ZIP 1 /*!< InnoDB plugin for 5.1:
+ compressed tables,
+ new BLOB treatment */
+/** Maximum supported file format */
+#define DICT_TF_FORMAT_MAX DICT_TF_FORMAT_ZIP
+/* @} */
+#define DICT_TF_BITS 6 /*!< number of flag bits */
+#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
+# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
+#endif
+/* @} */
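+
+/* A sketch of the layout above (bit 0: DICT_TF_COMPACT, bits 1..4: the
+zssize code, bit 5: the file format): a hypothetical compressed table with
+an 8 KiB page size (zssize code 4) in the ZIP format could have its flags
+composed as
+
+ ulint flags = DICT_TF_COMPACT
+ | (4 << DICT_TF_ZSSIZE_SHIFT)
+ | (DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT);
+*/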
+
+/** @brief Additional table flags.
+
+These flags will be stored in SYS_TABLES.MIX_LEN. All unused flags
+will be written as 0. The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. */
+/* @{ */
+#define DICT_TF2_SHIFT DICT_TF_BITS
+ /*!< Shift value for
+ table->flags. */
+#define DICT_TF2_TEMPORARY 1 /*!< TRUE for tables from
+ CREATE TEMPORARY TABLE. */
+#define DICT_TF2_BITS (DICT_TF2_SHIFT + 1)
+ /*!< Total number of bits
+ in table->flags. */
+/* @} */
+
+/** Tables can be chained together with foreign key constraints. When the
+parent table is first loaded, we would also load all of its descendants.
+This could eventually result in recursive calls and a stack overflow.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when it is exceeded, the child table is not loaded. It will be loaded later,
+when the foreign key constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD 250
+
+/** Similarly, when tables are chained together with foreign key constraints
+that have cascading delete/update clauses, a delete from the parent table
+could result in recursive cascading calls. This defines the maximum number
+of such cascading deletes/updates allowed. When it is exceeded, the delete
+from the parent table fails, and the user has to drop the excessive foreign
+key constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL 300
+
+/**********************************************************************//**
+Creates a table memory object.
+@return own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+ const char* name, /*!< in: table name */
+ ulint space, /*!< in: space where the clustered index
+ of the table is placed; this parameter
+ is ignored if the table is made
+ a member of a cluster */
+ ulint n_cols, /*!< in: number of columns */
+ ulint flags); /*!< in: table flags */
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+ dict_table_t* table); /*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+ dict_table_t* table, /*!< in: table */
+ mem_heap_t* heap, /*!< in: temporary memory heap, or NULL */
+ const char* name, /*!< in: column name, or NULL */
+ ulint mtype, /*!< in: main datatype */
+ ulint prtype, /*!< in: precise type */
+ ulint len); /*!< in: precision */
+/**********************************************************************//**
+Creates an index memory object.
+@return own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+ const char* table_name, /*!< in: table name */
+ const char* index_name, /*!< in: index name */
+ ulint space, /*!< in: space where the index tree is
+ placed, ignored if the index is of
+ the clustered type */
+ ulint type, /*!< in: DICT_UNIQUE,
+ DICT_CLUSTERED, ... ORed */
+ ulint n_fields); /*!< in: number of fields */
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+ dict_index_t* index, /*!< in: index */
+ const char* name, /*!< in: column name */
+ ulint prefix_len); /*!< in: 0 or the column prefix length
+ in a MySQL index like
+ INDEX (textcol(25)) */
+/**********************************************************************//**
+Frees an index memory object. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+ dict_index_t* index); /*!< in: index */
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
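+/* A hedged usage sketch (illustrative only; the table, column and index
+names and the type constants chosen here are arbitrary): a table memory
+object is built by creating it, adding its columns, and then creating index
+objects whose fields refer to those columns by name.
+
+ dict_table_t* table;
+ dict_index_t* index;
+
+ table = dict_mem_table_create("test/t1", 0, 2, DICT_TF_COMPACT);
+ dict_mem_table_add_col(table, table->heap, "id",
+ DATA_INT, DATA_NOT_NULL, 4);
+ dict_mem_table_add_col(table, table->heap, "name",
+ DATA_VARCHAR, 0, 100);
+
+ index = dict_mem_index_create("test/t1", "PRIMARY", 0,
+ DICT_CLUSTERED | DICT_UNIQUE, 1);
+ dict_mem_index_add_field(index, "id", 0);
+*/
+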
+/** Data structure for a column in a table */
+struct dict_col_struct{
+ /*----------------------*/
+ /** The following are copied from dtype_t,
+ so that all bit-fields can be packed tightly. */
+ /* @{ */
+ unsigned mtype:8; /*!< main data type */
+ unsigned prtype:24; /*!< precise type; MySQL data
+ type, charset code, flags to
+ indicate nullability,
+ signedness, whether this is a
+ binary string, whether this is
+ a true VARCHAR where MySQL
+ uses 2 bytes to store the length */
+
+ /* the remaining fields do not affect alphabetical ordering: */
+
+ unsigned len:16; /*!< length; for MySQL data this
+ is field->pack_length(),
+ except that for a >= 5.0.3
+ type true VARCHAR this is the
+ maximum byte length of the
+ string data (in addition to
+ the string, MySQL uses 1 or 2
+ bytes to store the string length) */
+
+ unsigned mbminlen:2; /*!< minimum length of a
+ character, in bytes */
+ unsigned mbmaxlen:3; /*!< maximum length of a
+ character, in bytes */
+ /*----------------------*/
+ /* End of definitions copied from dtype_t */
+ /* @} */
+
+ unsigned ind:10; /*!< table column position
+ (starting from 0) */
+ unsigned ord_part:1; /*!< nonzero if this column
+ appears in the ordering fields
+ of an index */
+};
+
+/** @brief DICT_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed column length (or indexed prefix length).
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes. This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_MAX_INDEX_COL_LEN REC_MAX_INDEX_COL_LEN
+
+/** Data structure for a field in an index */
+struct dict_field_struct{
+ dict_col_t* col; /*!< pointer to the table column */
+ const char* name; /*!< name of the column */
+ unsigned prefix_len:10; /*!< 0 or the length of the column
+ prefix in bytes in a MySQL index of
+ type, e.g., INDEX (textcol(25));
+ must be smaller than
+ DICT_MAX_INDEX_COL_LEN; NOTE that
+ in the UTF-8 charset, MySQL sets this
+ to 3 * the prefix len in UTF-8 chars */
+ unsigned fixed_len:10; /*!< 0 or the fixed length of the
+ column if smaller than
+ DICT_MAX_INDEX_COL_LEN */
+};
+
+/** Data structure for an index. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_struct{
+ dulint id; /*!< id of the index */
+ mem_heap_t* heap; /*!< memory heap */
+ const char* name; /*!< index name */
+ const char* table_name;/*!< table name */
+ dict_table_t* table; /*!< back pointer to table */
+#ifndef UNIV_HOTBACKUP
+ unsigned space:32;
+ /*!< space where the index tree is placed */
+ unsigned page:32;/*!< index tree root page number */
+#endif /* !UNIV_HOTBACKUP */
+ unsigned type:4; /*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+ DICT_UNIVERSAL, DICT_IBUF) */
+ unsigned trx_id_offset:10;/*!< position of the trx id column
+ in a clustered index record, if the fields
+ before it are known to be of a fixed size,
+ 0 otherwise */
+ unsigned n_user_defined_cols:10;
+ /*!< number of columns the user defined to
+ be in the index: in the internal
+ representation we add more columns */
+ unsigned n_uniq:10;/*!< number of fields from the beginning
+ which are enough to determine an index
+ entry uniquely */
+ unsigned n_def:10;/*!< number of fields defined so far */
+ unsigned n_fields:10;/*!< number of fields in the index */
+ unsigned n_nullable:10;/*!< number of nullable fields */
+ unsigned cached:1;/*!< TRUE if the index object is in the
+ dictionary cache */
+ unsigned to_be_dropped:1;
+ /*!< TRUE if this index is marked to be
+ dropped in ha_innobase::prepare_drop_index(),
+ otherwise FALSE */
+ dict_field_t* fields; /*!< array of field descriptions */
+#ifndef UNIV_HOTBACKUP
+ UT_LIST_NODE_T(dict_index_t)
+ indexes;/*!< list of indexes of the table */
+ btr_search_t* search_info; /*!< info used in optimistic searches */
+ /*----------------------*/
+ /** Statistics for query optimization */
+ /* @{ */
+ ib_int64_t* stat_n_diff_key_vals;
+ /*!< approximate number of different
+ key values for this index, for each
+ n-column prefix where n <=
+ dict_get_n_unique(index); we
+ periodically calculate new
+ estimates */
+ ulint stat_index_size;
+ /*!< approximate index size in
+ database pages */
+ ulint stat_n_leaf_pages;
+ /*!< approximate number of leaf pages in the
+ index tree */
+ /* @} */
+ rw_lock_t lock; /*!< read-write lock protecting the
+ upper levels of the index tree */
+ ib_uint64_t trx_id; /*!< id of the transaction that created this
+ index, or 0 if the index existed
+ when InnoDB was started up */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number */
+/** Value of dict_index_struct::magic_n */
+# define DICT_INDEX_MAGIC_N 76789786
+#endif
+};
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D). Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_struct{
+ mem_heap_t* heap; /*!< this object is allocated from
+ this memory heap */
+ char* id; /*!< id of the constraint as a
+ null-terminated string */
+ unsigned n_fields:10; /*!< number of indexes' first fields
+ for which the foreign key
+ constraint is defined: we allow the
+ indexes to contain more fields than
+ mentioned in the constraint, as long
+ as the first fields are as mentioned */
+ unsigned type:6; /*!< 0 or DICT_FOREIGN_ON_DELETE_CASCADE
+ or DICT_FOREIGN_ON_DELETE_SET_NULL */
+ char* foreign_table_name;/*!< foreign table name */
+ dict_table_t* foreign_table; /*!< table where the foreign key is */
+ const char** foreign_col_names;/*!< names of the columns in the
+ foreign key */
+ char* referenced_table_name;/*!< referenced table name */
+ dict_table_t* referenced_table;/*!< table where the referenced key
+ is */
+ const char** referenced_col_names;/*!< names of the referenced
+ columns in the referenced table */
+ dict_index_t* foreign_index; /*!< foreign index; we require that
+ both tables contain explicitly defined
+ indexes for the constraint: InnoDB
+ does not generate new indexes
+ implicitly */
+ dict_index_t* referenced_index;/*!< referenced index */
+ UT_LIST_NODE_T(dict_foreign_t)
+ foreign_list; /*!< list node for foreign keys of the
+ table */
+ UT_LIST_NODE_T(dict_foreign_t)
+ referenced_list;/*!< list node for referenced
+ keys of the table */
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE 1 /*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL 2 /*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE 4 /*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL 8 /*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16 /*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32 /*!< ON UPDATE NO ACTION */
+/* @} */
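+
+/* For example (hypothetical constraint): a foreign key declared with
+ON DELETE CASCADE ON UPDATE SET NULL would carry
+
+ foreign->type = DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL;
+
+while a plain RESTRICT constraint leaves foreign->type == 0. */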
+
+
+/** Data structure for a database table. Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+struct dict_table_struct{
+ dulint id; /*!< id of the table */
+ mem_heap_t* heap; /*!< memory heap */
+ char* name; /*!< table name */
+ const char* dir_path_of_temp_table;/*!< NULL or the directory path
+ where a TEMPORARY table that was explicitly
+ created by a user should be placed if
+ innodb_file_per_table is defined in my.cnf;
+ in Unix this is usually /tmp/..., in Windows
+ temp\... */
+ unsigned space:32;
+ /*!< space where the clustered index of the
+ table is placed */
+ unsigned flags:DICT_TF2_BITS;/*!< DICT_TF_COMPACT, ... */
+ unsigned ibd_file_missing:1;
+ /*!< TRUE if this is in a single-table
+ tablespace and the .ibd file is missing; then
+ we must return in ha_innodb.cc an error if the
+ user tries to query such an orphaned table */
+ unsigned tablespace_discarded:1;
+ /*!< this flag is set TRUE when the user
+ calls DISCARD TABLESPACE on this
+ table, and reset to FALSE in IMPORT
+ TABLESPACE */
+ unsigned cached:1;/*!< TRUE if the table object has been added
+ to the dictionary cache */
+ unsigned n_def:10;/*!< number of columns defined so far */
+ unsigned n_cols:10;/*!< number of columns */
+ dict_col_t* cols; /*!< array of column descriptions */
+ const char* col_names;
+ /*!< Column names packed in a character string
+ "name1\0name2\0...nameN\0". Until
+ the string contains n_cols, it will be
+ allocated from a temporary heap. The final
+ string will be allocated from table->heap. */
+#ifndef UNIV_HOTBACKUP
+ hash_node_t name_hash; /*!< hash chain node */
+ hash_node_t id_hash; /*!< hash chain node */
+ UT_LIST_BASE_NODE_T(dict_index_t)
+ indexes; /*!< list of indexes of the table */
+ UT_LIST_BASE_NODE_T(dict_foreign_t)
+ foreign_list;/*!< list of foreign key constraints
+ in the table; these refer to columns
+ in other tables */
+ UT_LIST_BASE_NODE_T(dict_foreign_t)
+ referenced_list;/*!< list of foreign key constraints
+ which refer to this table */
+ UT_LIST_NODE_T(dict_table_t)
+ table_LRU; /*!< node of the LRU list of tables */
+ ulint n_mysql_handles_opened;
+ /*!< count of how many handles MySQL has opened
+ to this table; dropping of the table is
+ NOT allowed until this count gets to zero;
+ MySQL does NOT itself check the number of
+ open handles at drop */
+ unsigned fk_max_recusive_level:8;
+ /*!< maximum recursion level we support when
+ loading tables chained together with FK
+ constraints; if this level is exceeded, we
+ stop loading the child tables along with
+ their parent table */
+ ulint n_foreign_key_checks_running;
+ /*!< count of how many foreign key check
+ operations are currently being performed
+ on the table: we cannot drop the table while
+ there are foreign key checks running on
+ it! */
+ trx_id_t query_cache_inv_trx_id;
+ /*!< transactions whose trx id is
+ smaller than this number are not
+ allowed to store to the MySQL query
+ cache or retrieve from it; when a trx
+ with undo logs commits, it sets this
+ to the value of the trx id counter for
+ the tables it had an IX lock on */
+ UT_LIST_BASE_NODE_T(lock_t)
+ locks; /*!< list of locks on the table */
+#ifdef UNIV_DEBUG
+ /*----------------------*/
+ ibool does_not_fit_in_memory;
+ /*!< this field is used to specify in
+ simulations tables which are so big
+ that disk should be accessed: disk
+ access is simulated by putting the
+ thread to sleep for a while; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about value TRUE if it has
+ to reload the table definition from
+ disk */
+#endif /* UNIV_DEBUG */
+ /*----------------------*/
+ unsigned big_rows:1;
+ /*!< flag: TRUE if the maximum length of
+ a single row exceeds BIG_ROW_SIZE;
+ initialized in dict_table_add_to_cache() */
+ /** Statistics for query optimization */
+ /* @{ */
+ unsigned stat_initialized:1; /*!< TRUE if statistics have
+ been calculated the first time
+ after database startup or table creation */
+ ib_int64_t stat_n_rows;
+ /*!< approximate number of rows in the table;
+ we periodically calculate new estimates */
+ ulint stat_clustered_index_size;
+ /*!< approximate clustered index size in
+ database pages */
+ ulint stat_sum_of_other_index_sizes;
+ /*!< approximate size of the other
+ indexes, in database pages */
+ ulint stat_modified_counter;
+ /*!< when a row is inserted, updated,
+ or deleted,
+ we add 1 to this number; we calculate new
+ estimates for the stat_... values for the
+ table and the indexes at an interval of 2 GB
+ or when about 1 / 16 of table has been
+ modified; also when the estimate operation is
+ called for MySQL SHOW TABLE STATUS; the
+ counter is reset to zero at statistics
+ calculation; this counter is not protected by
+ any latch, because this is only used for
+ heuristics */
+ /* @} */
+ /*----------------------*/
+ /** The following fields are used by the
+ AUTOINC code. The actual collection of
+ tables locked during AUTOINC read/write is
+ kept in trx_t. In order to quickly determine
+ whether a transaction has locked the AUTOINC
+ lock we keep a pointer to the transaction
+ here in the autoinc_trx variable. This is to
+ avoid acquiring the kernel mutex and scanning
+ the vector in trx_t.
+
+ When an AUTOINC lock has to wait, the
+ corresponding lock instance is created on
+ the trx lock heap rather than use the
+ pre-allocated instance in autoinc_lock below.*/
+ /* @{ */
+ lock_t* autoinc_lock;
+ /*!< a buffer for an AUTOINC lock
+ for this table: we allocate the memory here
+ so that individual transactions can get it
+ and release it without a need to allocate
+ space from the lock heap of the trx:
+ otherwise the lock heap would grow rapidly
+ if we do a large insert from a select */
+ mutex_t autoinc_mutex;
+ /*!< mutex protecting the autoincrement
+ counter */
+ ib_uint64_t autoinc;/*!< autoinc counter value to give to the
+ next inserted row */
+ ulong n_waiting_or_granted_auto_inc_locks;
+ /*!< This counter is used to track the number
+ of granted and pending autoinc locks on this
+ table. This value is set after acquiring the
+ kernel mutex but we peek the contents to
+ determine whether other transactions have
+ acquired the AUTOINC lock or not. Of course
+ only one transaction can be granted the
+ lock but there can be multiple waiters. */
+ const trx_t* autoinc_trx;
+ /*!< The transaction that currently holds the
+ AUTOINC lock on this table. */
+ /* @} */
+ /*----------------------*/
+ ibool is_corrupt;
+ /*!< TRUE if the table has been
+ flagged as corrupt (XtraDB) */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number */
+/** Value of dict_table_struct::magic_n */
+# define DICT_TABLE_MAGIC_N 76333786
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0mem.ic b/storage/xtradb/include/dict0mem.ic
new file mode 100644
index 00000000000..c36adb07a18
--- /dev/null
+++ b/storage/xtradb/include/dict0mem.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+
diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h
new file mode 100644
index 00000000000..7ad69193cc9
--- /dev/null
+++ b/storage/xtradb/include/dict0types.h
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+typedef struct dict_sys_struct dict_sys_t;
+typedef struct dict_col_struct dict_col_t;
+typedef struct dict_field_struct dict_field_t;
+typedef struct dict_index_struct dict_index_t;
+typedef struct dict_table_struct dict_table_t;
+typedef struct dict_foreign_struct dict_foreign_t;
+
+/* A cluster object is a table object with the type field set to
+DICT_CLUSTERED */
+
+typedef dict_table_t dict_cluster_t;
+
+typedef struct ind_node_struct ind_node_t;
+typedef struct tab_node_struct tab_node_t;
+
+/* Space id and page no where the dictionary header resides */
+#define DICT_HDR_SPACE 0 /* the SYSTEM tablespace */
+#define DICT_HDR_PAGE_NO FSP_DICT_HDR_PAGE_NO
+
+#endif
diff --git a/storage/xtradb/include/dyn0dyn.h b/storage/xtradb/include/dyn0dyn.h
new file mode 100644
index 00000000000..121a5946ac7
--- /dev/null
+++ b/storage/xtradb/include/dyn0dyn.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.h
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dyn0dyn_h
+#define dyn0dyn_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "mem0mem.h"
+
+/** A block in a dynamically allocated array */
+typedef struct dyn_block_struct dyn_block_t;
+/** Dynamically allocated array */
+typedef dyn_block_t dyn_array_t;
+
+
+/** This is the initial 'payload' size of a dynamic array;
+this must be > MLOG_BUF_MARGIN + 30! */
+#define DYN_ARRAY_DATA_SIZE 512
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ dyn_array_t* arr); /*!< in: pointer to a memory buffer of
+ size sizeof(dyn_array_t) */
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size); /*!< in: size in bytes of the buffer; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ byte* ptr); /*!< in: buffer space from ptr up was not used */
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to
+the added element. The caller must copy the element to
+the pointer returned.
+@return pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size); /*!< in: size in bytes of the element */
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ ulint pos); /*!< in: position of element as bytes
+ from array start */
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/************************************************************//**
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/************************************************************//**
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+/********************************************************************//**
+Gets the next block in a dyn array.
+@return pointer to next, NULL if end of list */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ dyn_block_t* block); /*!< in: dyn array block */
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ dyn_block_t* block); /*!< in: dyn array block */
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ dyn_block_t* block); /*!< in: dyn array block */
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ const byte* str, /*!< in: string to write */
+ ulint len); /*!< in: string length */
+
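+/* A minimal usage sketch (hypothetical, for illustration only): the array
+header lives in a caller-provided buffer, typically on the stack, and only
+grows into heap-allocated blocks once the initial DYN_ARRAY_DATA_SIZE bytes
+are exhausted.
+
+ dyn_array_t arr;
+
+ dyn_array_create(&arr);
+ dyn_push_string(&arr, (const byte*) "hello", 5);
+
+ ... read back with dyn_array_get_data_size() and the block iterators ...
+
+ dyn_array_free(&arr);
+*/
+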
+/*#################################################################*/
+
+/** @brief A block in a dynamically allocated array.
+NOTE! Do not access the fields of the struct directly: the definition
+appears here only for the compiler to know its size! */
+struct dyn_block_struct{
+ mem_heap_t* heap; /*!< in the first block this is != NULL
+ if dynamic allocation has been needed */
+ ulint used; /*!< number of data bytes used in this block;
+ DYN_BLOCK_FULL_FLAG is set when the block
+ becomes full */
+ byte data[DYN_ARRAY_DATA_SIZE];
+ /*!< storage for array elements */
+ UT_LIST_BASE_NODE_T(dyn_block_t) base;
+ /*!< linear list of dyn blocks: this node is
+ used only in the first block */
+ UT_LIST_NODE_T(dyn_block_t) list;
+ /*!< linear list node: used in all blocks */
+#ifdef UNIV_DEBUG
+ ulint buf_end;/*!< only in the debug version: if dyn
+ array is opened, this is the buffer
+ end offset, else this is 0 */
+ ulint magic_n;/*!< magic number (DYN_BLOCK_MAGIC_N) */
+#endif
+};
+
+
+#ifndef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dyn0dyn.ic b/storage/xtradb/include/dyn0dyn.ic
new file mode 100644
index 00000000000..110e674abff
--- /dev/null
+++ b/storage/xtradb/include/dyn0dyn.ic
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.ic
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+/** Value of dyn_block_struct::magic_n */
+#define DYN_BLOCK_MAGIC_N 375767
+/** Flag for dyn_block_struct::used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG 0x1000000UL
+
+/************************************************************//**
+Adds a new block to a dyn array.
+@return created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+ dyn_array_t* arr); /*!< in: dyn array */
+
+
+/************************************************************//**
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ return(arr);
+}
+
+/************************************************************//**
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ if (arr->heap == NULL) {
+
+ return(arr);
+ }
+
+ return(UT_LIST_GET_LAST(arr->base));
+}
+
+/********************************************************************//**
+Gets the next block in a dyn array.
+@return pointer to next, NULL if end of list */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(arr && block);
+
+ if (arr->heap == NULL) {
+ ut_ad(arr == block);
+
+ return(NULL);
+ }
+
+ return(UT_LIST_GET_NEXT(list, block));
+}
+
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+ dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(block);
+
+ return((block->used) & ~DYN_BLOCK_FULL_FLAG);
+}
+
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+ dyn_block_t* block) /*!< in: dyn array block */
+{
+ ut_ad(block);
+
+ return(block->data);
+}
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+ dyn_array_t* arr) /*!< in: pointer to a memory buffer of
+ size sizeof(dyn_array_t) */
+{
+ ut_ad(arr);
+#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG
+# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG"
+#endif
+
+ arr->heap = NULL;
+ arr->used = 0;
+
+#ifdef UNIV_DEBUG
+ arr->buf_end = 0;
+ arr->magic_n = DYN_BLOCK_MAGIC_N;
+#endif
+ return(arr);
+}
+
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ if (arr->heap != NULL) {
+ mem_heap_free(arr->heap);
+ }
+
+#ifdef UNIV_DEBUG
+ arr->magic_n = 0;
+#endif
+}
+
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to the added element.
+The caller must copy the element to the pointer returned.
+@return pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size) /*!< in: size in bytes of the element */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ used = block->used;
+ }
+ }
+
+ block->used = used + size;
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+ return((block->data) + used);
+}
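+
+/* For illustration (hypothetical reader code): data pushed this way can be
+read back block by block with the iterator functions declared in dyn0dyn.h:
+
+ dyn_block_t* block = dyn_array_get_first_block(arr);
+
+ while (block != NULL) {
+ ... process dyn_block_get_data(block) for
+ dyn_block_get_used(block) bytes ...
+
+ block = dyn_array_get_next_block(arr, block);
+ }
+*/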
+
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ ulint size) /*!< in: size in bytes of the buffer; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+ ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+ ut_ad(size);
+
+ block = arr;
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ /* Get the last array block */
+
+ block = dyn_array_get_last_block(arr);
+ used = block->used;
+
+ if (used + size > DYN_ARRAY_DATA_SIZE) {
+ block = dyn_array_add_block(arr);
+ used = block->used;
+ ut_a(size <= DYN_ARRAY_DATA_SIZE);
+ }
+ }
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+#ifdef UNIV_DEBUG
+ ut_ad(arr->buf_end == 0);
+
+ arr->buf_end = used + size;
+#endif
+ return((block->data) + used);
+}
+
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+ dyn_array_t* arr, /*!< in: dynamic array */
+ byte* ptr) /*!< in: buffer space from ptr up was not used */
+{
+ dyn_block_t* block;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ block = dyn_array_get_last_block(arr);
+
+ ut_ad(arr->buf_end + block->data >= ptr);
+
+ block->used = ptr - block->data;
+
+ ut_ad(block->used <= DYN_ARRAY_DATA_SIZE);
+
+#ifdef UNIV_DEBUG
+ arr->buf_end = 0;
+#endif
+}
+
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ ulint pos) /*!< in: position of element as bytes
+ from array start */
+{
+ dyn_block_t* block;
+ ulint used;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ if (arr->heap != NULL) {
+ used = dyn_block_get_used(block);
+
+ while (pos >= used) {
+ pos -= used;
+ block = UT_LIST_GET_NEXT(list, block);
+ ut_ad(block);
+
+ used = dyn_block_get_used(block);
+ }
+ }
+
+ ut_ad(block);
+ ut_ad(dyn_block_get_used(block) >= pos);
+
+ return(block->data + pos);
+}
+
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+ dyn_array_t* arr) /*!< in: dyn array */
+{
+ dyn_block_t* block;
+ ulint sum = 0;
+
+ ut_ad(arr);
+ ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+ if (arr->heap == NULL) {
+
+ return(arr->used);
+ }
+
+ /* Get the first array block */
+ block = dyn_array_get_first_block(arr);
+
+ while (block != NULL) {
+ sum += dyn_block_get_used(block);
+ block = dyn_array_get_next_block(arr, block);
+ }
+
+ return(sum);
+}
+
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+ dyn_array_t* arr, /*!< in: dyn array */
+ const byte* str, /*!< in: string to write */
+ ulint len) /*!< in: string length */
+{
+ ulint n_copied;
+
+ while (len > 0) {
+ if (len > DYN_ARRAY_DATA_SIZE) {
+ n_copied = DYN_ARRAY_DATA_SIZE;
+ } else {
+ n_copied = len;
+ }
+
+ memcpy(dyn_array_push(arr, n_copied), str, n_copied);
+
+ str += n_copied;
+ len -= n_copied;
+ }
+}
diff --git a/storage/xtradb/include/eval0eval.h b/storage/xtradb/include/eval0eval.h
new file mode 100644
index 00000000000..60aefd8d453
--- /dev/null
+++ b/storage/xtradb/include/eval0eval.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node); /*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node); /*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val); /*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node); /*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len); /*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2); /*!< in: node to copy from */
+/*****************************************************************//**
+Gets an ibool value from a query node.
+@return ibool value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node); /*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return the result of the comparison */
+UNIV_INTERN
+ibool
+eval_cmp(
+/*=====*/
+ func_node_t* cmp_node); /*!< in: comparison node */
+
+
+#ifndef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#endif
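
As a purely illustrative usage sketch (not part of the patch), the usual evaluation pattern for a boolean search condition mirrors the callers in row0sel.c: evaluate the expression, then read the 1-byte result. The function name is hypothetical, and it is assumed that cond is a comparison func_node_t whose operand values have already been set in a query graph.

static ibool
cond_is_true_sketch(func_node_t* cond)
{
	/* eval_exp() dispatches to eval_sym() for symbol nodes and to
	the function evaluator for comparison/function nodes */
	eval_exp(cond);

	/* the comparison result is stored as a 1-byte ibool in the
	node's value field */
	return(eval_node_get_ibool_val(cond));
}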
diff --git a/storage/xtradb/include/eval0eval.ic b/storage/xtradb/include/eval0eval.ic
new file mode 100644
index 00000000000..fe767f39b00
--- /dev/null
+++ b/storage/xtradb/include/eval0eval.ic
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.ic
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+ func_node_t* func_node); /*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size); /*!< in: buffer size */
+
+
+/*****************************************************************//**
+Allocates a new buffer if needed.
+@return pointer to buffer */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+ que_node_t* node, /*!< in: query graph node; sets the val field
+ data field to point to the new buffer, and
+ len field equal to size */
+ ulint size) /*!< in: buffer size */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+ dfield_set_len(dfield, size);
+
+ data = dfield_get_data(dfield);
+
+ if (!data || que_node_get_val_buf_size(node) < size) {
+
+ data = eval_node_alloc_val_buf(node, size);
+ }
+
+ return(data);
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+ sym_node_t* sym_node) /*!< in: symbol table node */
+{
+
+ ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ if (sym_node->indirection) {
+ /* The symbol table node is an alias for a variable or a
+ column */
+
+ dfield_copy_data(que_node_get_val(sym_node),
+ que_node_get_val(sym_node->indirection));
+ }
+}
+
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+ que_node_t* exp_node) /*!< in: expression */
+{
+ if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+
+ eval_sym((sym_node_t*)exp_node);
+
+ return;
+ }
+
+ eval_func(exp_node);
+}
+
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+ que_node_t* node, /*!< in: expression node */
+ lint val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ if (data == NULL) {
+ data = eval_node_alloc_val_buf(node, 4);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ mach_write_to_4(data, (ulint)val);
+}
+
+/*****************************************************************//**
+Gets an integer value from an expression node; the value must not be SQL NULL.
+@return integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+ que_node_t* node) /*!< in: expression node */
+{
+ dfield_t* dfield;
+
+ dfield = que_node_get_val(node);
+
+ ut_ad(dfield_get_len(dfield) == 4);
+
+ return((int)mach_read_from_4(dfield_get_data(dfield)));
+}
+
+/*****************************************************************//**
+Gets an ibool value from a query node.
+@return ibool value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(node);
+
+ data = dfield_get_data(dfield);
+
+ ut_ad(data != NULL);
+
+ return(mach_read_from_1(data));
+}
+
+/*****************************************************************//**
+Sets an ibool value as the value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+ func_node_t* func_node, /*!< in: function node */
+ ibool val) /*!< in: value to set */
+{
+ dfield_t* dfield;
+ byte* data;
+
+ dfield = que_node_get_val(func_node);
+
+ data = dfield_get_data(dfield);
+
+ if (data == NULL) {
+ /* Allocate 1 byte to hold the value */
+
+ data = eval_node_alloc_val_buf(func_node, 1);
+ }
+
+ ut_ad(dfield_get_len(dfield) == 1);
+
+ mach_write_to_1(data, val);
+}
+
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+ que_node_t* node, /*!< in: query graph node */
+ const byte* str, /*!< in: binary string */
+ ulint len) /*!< in: string length or UNIV_SQL_NULL */
+{
+ byte* data;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_len(que_node_get_val(node), len);
+
+ return;
+ }
+
+ data = eval_node_ensure_val_buf(node, len);
+
+ ut_memcpy(data, str, len);
+}
+
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+ que_node_t* node1, /*!< in: node to copy to */
+ que_node_t* node2) /*!< in: node to copy from */
+{
+ dfield_t* dfield2;
+
+ dfield2 = que_node_get_val(node2);
+
+ eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2),
+ dfield_get_len(dfield2));
+}
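
Another illustrative sketch, assuming two previously unused que_node_t* values from the same query graph (the names node and dest, and the helper itself, are hypothetical): it shows the 4-byte integer convention and the UNIV_SQL_NULL convention used by the helpers above.

static void
eval_value_sketch(que_node_t* node, que_node_t* dest)
{
	lint	val;

	/* integers are stored as 4-byte machine-format values; a buffer
	is allocated on first use */
	eval_node_set_int_val(node, 42);
	val = eval_node_get_int_val(node);

	/* passing UNIV_SQL_NULL as the length marks the value SQL NULL;
	no buffer is allocated or copied in that case */
	eval_node_copy_and_alloc_val(dest, NULL, UNIV_SQL_NULL);

	/* copy the (4-byte) value and its length from node to dest */
	eval_node_copy_val(dest, node);

	ut_a(val == 42);
}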
diff --git a/storage/xtradb/include/eval0proc.h b/storage/xtradb/include/eval0proc.h
new file mode 100644
index 00000000000..13e2e365320
--- /dev/null
+++ b/storage/xtradb/include/eval0proc.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/eval0proc.ic b/storage/xtradb/include/eval0proc.ic
new file mode 100644
index 00000000000..c602af0a694
--- /dev/null
+++ b/storage/xtradb/include/eval0proc.ic
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.ic
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ proc_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ /* Start execution from the first statement in the statement
+ list */
+
+ thr->run_node = node->stat_list;
+ } else {
+ /* Move to the next statement */
+ ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+ thr->run_node = NULL;
+ }
+
+ if (thr->run_node == NULL) {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ func_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ /* Evaluate the procedure */
+
+ eval_exp(node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
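
The two functions above follow the general step-function contract of the query graph interpreter: inspect thr->run_node, advance it (eventually back to the parent node), and return the thread to run next. A minimal, hypothetical illustration of that contract follows; it is not an actual node type in this patch.

static que_thr_t*
nop_step_sketch(
	que_thr_t*	thr)	/* in: query thread */
{
	que_node_t*	node;

	ut_ad(thr);

	node = thr->run_node;

	/* nothing to evaluate for this hypothetical node type;
	hand control straight back to the parent node */
	thr->run_node = que_node_get_parent(node);

	return(thr);
}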
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
new file mode 100644
index 00000000000..07c80ef8609
--- /dev/null
+++ b/storage/xtradb/include/fil0fil.h
@@ -0,0 +1,766 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "os0file.h"
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char* fil_path_to_mysql_datadir;
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE 4
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define FIL_NULL ULINT32_UNDEFINED
+
+/* Space address data type; this is intended to be used when
+addresses accurate to a byte are stored in file pages. If the page part
+of the address is FIL_NULL, the address is considered undefined. */
+
+typedef byte fil_faddr_t; /*!< 'type' definition in C: an address
+ stored in a file page is a string of bytes */
+#define FIL_ADDR_PAGE 0 /* first in address is the page offset */
+#define FIL_ADDR_BYTE 4 /* then comes 2-byte byte offset within page*/
+
+#define FIL_ADDR_SIZE 6 /* address size is 6 bytes */
+
+/** A struct for storing a space address FIL_ADDR, when it is used
+in C program data structures. */
+
+typedef struct fil_addr_struct fil_addr_t;
+/** File space address */
+struct fil_addr_struct{
+ ulint page; /*!< page number within a space */
+ ulint boffset; /*!< byte offset within the page */
+};
+
+/** The null file address */
+extern fil_addr_t fil_addr_null;
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0 /*!< in < MySQL-4.0.14 space id the
+ page belongs to (== 0) but in later
+ versions the 'new' checksum of the
+ page */
+#define FIL_PAGE_OFFSET 4 /*!< page offset inside space */
+#define FIL_PAGE_PREV 8 /*!< if there is a 'natural'
+ predecessor of the page, its
+ offset. Otherwise FIL_NULL.
+ This field is not set on BLOB
+ pages, which are stored as a
+ singly-linked list. See also
+ FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT 12 /*!< if there is a 'natural' successor
+ of the page, its offset.
+ Otherwise FIL_NULL.
+ B-tree index pages
+ (FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+ on the same PAGE_LEVEL are maintained
+ as a doubly linked list via
+ FIL_PAGE_PREV and FIL_PAGE_NEXT
+ in the collation order of the
+ smallest user record on each page. */
+#define FIL_PAGE_LSN 16 /*!< lsn of the end of the newest
+ modification log record to the page */
+#define FIL_PAGE_TYPE 24 /*!< file page type: FIL_PAGE_INDEX,...,
+ 2 bytes.
+
+ The contents of this field can only
+ be trusted in the following case:
+ if the page is an uncompressed
+ B-tree index page, then it is
+ guaranteed that the value is
+ FIL_PAGE_INDEX.
+ The opposite does not hold.
+
+ In tablespaces created by
+ MySQL/InnoDB 5.1.7 or later, the
+ contents of this field is valid
+ for all uncompressed pages. */
+#define FIL_PAGE_FILE_FLUSH_LSN 26 /*!< this is only defined for the
+ first page in a system tablespace
+ data file (ibdata*, not *.ibd):
+ the file has been flushed to disk
+ at least up to this lsn */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
+ contains the space id of the page */
+#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
+#define FIL_PAGE_DATA_ALIGN_32 40
+/* @} */
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
+ to store the page checksum, the
+ last 4 bytes should be identical
+ to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END 8 /*!< size of the page trailer */
+/* @} */
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+#define FIL_PAGE_INDEX 17855 /*!< B-tree node */
+#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */
+#define FIL_PAGE_INODE 3 /*!< Index node */
+#define FIL_PAGE_IBUF_FREE_LIST 4 /*!< Insert buffer free list */
+/* File page types introduced in MySQL/InnoDB 5.1.7 */
+#define FIL_PAGE_TYPE_ALLOCATED 0 /*!< Freshly allocated page */
+#define FIL_PAGE_IBUF_BITMAP 5 /*!< Insert buffer bitmap */
+#define FIL_PAGE_TYPE_SYS 6 /*!< System page */
+#define FIL_PAGE_TYPE_TRX_SYS 7 /*!< Transaction system data */
+#define FIL_PAGE_TYPE_FSP_HDR 8 /*!< File space header */
+#define FIL_PAGE_TYPE_XDES 9 /*!< Extent descriptor page */
+#define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */
+/* @} */
+
+/** Space types @{ */
+#define FIL_TABLESPACE 501 /*!< tablespace */
+#define FIL_LOG 502 /*!< redo log */
+/* @} */
+
+/** The number of fsyncs done to the log */
+extern ulint fil_n_log_flushes;
+
+/** Number of pending redo log flushes */
+extern ulint fil_n_pending_log_flushes;
+/** Number of pending tablespace flushes */
+extern ulint fil_n_pending_tablespace_flushes;
+
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the latch of a file space.
+@return latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+ ulint id, /*!< in: space id */
+ ulint* zip_size);/*!< out: compressed page size, or
+ 0 for uncompressed tablespaces */
+/*******************************************************************//**
+Returns the type of a file space.
+@return FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+ const char* name, /*!< in: file name (file must be closed) */
+ ulint size, /*!< in: file size in database blocks, rounded
+ downwards to an integer */
+ ulint id, /*!< in: space id where to append */
+ ibool is_raw);/*!< in: TRUE if a raw device or
+ a raw disk partition */
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+ ulint id, /*!< in: space id */
+ ulint trunc_len); /*!< in: truncate by this much; it is an error
+ if this does not equal the combined size of
+ some initial files in the space */
+#endif /* UNIV_LOG_ARCHIVE */
+/*******************************************************************//**
+Creates a space memory object and puts it to the 'fil system' hash table. If
+there is an error, prints an error message to the .err log.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+ const char* name, /*!< in: space name */
+ ulint id, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size, or
+ 0 for uncompressed tablespaces */
+ ulint purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle ids.
+@return TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+ ulint* space_id); /*!< in/out: space id */
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
+@return flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return compressed page size, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Checks if the pair (space, page_no) refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache.
+@return TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+/****************************************************************//**
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+ ulint hash_size, /*!< in: hash table size */
+ ulint max_n_open); /*!< in: max number of open files */
+/*******************************************************************//**
+Frees the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_close(void);
+/*===========*/
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void);
+/*==========================================*/
+/*******************************************************************//**
+Closes all open files. There must not be any pending i/o's or unflushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void);
+/*=====================*/
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+ ulint max_id);/*!< in: maximum known id */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+ ib_uint64_t lsn, /*!< in: lsn to write */
+ ulint arch_log_no); /*!< in: latest archived log
+ file number */
+/*******************************************************************//**
+Reads the flushed lsn and arch no fields from a data file at database
+startup. */
+UNIV_INTERN
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+ os_file_t data_file, /*!< in: open data file */
+ ibool one_read_already, /*!< in: TRUE if min and max
+ parameters below already
+ contain sensible data */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no, /*!< in/out: */
+ ulint* max_arch_log_no, /*!< in/out: */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t* min_flushed_lsn, /*!< in/out: */
+ ib_uint64_t* max_flushed_lsn); /*!< in/out: */
+/*******************************************************************//**
+Increments the count of pending insert buffer page merges, if space is not
+being deleted.
+@return TRUE if being deleted, and ibuf merges should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Decrements the count of pending insert buffer page merges. */
+UNIV_INTERN
+void
+fil_decr_pending_ibuf_merges(
+/*=========================*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+ byte* ptr, /*!< in: buffer containing the log record body,
+ or an initial segment of it, if the record does
+ not fit completely between ptr and end_ptr */
+ byte* end_ptr, /*!< in: buffer end */
+ ulint type, /*!< in: the type of this log record */
+ ulint space_id, /*!< in: the space id of the tablespace in
+ question, or 0 if the log record should
+ only be parsed but not replayed */
+ ulint log_flags); /*!< in: redo log flags
+ (stored in the page number parameter) */
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_discard_tablespace(
+/*===================*/
+ ulint id); /*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+ const char* old_name, /*!< in: old table name in the standard
+ databasename/tablename format of
+ InnoDB, or NULL if we do the rename
+ based on the space id only */
+ ulint id, /*!< in: space id */
+ const char* new_name); /*!< in: new table name in the standard
+ databasename/tablename format
+ of InnoDB */
+
+/*******************************************************************//**
+Creates a new single-table tablespace in a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it simply by the
+path '.'. Tables created with CREATE TEMPORARY TABLE are placed in the temp
+dir of the mysqld server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fil_create_new_single_table_tablespace(
+/*===================================*/
+ ulint space_id, /*!< in: space id */
+ const char* tablename, /*!< in: the table name in the usual
+ databasename/tablename format
+ of InnoDB, or a dir path to a temp
+ table */
+ ibool is_temp, /*!< in: TRUE if a table created with
+ CREATE TEMPORARY TABLE */
+ ulint flags, /*!< in: tablespace flags */
+ ulint size); /*!< in: the initial size of the
+ tablespace file in pages,
+ must be >= FIL_IBD_FILE_INITIAL_SIZE */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks that the space id
+in it is correct. If this does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+ ibool check_space_id, /*!< in: should we check that the space
+ id in the file is right; we assume
+ that this function runs much faster
+ if no check is made, since accessing
+ the file inode probably is much
+ faster (the OS caches them) than
+ accessing the first page of the file */
+ ulint id, /*!< in: space id */
+ ulint flags, /*!< in: tablespace flags */
+ const char* name); /*!< in: table name in the
+ databasename/tablename format */
+/********************************************************************//**
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that at
+shutdown it stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN field in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+ const char* name, /*!< in: table name in the
+ databasename/tablename format */
+ ib_uint64_t current_lsn); /*!< in: reset lsn's if the lsn stamped
+ to FIL_PAGE_FILE_FLUSH_LSN in the
+ first page is too high */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return TRUE if it does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+ ulint id, /*!< in: space id */
+ ib_int64_t version);/*!< in: tablespace_version should be this; if
+ you pass -1 as the value of this, then this
+ parameter is ignored */
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+ ulint id); /*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+ ulint id, /*!< in: space id */
+ const char* name, /*!< in: table name in the standard
+ 'databasename/tablename' format or
+ the dir path to a temp table */
+ ibool is_temp, /*!< in: TRUE if created with CREATE
+ TEMPORARY TABLE */
+ ibool mark_space, /*!< in: in crash recovery, at database
+ startup we mark all spaces which have
+ an associated table in the InnoDB
+ data dictionary, so that
+ we can print a warning about orphaned
+ tablespaces */
+ ibool print_error_if_does_not_exist);
+ /*!< in: print detailed error
+ information to the .err log if a
+ matching tablespace is not found from
+ memory */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+ ulint* actual_size, /*!< out: size of the space after extension;
+ if we ran out of disk space this may be lower
+ than the desired size */
+ ulint space_id, /*!< in: space id */
+ ulint size_after_extend);/*!< in: desired size in pages after the
+ extension; if the current space size is bigger
+ than this already, the function does nothing */
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_free_now, /*!< in: number of free extents now */
+ ulint n_to_reserve); /*!< in: how many one wants to reserve */
+/*******************************************************************//**
+Releases free extents in a file space. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+ ulint id, /*!< in: space id */
+ ulint n_reserved); /*!< in: how many one reserved */
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+ ulint id); /*!< in: space id */
+/********************************************************************//**
+Reads or writes data. This operation is asynchronous (aio).
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \
+ _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL)
+
+UNIV_INTERN
+ulint
+_fil_io(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE,
+ ORed to OS_FILE_LOG, if a log i/o
+ and ORed to OS_AIO_SIMULATED_WAKE_LATER
+ if simulated aio and we want to post a
+ batch of i/os; NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ ibool sync, /*!< in: TRUE if synchronous aio is desired */
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len, /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+ void* buf, /*!< in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+ void* message, /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+ trx_t* trx);
+/********************************************************************//**
+Confirms whether the given parameters are valid.
+@return TRUE if valid */
+UNIV_INTERN
+ibool
+fil_area_is_exist(
+/*==============*/
+ ulint space_id, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint block_offset, /*!< in: offset in number of blocks */
+ ulint byte_offset, /*!< in: remainder of offset in bytes; in
+ aio this must be divisible by the OS block
+ size */
+ ulint len); /*!< in: how many bytes to read or write; this
+ must not cross a file boundary; in aio this
+ must be a block size multiple */
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is used to write the
+handler for completed requests. The aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. */
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+ ulint segment); /*!< in: the number of the segment in the aio
+ array to wait for */
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+ ulint space_id); /*!< in: file space id (this can be a group of
+ log files or a tablespace of the database) */
+/**********************************************************************//**
+Flushes to disk writes in file spaces of the given type possibly cached by
+the OS. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+ ulint purpose); /*!< in: FIL_TABLESPACE, FIL_LOG */
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void);
+/*==============*/
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+ fil_addr_t addr); /*!< in: address */
+/********************************************************************//**
+Get the predecessor of a file page.
+@return FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+ const byte* page); /*!< in: file page */
+/********************************************************************//**
+Get the successor of a file page.
+@return FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+ const byte* page); /*!< in: file page */
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+ byte* page, /*!< in/out: file page */
+ ulint type); /*!< in: type */
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to the page, the
+return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+ const byte* page); /*!< in: file page */
+
+/*************************************************************************
+Returns local hash table information. */
+
+ulint
+fil_system_hash_cells(void);
+/*========================*/
+
+ulint
+fil_system_hash_nodes(void);
+/*========================*/
+
+/*************************************************************************
+Functions to access the is_corrupt flag of fil_space_t. */
+
+ibool
+fil_space_is_corrupt(
+/*=================*/
+ ulint space_id);
+
+void
+fil_space_set_corrupt(
+/*==================*/
+ ulint space_id);
+
+typedef struct fil_space_struct fil_space_t;
+
+#endif
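
For illustration, a sketch of how the FIL_PAGE_* offsets and the page accessors declared above are typically combined when inspecting a page frame in memory; mach_read_from_4() from mach0data.h is assumed to be available, and the function name is hypothetical:

static void
fil_page_header_sketch(const byte* page)
{
	ulint	page_no;
	ulint	space_id;

	/* the page number and (since 4.1.x) the space id live in the
	page header at fixed offsets */
	page_no	 = mach_read_from_4(page + FIL_PAGE_OFFSET);
	space_id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);

	if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
		/* B-tree pages on one level are chained through
		FIL_PAGE_PREV / FIL_PAGE_NEXT; FIL_NULL marks an end */
		if (fil_page_get_prev(page) == FIL_NULL) {
			/* leftmost page on its level */
		}
	}

	(void) page_no;
	(void) space_id;
}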
diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h
new file mode 100644
index 00000000000..7abd3914eda
--- /dev/null
+++ b/storage/xtradb/include/fsp0fsp.h
@@ -0,0 +1,359 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "fsp0types.h"
+
+/**********************************************************************//**
+Initializes the file space system. */
+UNIV_INTERN
+void
+fsp_init(void);
+/*==========*/
+/**********************************************************************//**
+Gets the current free limit of the system tablespace. The free limit
+means the place of the first page which has never been put to the
+free list for allocation. The space above that address is initialized
+to zero. Also sets the global variable log_fsp_current_free_limit.
+@return free limit in megabytes */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void);
+/*===========================*/
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header. If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files. If there is an auto-extending data file,
+this can be smaller.
+@return size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void);
+/*================================*/
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+ page_t* page); /*!< in: header page (page 0 in the tablespace) */
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+/**********************************************************************//**
+Writes the space id and compressed page size to a tablespace header.
+This function is used, bypassing the buffer pool, when we create
+a new single-table tablespace in fil0fil.c. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+ page_t* page, /*!< in/out: first page in the space */
+ ulint space_id, /*!< in: space id */
+ ulint flags); /*!< in: tablespace flags (FSP_SPACE_FLAGS):
+ 0, or table->flags if newer than COMPACT */
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint size, /*!< in: current size in blocks */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint size_inc,/*!< in: size increment in pages */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, or NULL
+if the segment could not be created because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, or NULL
+if the segment could not be created because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page where the segment header is placed: if
+ this is != 0, the page must belong to another segment,
+ if this is 0, a new page will be allocated and it
+ will belong to the created segment */
+ ulint byte_offset, /*!< in: byte offset of the created segment header
+ on the page */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has already
+ done the reservation for the pages with
+ fsp_reserve_free_extents (at least 2 extents: one for
+ the inode and the other for the segment) then there is
+ no need to do the check for this individual
+ operation */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+ fseg_header_t* header, /*!< in: segment header */
+ ulint* used, /*!< out: number of pages used (<= reserved) */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation.
+@return the allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction, /*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+ fseg_header_t* seg_header,/*!< in: segment header */
+ ulint hint, /*!< in: hint of which page would be desirable */
+ byte direction,/*!< in: if the new page is needed because
+ of an index page split, and records are
+ inserted there in order, into which
+ direction they go alphabetically: FSP_DOWN,
+ FSP_UP, FSP_NO_DIR */
+ ibool has_done_reservation, /*!< in: TRUE if the caller has
+ already done the reservation for the page
+ with fsp_reserve_free_extents, then there
+ is no need to do the check for this individual
+ page */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid a dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
+@return TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+ ulint* n_reserved,/*!< out: number of extents actually reserved; if we
+ return TRUE and the tablespace size is < 64 pages,
+ then this can be 0, otherwise it is n_ext */
+ ulint space, /*!< in: space id */
+ ulint n_ext, /*!< in: number of extents to reserve */
+ ulint alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+ mtr_t* mtr); /*!< in: mtr */
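+/* Illustrative usage sketch, not part of the original header: a caller
+that may need several pages (for example for a B-tree page split) first
+reserves extents, does its work, and then releases the reservation with
+fil_space_release_free_extents() (declared in fil0fil.h; its exact
+signature is assumed here).  Variable names are hypothetical.
+
+	ulint	n_reserved;
+
+	if (fsp_reserve_free_extents(&n_reserved, space, 2,
+				     FSP_NORMAL, mtr)) {
+
+		... perform the multi-page operation ...
+
+		fil_space_release_free_extents(space, n_reserved);
+	}
+*/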
+/**********************************************************************//**
+This function should be used to get information on how much new data we
+will still be able to insert into the database without running out of
+tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
+@return available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+ ulint space); /*!< in: space id */
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+ fseg_header_t* seg_header, /*!< in: segment header */
+ ulint space, /*!< in: space id */
+ ulint page, /*!< in: page offset */
+ mtr_t* mtr); /*!< in: mtr handle */
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment
+by calling it repeatedly in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction.
+@return TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+ fseg_header_t* header, /*!< in, own: segment header; NOTE: if the header
+ resides on the first page of the frag list
+ of the segment, this pointer becomes obsolete
+ after the last freeing step */
+ mtr_t* mtr); /*!< in: mtr */
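+/* Illustrative sketch, not part of the original header: freeing a whole
+segment with repeated mini-transactions, as described above.  The mtr
+calls are from mtr0mtr.h; how "header" is re-located under each new mtr
+is left out.
+
+	ibool	finished;
+
+	do {
+		mtr_start(&mtr);
+		... re-locate the segment header page under the new mtr ...
+		finished = fseg_free_step(header, &mtr);
+		mtr_commit(&mtr);
+	} while (!finished);
+*/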
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+ fseg_header_t* header, /*!< in: segment header which must reside on
+ the first fragment page of the segment */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr, /*!< in: buffer end */
+ buf_block_t* block); /*!< in: block or NULL */
+/*******************************************************************//**
+Validates the file space system and its segments.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+ ulint space); /*!< in: space id */
+/*******************************************************************//**
+Prints info of a file space. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+ ulint space); /*!< in: space id */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+ fseg_header_t* header, /*!< in: segment header */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* UNIV_BTR_PRINT */
+
+#ifndef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic
new file mode 100644
index 00000000000..434c370b527
--- /dev/null
+++ b/storage/xtradb/include/fsp0fsp.ic
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.ic
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
+ == FSP_XDES_OFFSET));
+ }
+
+ return(UNIV_UNLIKELY((page_no & (zip_size - 1)) == FSP_XDES_OFFSET));
+}
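+
+/* Illustrative values, not part of the original file: with uncompressed
+16 KiB pages (UNIV_PAGE_SIZE == 16384) and FSP_XDES_OFFSET == 0, page
+numbers 0, 16384, 32768, ... are extent descriptor pages, so
+fsp_descr_page(0, 16384) returns TRUE and fsp_descr_page(0, 16385)
+returns FALSE. */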
diff --git a/storage/xtradb/include/fsp0types.h b/storage/xtradb/include/fsp0types.h
new file mode 100644
index 00000000000..2dd2deca671
--- /dev/null
+++ b/storage/xtradb/include/fsp0types.h
@@ -0,0 +1,110 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#ifndef fsp0types_h
+#define fsp0types_h
+
+#include "univ.i"
+
+#include "fil0fil.h" /* for FIL_PAGE_DATA */
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page) */
+/* @{ */
+#define FSP_UP ((byte)111) /*!< alphabetically upwards */
+#define FSP_DOWN ((byte)112) /*!< alphabetically downwards */
+#define FSP_NO_DIR ((byte)113) /*!< no order */
+/* @} */
+
+/** File space extent size (one megabyte) in pages */
+#define FSP_EXTENT_SIZE (1u << (20 - UNIV_PAGE_SIZE_SHIFT))
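+/* Example, not part of the original header: with the default 16 KiB
+pages (UNIV_PAGE_SIZE_SHIFT == 14) this evaluates to
+1 << (20 - 14) == 64 pages, i.e. 64 * 16 KiB == 1 MiB per extent. */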
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef byte fseg_header_t;
+
+#define FSEG_HDR_SPACE 0 /*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO 4 /*!< page number of the inode */
+#define FSEG_HDR_OFFSET 8 /*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE 10 /*!< Length of the file segment
+ header, in bytes */
+/* @} */
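+
+/* Illustrative sketch, not part of the original header: decoding the
+three fields of a file segment header stored at "hdr", assuming the
+mach_read_from_4()/mach_read_from_2() accessors from mach0data.h.
+
+	space   = mach_read_from_4(hdr + FSEG_HDR_SPACE);
+	page_no = mach_read_from_4(hdr + FSEG_HDR_PAGE_NO);
+	offset  = mach_read_from_2(hdr + FSEG_HDR_OFFSET);
+*/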
+
+/** Flags for fsp_reserve_free_extents @{ */
+#define FSP_NORMAL 1000000
+#define FSP_UNDO 2000000
+#define FSP_CLEANING 3000000
+/* @} */
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE UNIV_PAGE_SIZE */
+/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET 0 /*!< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET 1 /*!< insert buffer bitmap */
+ /* The ibuf bitmap pages are the ones whose
+ page number is the number above plus a
+ multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO 2 /*!< in every tablespace */
+ /* The following pages exist
+ in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO 3 /*!< insert buffer
+ header page, in
+ tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO 4 /*!< insert buffer
+ B-tree root page in
+ tablespace 0 */
+ /* The ibuf tree root page number in
+ tablespace 0; its fseg inode is on the page
+ number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO 5 /*!< transaction
+ system header, in
+ tablespace 0 */
+#define FSP_FIRST_RSEG_PAGE_NO 6 /*!< first rollback segment
+ page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO 7 /*!< data dictionary header
+ page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+#endif /* fsp0types_h */
diff --git a/storage/xtradb/include/fut0fut.h b/storage/xtradb/include/fut0fut.h
new file mode 100644
index 00000000000..dce20b3bad6
--- /dev/null
+++ b/storage/xtradb/include/fut0fut.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.h
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+
+#ifndef fut0fut_h
+#define fut0fut_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame; the file page in the frame is
+bufferfixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t addr, /*!< in: file address */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr handle */
+
+#ifndef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
+#endif
+
diff --git a/storage/xtradb/include/fut0fut.ic b/storage/xtradb/include/fut0fut.ic
new file mode 100644
index 00000000000..529f2a516d3
--- /dev/null
+++ b/storage/xtradb/include/fut0fut.ic
@@ -0,0 +1,63 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.ic
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "buf0buf.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame; the file page in the frame is
+bufferfixed and latched */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ fil_addr_t addr, /*!< in: file address */
+ ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr handle */
+{
+ buf_block_t* block;
+ byte* ptr;
+
+ ut_ad(addr.boffset < UNIV_PAGE_SIZE);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+ block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr);
+
+ if (srv_pass_corrupt_table && !block) {
+ return(0);
+ }
+ ut_a(block);
+
+ ptr = buf_block_get_frame(block) + addr.boffset;
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ return(ptr);
+}
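+
+/* Note added for clarity, not part of the original file: when the
+XtraDB option srv_pass_corrupt_table is set and the page cannot be
+read, fut_get_ptr() returns NULL instead of asserting, so callers in
+that configuration must be prepared to handle a NULL return. */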
diff --git a/storage/xtradb/include/fut0lst.h b/storage/xtradb/include/fut0lst.h
new file mode 100644
index 00000000000..fe024c2498f
--- /dev/null
+++ b/storage/xtradb/include/fut0lst.h
@@ -0,0 +1,217 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef fut0lst_h
+#define fut0lst_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+
+/* The C 'types' of base node and list node: these should be used to
+write self-documenting code. Of course, the sizeof macro cannot be
+applied to these types! */
+
+typedef byte flst_base_node_t;
+typedef byte flst_node_t;
+
+/* The physical size of a list base node in bytes */
+#define FLST_BASE_NODE_SIZE (4 + 2 * FIL_ADDR_SIZE)
+
+/* The physical size of a list node in bytes */
+#define FLST_NODE_SIZE (2 * FIL_ADDR_SIZE)
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the last node in a list. */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the first node in a list. */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node after another in a list. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node1, /*!< in: node to insert after */
+ flst_node_t* node2, /*!< in: node to add */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node before another in a list. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to insert */
+ flst_node_t* node3, /*!< in: node to insert before */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Removes a node. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: node to remove */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove,
+ must be >= 1 */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+ flst_base_node_t* base, /*!< in: pointer to base node of list */
+ flst_node_t* node2, /*!< in: first node not to remove */
+ ulint n_nodes,/*!< in: number of nodes to remove */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list length.
+@return length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list first node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list last node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list next node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list prev node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+ fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ fil_addr_t addr, /*!< in: file address */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Reads a file address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ const fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************************//**
+Validates a file-based list.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr1); /*!< in: mtr */
+/********************************************************************//**
+Prints info of a file-based list. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+ const flst_base_node_t* base, /*!< in: pointer to base node of list */
+ mtr_t* mtr); /*!< in: mtr */
+
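+/* Illustrative traversal sketch, not part of the original header:
+walking a file-based list whose base node is at "base".  fut_get_ptr()
+is declared in fut0fut.h and fil_addr_is_null() in fil0fil.h (assumed);
+variable names are hypothetical.
+
+	node_addr = flst_get_first(base, mtr);
+
+	while (!fil_addr_is_null(node_addr)) {
+		node = fut_get_ptr(space, zip_size, node_addr,
+				   RW_X_LATCH, mtr);
+		... process the list node ...
+		node_addr = flst_get_next_addr(node, mtr);
+	}
+*/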
+
+#ifndef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/fut0lst.ic b/storage/xtradb/include/fut0lst.ic
new file mode 100644
index 00000000000..dcd13c61871
--- /dev/null
+++ b/storage/xtradb/include/fut0lst.ic
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.ic
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+#include "mtr0log.h"
+#include "buf0buf.h"
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV 0 /* 6-byte address of the previous list element;
+ the page part of address is FIL_NULL, if no
+ previous element */
+#define FLST_NEXT FIL_ADDR_SIZE /* 6-byte address of the next
+ list element; the page part of address
+ is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN 0 /* 32-bit list length field */
+#define FLST_FIRST 4 /* 6-byte address of the first element
+ of the list; undefined if empty list */
+#define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the
+ last element of the list; undefined
+ if empty list */
+
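+/* Layout summary added for clarity, not part of the original file:
+given the 6-byte file addresses (FIL_ADDR_SIZE), a list base node
+occupies FLST_BASE_NODE_SIZE == 4 + 2 * 6 == 16 bytes (length, first
+address, last address), and a list node occupies FLST_NODE_SIZE ==
+2 * 6 == 12 bytes (prev and next addresses). */
+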
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+ fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ fil_addr_t addr, /*!< in: file address */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(faddr && mtr);
+ ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX));
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+
+ mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr);
+ mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset,
+ MLOG_2BYTES, mtr);
+}
+
+/********************************************************************//**
+Reads a file address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+ const fil_faddr_t* faddr, /*!< in: pointer to file faddress */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ fil_addr_t addr;
+
+ ut_ad(faddr && mtr);
+
+ addr.page = mtr_read_ulint(faddr + FIL_ADDR_PAGE, MLOG_4BYTES, mtr);
+ addr.boffset = mtr_read_ulint(faddr + FIL_ADDR_BYTE, MLOG_2BYTES,
+ mtr);
+ ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+ ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+ return(addr);
+}
+
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+ flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+
+ mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr);
+ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+ flst_write_addr(base + FLST_LAST, fil_addr_null, mtr);
+}
+
+/********************************************************************//**
+Gets list length.
+@return length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(mtr_read_ulint(base + FLST_LEN, MLOG_4BYTES, mtr));
+}
+
+/********************************************************************//**
+Gets list first node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_FIRST, mtr));
+}
+
+/********************************************************************//**
+Gets list last node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+ const flst_base_node_t* base, /*!< in: pointer to base node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(base + FLST_LAST, mtr));
+}
+
+/********************************************************************//**
+Gets list next node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_NEXT, mtr));
+}
+
+/********************************************************************//**
+Gets list prev node address.
+@return file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+ const flst_node_t* node, /*!< in: pointer to node */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ return(flst_read_addr(node + FLST_PREV, mtr));
+}
diff --git a/storage/xtradb/include/ha0ha.h b/storage/xtradb/include/ha0ha.h
new file mode 100644
index 00000000000..3299000bf3c
--- /dev/null
+++ b/storage/xtradb/include/ha0ha.h
@@ -0,0 +1,243 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "univ.i"
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: folded value of the searched data */
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and updates
+the pointer to data if found. */
+UNIV_INTERN
+void
+ha_search_and_update_if_found_func(
+/*===============================*/
+ hash_table_t* table, /*!< in/out: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data, /*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* new_data);/*!< in: new pointer to the data */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table in/out: hash table
+@param fold in: folded value of the searched data
+@param data in: pointer to the data
+@param new_block in: block containing new_data
+@param new_data in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found_func(table,fold,data,new_block,new_data)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table in/out: hash table
+@param fold in: folded value of the searched data
+@param data in: pointer to the data
+@param new_block ignored: block containing new_data
+@param new_data in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+ ha_search_and_update_if_found_func(table,fold,data,new_data)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/*************************************************************//**
+Creates a hash table with at least n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+ ulint n, /*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+ ulint mutex_level, /*!< in: level of the mutexes in the latching
+ order: this is used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes); /*!< in: number of mutexes to protect the
+ hash table: must be a power of 2, or 0 */
+#ifdef UNIV_SYNC_DEBUG
+/** Creates a hash table.
+@return own: created table
+@param n_c in: number of array cells. The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level in: level of the mutexes in the latching order
+@param n_m in: number of mutexes to protect the hash table;
+ must be a power of 2, or 0 */
+# define ha_create(n_c,n_m,level) ha_create_func(n_c,level,n_m)
+#else /* UNIV_SYNC_DEBUG */
+/** Creates a hash table.
+@return own: created table
+@param n_c in: number of array cells. The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level in: level of the mutexes in the latching order
+@param n_m in: number of mutexes to protect the hash table;
+ must be a power of 2, or 0 */
+# define ha_create(n_c,n_m,level) ha_create_func(n_c,n_m)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+ hash_table_t* table); /*!< in, own: hash table */
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeeded, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of data; if a node with
+ the same fold value already exists, it is
+ updated to point to the same data, and no new
+ node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data); /*!< in: data, must not be NULL */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeeded, FALSE if no more memory could be allocated
+@param t in: hash table
+@param f in: folded value of data
+@param b in: buffer block containing the data
+@param d in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return TRUE if succeeded, FALSE if no more memory could be allocated
+@param t in: hash table
+@param f in: folded value of data
+@param b ignored: buffer block containing the data
+@param d in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
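+/* Illustrative sketch, not part of the original header: inserting a
+record pointer under its fold value and looking it up again.  In debug
+builds the owning buffer block must also be passed; names below are
+hypothetical.
+
+	ha_insert_for_fold(table, fold, block, rec);
+
+	data = ha_search_and_get_data(table, fold);
+	ut_ad(data == rec);
+*/
+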
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and deletes
+it from the hash table if found.
+@return TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data); /*!< in: pointer to the data */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: fold value */
+ const page_t* page); /*!< in: buffer page */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/*************************************************************//**
+Validates a given range of the cells in hash table.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint start_index, /*!< in: start index */
+ ulint end_index); /*!< in: end index */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+/*************************************************************//**
+Prints info of a hash table. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ hash_table_t* table); /*!< in: hash table */
+#endif /* !UNIV_HOTBACKUP */
+
+/** The hash table external chain node */
+typedef struct ha_node_struct ha_node_t;
+
+/** The hash table external chain node */
+struct ha_node_struct {
+ ha_node_t* next; /*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block; /*!< buffer block containing the data, or NULL */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data; /*!< pointer to the data */
+ ulint fold; /*!< fold value for the data */
+};
+
+#ifndef UNIV_HOTBACKUP
+/** Assert that the current thread is holding the mutex protecting a
+hash bucket corresponding to a fold value.
+@param table in: hash table
+@param fold in: fold value */
+# define ASSERT_HASH_MUTEX_OWN(table, fold) \
+ ut_ad(!(table)->mutexes || mutex_own(hash_get_mutex(table, fold)))
+#else /* !UNIV_HOTBACKUP */
+/** Assert that the current thread is holding the mutex protecting a
+hash bucket corresponding to a fold value.
+@param table in: hash table
+@param fold in: fold value */
+# define ASSERT_HASH_MUTEX_OWN(table, fold) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ha0ha.ic b/storage/xtradb/include/ha0ha.ic
new file mode 100644
index 00000000000..734403c4cd9
--- /dev/null
+++ b/storage/xtradb/include/ha0ha.ic
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.ic
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+#include "mem0mem.h"
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ha_node_t* del_node); /*!< in: node to be deleted */
+
+/******************************************************************//**
+Gets a hash node data.
+@return pointer to the data */
+UNIV_INLINE
+void*
+ha_node_get_data(
+/*=============*/
+ ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->data);
+}
+
+/******************************************************************//**
+Sets hash node data. */
+UNIV_INLINE
+void
+ha_node_set_data_func(
+/*==================*/
+ ha_node_t* node, /*!< in: hash chain node */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ buf_block_t* block, /*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ void* data) /*!< in: pointer to the data */
+{
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+ node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ node->data = data;
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data.
+@param n in: hash chain node
+@param b in: buffer block containing the data
+@param d in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/******************************************************************//**
+Gets the next node in a hash chain.
+@return next node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+ ha_node_t* node) /*!< in: hash chain node */
+{
+ return(node->next);
+}
+
+/******************************************************************//**
+Gets the first node in a hash chain.
+@return first node, NULL if none */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold value determining the chain */
+{
+ return((ha_node_t*)
+ hash_get_nth_cell(table, hash_calc_hash(fold, table))->node);
+}
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the first hash table node in chain having the fold
+number, NULL if not found */
+UNIV_INLINE
+ha_node_t*
+ha_search(
+/*======*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: folded value of the searched data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->fold == fold) {
+
+ return(node->data);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data.
+@return pointer to the hash table node, NULL if not found in the table */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_chain_get_first(table, fold);
+
+ while (node) {
+ if (node->data == data) {
+
+ return(node);
+ }
+
+ node = ha_chain_get_next(node);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data, and deletes
+it from the hash table, if found.
+@return TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold, /*!< in: folded value of the searched data */
+ void* data) /*!< in: pointer to the data */
+{
+ ha_node_t* node;
+
+ ASSERT_HASH_MUTEX_OWN(table, fold);
+
+ node = ha_search_with_data(table, fold, data);
+
+ if (node) {
+ ha_delete_hash_node(table, node);
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/storage/xtradb/include/ha0storage.h b/storage/xtradb/include/ha0storage.h
new file mode 100644
index 00000000000..c30bd840579
--- /dev/null
+++ b/storage/xtradb/include/ha0storage.h
@@ -0,0 +1,140 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** This value is used by default by ha_storage_create(). More memory
+is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES 1024
+
+/** This value is used by default by ha_storage_create(). It is a
+constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS 4096
+
+/** Hash storage */
+typedef struct ha_storage_struct ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells); /*!< in: initial number of cells
+ in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of the storage is going
+to become more than "memlim", then "data" is not added and NULL is
+returned. To disable this behavior, "memlim" can be set to 0, which
+stands for "no limit".
+@return pointer to the copy */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+ ha_storage_t* storage, /*!< in/out: hash storage */
+ const void* data, /*!< in: data to store */
+ ulint data_len, /*!< in: data length */
+ ulint memlim); /*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() but without memory limit.
+@param storage in/out: hash storage
+@param data in: data to store
+@param data_len in: data length
+@return pointer to the copy of the string */
+#define ha_storage_put(storage, data, data_len) \
+ ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy. If the
+same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@return pointer to the copy of the string */
+#define ha_storage_put_str(storage, str) \
+ ((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
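+/* Illustrative sketch, not part of the original header: the storage
+deduplicates equal chunks, so putting the same string twice returns the
+same pointer.  Variable names are hypothetical.
+
+	ha_storage_t*	storage = ha_storage_create(0, 0);
+
+	const char*	p1 = ha_storage_put_str(storage, "abc");
+	const char*	p2 = ha_storage_put_str(storage, "abc");
+
+	ut_a(p1 == p2);
+
+	ha_storage_free(storage);
+*/
+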
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy obeying
+a memory limit.
+If the same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage in/out: hash storage
+@param str in: string to put
+@param memlim in: memory limit to obey
+@return pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim) \
+ ((const char*) ha_storage_put_memlim((storage), (str), \
+ strlen(str) + 1, (memlim)))
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage); /*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage); /*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage); /*!< in: hash storage */
+
+#ifndef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+#endif /* ha0storage_h */
diff --git a/storage/xtradb/include/ha0storage.ic b/storage/xtradb/include/ha0storage.ic
new file mode 100644
index 00000000000..5acbf82f005
--- /dev/null
+++ b/storage/xtradb/include/ha0storage.ic
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.ic
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage for strings */
+struct ha_storage_struct {
+ mem_heap_t* heap; /*!< memory heap from which memory is
+ allocated */
+ hash_table_t* hash; /*!< hash table used to avoid
+ duplicates */
+};
+
+/** Objects of this type are stored in ha_storage_t */
+typedef struct ha_storage_node_struct ha_storage_node_t;
+/** Objects of this type are stored in ha_storage_struct */
+struct ha_storage_node_struct {
+ ulint data_len;/*!< length of the data */
+ const void* data; /*!< pointer to data */
+ ha_storage_node_t* next; /*!< next node in hash chain */
+};
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+ ulint initial_heap_bytes, /*!< in: initial heap's size */
+ ulint initial_hash_cells) /*!< in: initial number of cells
+ in the hash table */
+{
+ ha_storage_t* storage;
+ mem_heap_t* heap;
+
+ if (initial_heap_bytes == 0) {
+
+ initial_heap_bytes = HA_STORAGE_DEFAULT_HEAP_BYTES;
+ }
+
+ if (initial_hash_cells == 0) {
+
+ initial_hash_cells = HA_STORAGE_DEFAULT_HASH_CELLS;
+ }
+
+ /* we put "storage" within "storage->heap" */
+
+ heap = mem_heap_create(sizeof(ha_storage_t)
+ + initial_heap_bytes);
+
+ storage = (ha_storage_t*) mem_heap_alloc(heap,
+ sizeof(ha_storage_t));
+
+ storage->heap = heap;
+ storage->hash = hash_create(initial_hash_cells);
+
+ return(storage);
+}
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+ ha_storage_t** storage) /*!< in/out: hash storage */
+{
+ ha_storage_t temp_storage;
+
+ temp_storage.heap = (*storage)->heap;
+ temp_storage.hash = (*storage)->hash;
+
+ hash_table_clear(temp_storage.hash);
+ mem_heap_empty(temp_storage.heap);
+
+ *storage = (ha_storage_t*) mem_heap_alloc(temp_storage.heap,
+ sizeof(ha_storage_t));
+
+ (*storage)->heap = temp_storage.heap;
+ (*storage)->hash = temp_storage.hash;
+}
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+ ha_storage_t* storage) /*!< in, own: hash storage */
+{
+ /* order is important because the pointer storage->hash is
+ within the heap */
+ hash_table_free(storage->hash);
+ mem_heap_free(storage->heap);
+}
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+ const ha_storage_t* storage) /*!< in: hash storage */
+{
+ ulint ret;
+
+ ret = mem_heap_get_size(storage->heap);
+
+ /* this assumes hash->heap and hash->heaps are NULL */
+ ret += sizeof(hash_table_t);
+ ret += sizeof(hash_cell_t) * hash_get_n_cells(storage->hash);
+
+ return(ret);
+}
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
new file mode 100644
index 00000000000..445d94eeabb
--- /dev/null
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -0,0 +1,279 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code
+
+Created 5/11/2006 Osku Salerma
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "trx0types.h"
+#include "m_ctype.h" /* CHARSET_INFO */
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+ void* to, /*!< out: converted string */
+ ulint to_length, /*!< in: number of bytes reserved
+ for the converted string */
+ CHARSET_INFO* to_cs, /*!< in: character set to convert to */
+ const void* from, /*!< in: string to convert */
+ ulint from_length, /*!< in: number of bytes to convert */
+ CHARSET_INFO* from_cs, /*!< in: character set to convert from */
+ uint* errors); /*!< out: number of errors encountered
+ during the conversion */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+No more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint charset_coll, /*!< in: charset collation */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
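+
+/* Sketch of a typical call to innobase_raw_format(); "field_data",
+"field_len" and "charset_coll" are assumed values used for illustration
+only. "buf" receives a NUL-terminated copy converted to
+system_charset_info, and "n" counts the bytes written including the NUL:
+
+ char buf[64];
+ ulint n = innobase_raw_format(field_data, field_len,
+ charset_coll, buf, sizeof(buf));
+*/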
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+ trx_t* trx, /*!< in: transaction which
+ modifies the table */
+ const char* full_name, /*!< in: concatenation of
+ database name, null char NUL,
+ table name, null char NUL;
+ NOTE that in Windows this is
+ always in LOWER CASE! */
+ ulint full_name_len); /*!< in: length of full_name, in
+ bytes, including the NUL chars */
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+ char* buf, /*!< out: buffer for converted identifier */
+ ulint buflen, /*!< in: length of buf, in bytes */
+ const char* id, /*!< in: identifier to convert */
+ ulint idlen, /*!< in: length of id, in bytes */
+ void* thd, /*!< in: MySQL connection thread, or NULL */
+ ibool table_id);/*!< in: TRUE=id is a table or database name;
+ FALSE=id is an index name */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+ void* thd); /*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return true if non-transactional tables have been edited */
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+ void* thd); /*!< in: thread handle (THD*) */
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+ FILE* f, /*!< in: output stream */
+ void* thd, /*!< in: pointer to a MySQL THD object */
+ uint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/**************************************************************//**
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@return DATA_BINARY, DATA_VARCHAR, ... */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+ ulint* unsigned_flag, /*!< out: DATA_UNSIGNED if an
+ 'unsigned type';
+ at least ENUM and SET,
+ and unsigned integer
+ types are 'unsigned types' */
+ const void* field) /*!< in: MySQL Field */
+ __attribute__((nonnull));
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+ ulint cset, /*!< in: MySQL charset-collation code */
+ ulint* mbminlen, /*!< out: minimum length of a char (in bytes) */
+ ulint* mbmaxlen); /*!< out: maximum length of a char (in bytes) */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return 0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+ const char* a, /*!< in: first string to compare */
+ const char* b); /*!< in: second string to compare */
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return true if thd is executing SELECT */
+
+ibool
+thd_is_select(
+/*==========*/
+ const void* thd); /*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 5 * strlen(from) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+ struct charset_info_st* cs, /*!< in: the 'from' character set */
+ char* to, /*!< out: converted identifier */
+ const char* from, /*!< in: identifier to convert */
+ ulint len); /*!< in: length of 'to', in bytes; should
+ be at least 3 * strlen(from) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+ char* a); /*!< in/out: string to put in lower case */
+
+/**********************************************************************//**
+Determines the connection character set.
+@return connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+ void* mysql_thd); /*!< in: MySQL thread handle */
+/**********************************************************************//**
+Determines the current SQL statement.
+@return SQL statement string */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+ void* mysql_thd, /*!< in: MySQL thread handle */
+ size_t* length) /*!< out: length of the SQL statement */
+ __attribute__((nonnull));
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return number of bytes occupied by the first n characters */
+UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+ ulint charset_id, /*!< in: character set id */
+ ulint prefix_len, /*!< in: prefix length in bytes of the index
+ (this has to be divided by mbmaxlen to get the
+ number of CHARACTERS n in the prefix) */
+ ulint data_len, /*!< in: length of the string in bytes */
+ const char* str); /*!< in: character string */
+
+/******************************************************************//**
+Returns true if the thread supports XA, or the global value of
+innodb_supports_xa if thd is NULL.
+@return true if thd supports XA */
+
+ibool
+thd_supports_xa(
+/*============*/
+ void* thd); /*!< in: thread handle (THD*), or NULL to query
+ the global innodb_supports_xa */
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return the lock wait timeout, in seconds */
+
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+ void* thd); /*!< in: thread handle (THD*), or NULL to query
+ the global innodb_lock_wait_timeout */
+
+/******************************************************************//**
+Returns the session-level value of the flush-log-at-trx-commit setting
+for the given connection.
+@return the session value of the setting */
+
+ulong
+thd_flush_log_at_trx_commit_session(
+/*================================*/
+ void* thd); /*!< in: thread handle (THD*) */
+
+#endif
diff --git a/storage/xtradb/include/handler0alter.h b/storage/xtradb/include/handler0alter.h
new file mode 100644
index 00000000000..985b76f4f50
--- /dev/null
+++ b/storage/xtradb/include/handler0alter.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+ TABLE* table, /*!< in/out: MySQL table */
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets); /*!< in: rec_get_offsets(
+ rec, index, ...) */
+
+/*************************************************************//**
+Resets table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+ TABLE* table); /*!< in/out: MySQL table */
diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h
new file mode 100644
index 00000000000..492c767acc4
--- /dev/null
+++ b/storage/xtradb/include/hash0hash.h
@@ -0,0 +1,496 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef hash0hash_h
+#define hash0hash_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* !UNIV_HOTBACKUP */
+
+typedef struct hash_table_struct hash_table_t;
+typedef struct hash_cell_struct hash_cell_t;
+
+typedef void* hash_node_t;
+
+/* Fix Bug #13859: symbol collision between imap/mysql */
+#define hash_create hash0_create
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+ ulint n); /*!< in: number of array cells */
+
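+/* A minimal life cycle sketch for the create/free pair declared in this
+file; the cell count 10000 is an arbitrary example value:
+
+ hash_table_t* table = hash_create(10000);
+ ... use the HASH_INSERT()/HASH_SEARCH() macros below ...
+ hash_table_free(table);
+*/
+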
+/*************************************************************//**
+Calculates the size of the memory block needed for a hash table with
+>= n array cells, for use with hash_create_init().
+@return size in bytes */
+UNIV_INTERN
+ulint
+hash_create_needed(
+/*===============*/
+ ulint n); /*!< in: number of array cells */
+
+/*************************************************************//**
+Initializes a hash table with >= n array cells inside a memory block
+provided by the caller. */
+UNIV_INTERN
+void
+hash_create_init(
+/*=============*/
+ hash_table_t* table, /*!< in/out: caller-provided memory block */
+ ulint n); /*!< in: number of array cells */
+
+/*************************************************************//**
+Prepares an existing hash table structure for reuse. */
+UNIV_INTERN
+void
+hash_create_reuse(
+/*==============*/
+ hash_table_t* table); /*!< in/out: hash table to reuse */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a mutex array to protect a hash table. */
+UNIV_INTERN
+void
+hash_create_mutexes_func(
+/*=====================*/
+ hash_table_t* table, /*!< in: hash table */
+#ifdef UNIV_SYNC_DEBUG
+ ulint sync_level, /*!< in: latching order level of the
+ mutexes: used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint n_mutexes); /*!< in: number of mutexes */
+#ifdef UNIV_SYNC_DEBUG
+# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,level,n)
+#else /* UNIV_SYNC_DEBUG */
+# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,n)
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+ hash_table_t* table); /*!< in, own: hash table */
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ ulint fold, /*!< in: folded value */
+ hash_table_t* table); /*!< in: hash table */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Assert that the mutex for the table in a hash operation is owned. */
+# define HASH_ASSERT_OWNED(TABLE, FOLD) \
+ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));
+#else /* !UNIV_HOTBACKUP */
+# define HASH_ASSERT_OWNED(TABLE, FOLD)
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Inserts a struct to a hash table. */
+
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+ (DATA)->NAME = NULL;\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == NULL) {\
+ cell3333->node = DATA;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != NULL) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ }\
+\
+ struct3333->NAME = DATA;\
+ }\
+} while (0)
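+
+/* The macros in this file expect each node struct to contain a "next"
+pointer whose field name is passed as NAME. A hypothetical node type and
+an insert keyed by a fold value could look like this (the struct, field
+and variable names are illustrative only):
+
+ typedef struct foo_struct foo_t;
+ struct foo_struct {
+ ulint id;
+ foo_t* hash; (next node in the hash chain)
+ };
+
+ foo_t* node = mem_alloc(sizeof(*node));
+ node->id = 42;
+
+ ulint fold = ut_fold_ulint_pair(node->id, 0);
+
+ HASH_INSERT(foo_t, hash, table, fold, node);
+*/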
+
+#ifdef UNIV_HASH_DEBUG
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) DATA->NAME = (void*) -1
+#else
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Deletes a struct from a hash table. */
+
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+ hash_cell_t* cell3333;\
+ TYPE* struct3333;\
+\
+ HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+ cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+ if (cell3333->node == DATA) {\
+ HASH_ASSERT_VALID(DATA->NAME);\
+ cell3333->node = DATA->NAME;\
+ } else {\
+ struct3333 = (TYPE*) cell3333->node;\
+\
+ while (struct3333->NAME != DATA) {\
+\
+ struct3333 = (TYPE*) struct3333->NAME;\
+ ut_a(struct3333);\
+ }\
+\
+ struct3333->NAME = DATA->NAME;\
+ }\
+ HASH_INVALIDATE(DATA, NAME);\
+} while (0)
+
+/*******************************************************************//**
+Gets the first struct in a hash chain, NULL if none. */
+
+#define HASH_GET_FIRST(TABLE, HASH_VAL)\
+ (hash_get_nth_cell(TABLE, HASH_VAL)->node)
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA) ((DATA)->NAME)
+
+/********************************************************************//**
+Looks for a struct in a hash table. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+\
+ HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+ HASH_ASSERT_VALID(DATA);\
+\
+ while ((DATA) != NULL) {\
+ ASSERTION;\
+ if (TEST) {\
+ break;\
+ } else {\
+ HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+ }\
+ }\
+}
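+
+/* Continuing the hypothetical foo_t example above, a lookup by "id"
+(with "fold" computed as before) uses the fold value for bucket
+selection and TEST for the exact match:
+
+ foo_t* found;
+
+ HASH_SEARCH(hash, table, fold, foo_t*, found,
+ ut_ad(found != NULL), found->id == id);
+
+ (found now points to the first matching node, or is NULL)
+*/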
+
+/********************************************************************//**
+Looks for an item in all hash buckets. */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST) \
+do { \
+ ulint i3333; \
+ \
+ for (i3333 = (TABLE)->n_cells; i3333--; ) { \
+ (DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333); \
+ \
+ while ((DATA) != NULL) { \
+ HASH_ASSERT_VALID(DATA); \
+ ASSERTION; \
+ \
+ if (TEST) { \
+ break; \
+ } \
+ \
+ (DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA); \
+ } \
+ \
+ if ((DATA) != NULL) { \
+ break; \
+ } \
+ } \
+} while (0)
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint n); /*!< in: cell index */
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+ hash_table_t* table); /*!< in/out: hash table */
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ hash_table_t* table); /*!< in: table */
+/*******************************************************************//**
+Deletes a struct which is stored in the heap of the hash table, and compacts
+the heap. The fold value must be stored in the struct NODE in a field named
+'fold'. */
+
+#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\
+do {\
+ TYPE* node111;\
+ TYPE* top_node111;\
+ hash_cell_t* cell111;\
+ ulint fold111;\
+\
+ fold111 = (NODE)->fold;\
+\
+ HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
+\
+ top_node111 = (TYPE*)mem_heap_get_top(\
+ hash_get_heap(TABLE, fold111),\
+ sizeof(TYPE));\
+\
+ /* If the node to remove is not the top node in the heap, compact the\
+ heap of nodes by moving the top node in the place of NODE. */\
+\
+ if (NODE != top_node111) {\
+\
+ /* Copy the top node in place of NODE */\
+\
+ *(NODE) = *top_node111;\
+\
+ cell111 = hash_get_nth_cell(TABLE,\
+ hash_calc_hash(top_node111->fold, TABLE));\
+\
+ /* Look for the pointer to the top node, to update it */\
+\
+ if (cell111->node == top_node111) {\
+ /* The top node is the first in the chain */\
+\
+ cell111->node = NODE;\
+ } else {\
+ /* We have to look for the predecessor of the top\
+ node */\
+ node111 = cell111->node;\
+\
+ while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
+\
+ node111 = HASH_GET_NEXT(NAME, node111);\
+ }\
+\
+ /* Now we have the predecessor node */\
+\
+ node111->NAME = NODE;\
+ }\
+ }\
+\
+ /* Free the space occupied by the top node */\
+\
+ mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\
+} while (0)
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Move all hash table entries from OLD_TABLE to NEW_TABLE. */
+
+#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
+do {\
+ ulint i2222;\
+ ulint cell_count2222;\
+\
+ cell_count2222 = hash_get_n_cells(OLD_TABLE);\
+\
+ for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+ NODE_TYPE* node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\
+\
+ while (node2222) {\
+ NODE_TYPE* next2222 = node2222->PTR_NAME;\
+ ulint fold2222 = FOLD_FUNC(node2222);\
+\
+ HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
+ fold2222, node2222);\
+\
+ node2222 = next2222;\
+ }\
+ }\
+} while (0)
+
+/********************************************************************//**
+Adjusts the hash chain node pointers after the memory that holds the
+nodes has been relocated: pointers above FADDR are shifted by FOFFSET,
+the others by BOFFSET. */
+#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \
+do {\
+ ulint i2222;\
+ ulint cell_count2222;\
+\
+ cell_count2222 = hash_get_n_cells(TABLE);\
+\
+ for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+ NODE_TYPE* node2222;\
+\
+ if ((TABLE)->array[i2222].node) \
+ (TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \
+ + (((TABLE)->array[i2222].node > (void*)FADDR)?FOFFSET:BOFFSET));\
+ node2222 = HASH_GET_FIRST((TABLE), i2222);\
+\
+ while (node2222) {\
+ if (node2222->PTR_NAME) \
+ node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \
+ + ((((void*)node2222->PTR_NAME) > (void*)FADDR)?FOFFSET:BOFFSET));\
+\
+ node2222 = node2222->PTR_NAME;\
+ }\
+ }\
+} while (0)
+
+/************************************************************//**
+Gets the mutex index for a fold value in a hash table.
+@return mutex number */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the heap */
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i); /*!< in: index of the mutex */
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold); /*!< in: fold */
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+ hash_table_t* table); /*!< in: hash table */
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+ hash_table_t* table); /*!< in: hash table */
+#else /* !UNIV_HOTBACKUP */
+# define hash_get_heap(table, fold) ((table)->heap)
+# define hash_mutex_enter(table, fold) ((void) 0)
+# define hash_mutex_exit(table, fold) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_cell_struct{
+ void* node; /*!< hash chain node, NULL if none */
+};
+
+/** The hash table structure */
+struct hash_table_struct {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+ ibool adaptive;/*!< TRUE if this is the hash table of the
+ adaptive hash index */
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+ ulint n_cells;/*!< number of cells in the hash table */
+ hash_cell_t* array; /*!< pointer to cell array */
+#ifndef UNIV_HOTBACKUP
+ ulint n_mutexes;/*!< if mutexes != NULL, then the number of
+ mutexes, must be a power of 2 */
+ mutex_t* mutexes;/*!< NULL, or an array of mutexes used to
+ protect segments of the hash table */
+ mem_heap_t** heaps; /*!< if this is non-NULL, hash chain nodes for
+ external chaining can be allocated from these
+ memory heaps; there are then n_mutexes many of
+ these heaps */
+#endif /* !UNIV_HOTBACKUP */
+ mem_heap_t* heap; /*!< if non-NULL, hash chain nodes are
+ allocated from this single heap instead of
+ the heaps array */
+#ifdef UNIV_DEBUG
+ ulint magic_n;/*!< magic number checked by debug assertions */
+# define HASH_TABLE_MAGIC_N 76561114
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/hash0hash.ic b/storage/xtradb/include/hash0hash.ic
new file mode 100644
index 00000000000..0b437894e2e
--- /dev/null
+++ b/storage/xtradb/include/hash0hash.ic
@@ -0,0 +1,183 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.ic
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ut0rnd.h"
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint n) /*!< in: cell index */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(n < table->n_cells);
+
+ return(table->array + n);
+}
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+ hash_table_t* table) /*!< in/out: hash table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ memset(table->array, 0x0,
+ table->n_cells * sizeof(*table->array));
+}
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+ hash_table_t* table) /*!< in: table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ return(table->n_cells);
+}
+
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+ ulint fold, /*!< in: folded value */
+ hash_table_t* table) /*!< in: hash table */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ return(ut_hash_ulint(fold, table->n_cells));
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Gets the mutex index for a fold value in a hash table.
+@return mutex number */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(ut_is_2pow(table->n_mutexes));
+ return(ut_2pow_remainder(hash_calc_hash(fold, table),
+ table->n_mutexes));
+}
+
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the heap */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(i < table->n_mutexes);
+
+ return(table->heaps[i]);
+}
+
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+ if (table->heap) {
+ return(table->heap);
+ }
+
+ i = hash_get_mutex_no(table, fold);
+
+ return(hash_get_nth_heap(table, i));
+}
+
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint i) /*!< in: index of the mutex */
+{
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+ ut_ad(i < table->n_mutexes);
+
+ return(table->mutexes + i);
+}
+
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+ hash_table_t* table, /*!< in: hash table */
+ ulint fold) /*!< in: fold */
+{
+ ulint i;
+
+ ut_ad(table);
+ ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+ i = hash_get_mutex_no(table, fold);
+
+ return(hash_get_nth_mutex(table, i));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/ibuf0ibuf.h b/storage/xtradb/include/ibuf0ibuf.h
new file mode 100644
index 00000000000..8aa21fb9d95
--- /dev/null
+++ b/storage/xtradb/include/ibuf0ibuf.h
@@ -0,0 +1,383 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "ibuf0types.h"
+
+/** Combinations of operations that can be buffered. Because the enum
+values are used for indexing innobase_change_buffering_values[], they
+should start at 0 and there should not be any gaps. */
+typedef enum {
+ IBUF_USE_NONE = 0,
+ IBUF_USE_INSERT, /* insert */
+
+ IBUF_USE_COUNT /* number of entries in ibuf_use_t */
+} ibuf_use_t;
+
+/** Operations that can currently be buffered. */
+extern ibuf_use_t ibuf_use;
+
+/** The insert buffer control structure */
+extern ibuf_t* ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no). When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation. This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed. To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations. Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments. The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page. It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space. It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
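+/* A condensed sketch of the buffering path described above, using the
+functions declared in this header; the surrounding caller logic and the
+fallback branch are illustrative, not the actual row insert code (the
+arguments are those of ibuf_insert() below):
+
+ if (ibuf_should_try(index, 0)
+ && ibuf_insert(entry, index, space, zip_size, page_no, thr)) {
+ (the change was buffered; it is applied by
+ ibuf_merge_or_delete_for_page() when the leaf page
+ is next read into the buffer pool)
+ } else {
+ (modify the B-tree leaf page directly)
+ }
+*/
+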
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and
+initializes the data structures for the insert buffer of each tablespace. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/*********************************************************************//**
+Initializes an ibuf bitmap page. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+ buf_block_t* block, /*!< in: bitmap page */
+ mtr_t* mtr); /*!< in: mtr */
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+ buf_block_t* block); /*!< in: index page; free bits are set to 0
+ if the index is non-clustered and
+ non-unique, and the page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase);/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ ulint max_ins_size, /*!< in: value of
+ maximum insert size
+ with reorganize before
+ the latest operation
+ performed to the page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state. Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+ buf_block_t* block, /*!< in/out: index page */
+ mtr_t* mtr); /*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page. It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ buf_block_t* block1, /*!< in: index page */
+ buf_block_t* block2, /*!< in: index page */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+A basic, partial test of whether an insert into the insert buffer would be
+possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique); /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INTERN
+ibool
+ibuf_inside(void);
+/*=============*/
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@return TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page(
+/*======*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr which will contain an x-latch to the
+ bitmap page if the page is not one of the fixed
+ address ibuf pages, or NULL, in which case a new
+ mini-transaction is created. */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible. Does not do insert if the index is clustered
+or unique.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint space, /*!< in: space id where to insert */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no,/*!< in: page number where to insert */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+inserts to the page the possible index entries buffered in the insert buffer.
+The entries are deleted from the insert buffer. If the page is not read, but
+created in the buffer pool, this function deletes its buffered entries from
+the insert buffer; there can exist entries for such a page if the page
+belonged to an index which subsequently was dropped. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+ buf_block_t* block, /*!< in: if page has been read from
+ disk, pointer to the page x-latched,
+ else NULL */
+ ulint space, /*!< in: space id of the index page */
+ ulint page_no,/*!< in: page number of the index page */
+ ulint zip_size,/*!< in: compressed page size in bytes,
+ or 0 */
+ ibool update_ibuf_bitmap);/*!< in: normally this is set
+ to TRUE, but if we have deleted or are
+ deleting the tablespace, then we
+ naturally do not want to update a
+ non-existent bitmap page */
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+ ulint space); /*!< in: space id */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract(
+/*==========*/
+ ibool sync); /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_for_n_pages(
+/*======================*/
+ ibool sync, /*!< in: TRUE if the caller wants to wait for the
+ issued read with the highest tablespace address
+ to complete */
+ ulint n_pages);/*!< in: try to read at least this many pages to
+ the buffer pool and merge the ibuf contents to
+ them */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+ ulint space, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+#endif
+/******************************************************************//**
+Looks if the insert buffer is empty.
+@return TRUE if empty */
+UNIV_INTERN
+ibool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+UNIV_INTERN
+void
+ibuf_close(void);
+/*============*/
+
+#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
+
+#endif /* !UNIV_HOTBACKUP */
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER PAGE_DATA
+#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID 0
+
+#ifndef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ibuf0ibuf.ic b/storage/xtradb/include/ibuf0ibuf.ic
new file mode 100644
index 00000000000..15bbe61ab30
--- /dev/null
+++ b/storage/xtradb/include/ibuf0ibuf.ic
@@ -0,0 +1,327 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.ic
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0lru.h"
+
+/** Counter for ibuf_should_try() */
+extern ulint ibuf_flush_count;
+
+/** An index page must contain at least UNIV_PAGE_SIZE /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page. If there is this much free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
+
+/** Insert buffer struct */
+struct ibuf_struct{
+ ulint size; /*!< current size of the ibuf index
+ tree, in pages */
+ ulint max_size; /*!< recommended maximum size of the
+ ibuf index tree, in pages */
+ ulint seg_size; /*!< allocated pages of the file
+ segment containing ibuf header and
+ tree */
+ ibool empty; /*!< after an insert to the ibuf tree
+ is performed, this is set to FALSE,
+ and if a contract operation finds
+ the tree empty, this is set to
+ TRUE */
+ ulint free_list_len; /*!< length of the free list */
+ ulint height; /*!< tree height */
+ dict_index_t* index; /*!< insert buffer index */
+
+ ulint n_inserts; /*!< number of inserts made to
+ the insert buffer */
+ ulint n_merges; /*!< number of pages merged */
+ ulint n_merged_recs; /*!< number of records merged */
+};
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+ buf_block_t* block, /*!< in: index page of a non-clustered index;
+ free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+ ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
+ value which the bits must have before
+ setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+ ulint val); /*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic, partial test of whether an insert into the insert buffer would be
+possible and recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+ dict_index_t* index, /*!< in: index where to insert */
+ ulint ignore_sec_unique) /*!< in: if != 0, we should
+ ignore UNIQUE constraint on
+ a secondary index when we
+ decide */
+{
+ if (ibuf_use != IBUF_USE_NONE
+ && !dict_index_is_clust(index)
+ && (ignore_sec_unique || !dict_index_is_unique(index))) {
+
+ ibuf_flush_count++;
+
+ if (ibuf_flush_count % 4 == 0) {
+
+ buf_LRU_try_free_flushed_blocks();
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page address.
+@return TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ ut_ad(ut_is_2pow(zip_size));
+
+ if (!zip_size) {
+ return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
+ == FSP_IBUF_BITMAP_OFFSET));
+ }
+
+ return(UNIV_UNLIKELY((page_no & (zip_size - 1))
+ == FSP_IBUF_BITMAP_OFFSET));
+}
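+
+/* Worked example of the check above, assuming the default 16 KiB
+uncompressed page size and FSP_IBUF_BITMAP_OFFSET == 1: the ibuf bitmap
+pages of a tablespace are pages 1, 16385, 32769, ..., i.e. the page at
+offset 1 within every run of UNIV_PAGE_SIZE pages, which is exactly what
+(page_no & (UNIV_PAGE_SIZE - 1)) == FSP_IBUF_BITMAP_OFFSET selects. */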
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+/*===========================*/
+ ulint zip_size, /*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint max_ins_size) /*!< in: maximum insert size after reorganize
+ for the page */
+{
+ ulint n;
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ if (zip_size) {
+ n = max_ins_size
+ / (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ } else {
+ n = max_ins_size
+ / (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ if (n == 3) {
+ n = 2;
+ }
+
+ if (n > 3) {
+ n = 3;
+ }
+
+ return(n);
+}
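+
+/* Worked example for the conversion above, assuming an uncompressed
+16 KiB page (zip_size == 0), so that one bitmap unit equals
+16384 / 32 = 512 bytes:
+
+ max_ins_size < 512 -> bits 0
+ 512 <= max_ins_size < 1024 -> bits 1
+ 1024 <= max_ins_size < 2048 -> bits 2 (the raw value 3 is clamped to 2)
+ max_ins_size >= 2048 -> bits 3
+*/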
+
+/*********************************************************************//**
+Translates the ibuf free bits to the free space on a page in bytes.
+@return maximum insert size after reorganize for the page */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_from_bits(
+/*================================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ ulint bits) /*!< in: value for ibuf bitmap bits */
+{
+ ut_ad(bits < 4);
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ if (zip_size) {
+ if (bits == 3) {
+ return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ if (bits == 3) {
+ return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+ }
+
+ return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE));
+}
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+ ulint zip_size,
+ /*!< in: compressed page size in bytes */
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint max_ins_size;
+ const page_zip_des_t* page_zip;
+ lint zip_max_ins;
+
+ ut_ad(zip_size == buf_block_get_zip_size(block));
+ ut_ad(zip_size);
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ page_zip = buf_block_get_page_zip(block);
+ zip_max_ins = page_zip_max_ins_size(page_zip,
+ FALSE/* not clustered */);
+
+ if (UNIV_UNLIKELY(zip_max_ins < 0)) {
+ return(0);
+ } else if (UNIV_LIKELY(max_ins_size > (ulint) zip_max_ins)) {
+ max_ins_size = (ulint) zip_max_ins;
+ }
+
+ return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+ ulint zip_size,/*!< in: compressed page size in bytes;
+ 0 for uncompressed pages */
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ut_ad(zip_size == buf_block_get_zip_size(block));
+
+ if (!zip_size) {
+ ulint max_ins_size;
+
+ max_ins_size = page_get_max_insert_size_after_reorganize(
+ buf_block_get_frame(block), 1);
+
+ return(ibuf_index_page_calc_free_bits(0, max_ins_size));
+ } else {
+ return(ibuf_index_page_calc_free_zip(zip_size, block));
+ }
+}
+
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept. NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page. It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high. It is only safe to use this function for
+decrementing the free bits. Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+ buf_block_t* block, /*!< in: index page to which we have added new
+ records; the free bits are updated if the
+ index is non-clustered and non-unique and
+ the page level is 0, and the page becomes
+ fuller */
+ ulint max_ins_size,/*!< in: value of maximum insert size with
+ reorganize before the latest operation
+ performed to the page */
+ ulint increase)/*!< in: upper limit for the additional space
+ used in the latest operation, if known, or
+ ULINT_UNDEFINED */
+{
+ ulint before;
+ ulint after;
+
+ ut_ad(!buf_block_get_page_zip(block));
+
+ before = ibuf_index_page_calc_free_bits(0, max_ins_size);
+
+ if (max_ins_size >= increase) {
+#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE
+# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE"
+#endif
+ after = ibuf_index_page_calc_free_bits(0, max_ins_size
+ - increase);
+#ifdef UNIV_IBUF_DEBUG
+ ut_a(after <= ibuf_index_page_calc_free(0, block));
+#endif
+ } else {
+ after = ibuf_index_page_calc_free(0, block);
+ }
+
+ if (after == 0) {
+ /* We move the page to the front of the buffer pool LRU list:
+ the purpose of this is to prevent those pages to which we
+ cannot make inserts using the insert buffer from slipping
+ out of the buffer pool */
+
+ buf_page_make_young(&block->page);
+ }
+
+ if (before > after) {
+ ibuf_set_free_bits(block, after, before);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/ibuf0types.h b/storage/xtradb/include/ibuf0types.h
new file mode 100644
index 00000000000..55944f879b2
--- /dev/null
+++ b/storage/xtradb/include/ibuf0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0types.h
+Insert buffer global types
+
+Created 7/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0types_h
+#define ibuf0types_h
+
+typedef struct ibuf_struct ibuf_t;
+
+#endif
diff --git a/storage/xtradb/include/lock0iter.h b/storage/xtradb/include/lock0iter.h
new file mode 100644
index 00000000000..25a57c9740c
--- /dev/null
+++ b/storage/xtradb/include/lock0iter.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "univ.i"
+#include "lock0types.h"
+
+typedef struct lock_queue_iterator_struct {
+ const lock_t* current_lock;
+ /* In case this is a record lock queue (not table lock queue)
+ then bit_no is the record number within the heap in which the
+ record is stored. */
+ ulint bit_no;
+} lock_queue_iterator_t;
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no);/*!< in: record number in the
+ heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, or NULL if there are no
+more locks (i.e. the current lock is the first one). On success the
+iterator is moved back to the returned lock.
+@return previous lock or NULL */
+
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter); /*!< in/out: iterator */
+
+#endif /* lock0iter_h */
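
A minimal usage sketch for the iterator declared above, assuming first_lock
is a lock obtained inside the lock module (for a wait lock, bit_no may be
left as ULINT_UNDEFINED, as the comment explains):

	lock_queue_iterator_t	iter;
	const lock_t*		prev;

	lock_queue_iterator_reset(&iter, first_lock, ULINT_UNDEFINED);

	while ((prev = lock_queue_iterator_get_prev(&iter)) != NULL) {
		/* examine each earlier lock in the same queue, e.g. to
		find the request that blocks a waiting lock */
	}
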
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
new file mode 100644
index 00000000000..73f885ecf04
--- /dev/null
+++ b/storage/xtradb/include/lock0lock.h
@@ -0,0 +1,829 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "univ.i"
+#include "buf0types.h"
+#include "trx0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "dict0types.h"
+#include "que0types.h"
+#include "lock0types.h"
+#include "read0types.h"
+#include "hash0hash.h"
+#include "ut0vec.h"
+
+#ifdef UNIV_DEBUG
+extern ibool lock_print_waits;
+#endif /* UNIV_DEBUG */
+/* Buffer for storing information about the most recent deadlock error */
+extern FILE* lock_latest_err_file;
+extern ulint srv_n_lock_deadlock_count;
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void);
+/*===============*/
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+ ulint n_cells); /*!< in: number of slots in lock hash table */
+/*********************************************************************//**
+Closes the lock system at database shutdown. */
+UNIV_INTERN
+void
+lock_sys_close(void);
+/*================*/
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction which has the x-lock, or NULL */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block); /*!< in: buffer block */
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock);/*!< in: copy of the old, not
+ reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec); /*!< in: record on page: this
+ is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end); /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block); /*!< in: merged index
+ page which will be
+ discarded */
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root); /*!< in: root page */
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if page is the root! */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block); /*!< in: index page;
+ NOT the root! */
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block); /*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block); /*!< in: merged index page
+ which will be discarded */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no); /*!< in: heap_no of the
+ donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block); /*!< in: index page
+ which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec); /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator);/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+/*********************************************************************//**
+Returns TRUE if there are explicit record locks on a page.
+@return TRUE if there are explicit record locks on the page */
+UNIV_INTERN
+ibool
+lock_rec_expl_exist_on_page(
+/*========================*/
+ ulint space, /*!< in: space id */
+ ulint page_no);/*!< in: page number */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ibool* inherit);/*!< out: set to TRUE if the new
+ inserted record maybe should inherit
+ LOCK_GAP type locks from the successor
+ record */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr); /*!< in: query thread */
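
As an illustration of the read-locking entry points above, a hedged sketch
of requesting a shared next-key lock through the "alt" variant; block, rec,
index and thr are assumed to come from the row search code:

	ulint	err = lock_clust_rec_read_check_and_lock_alt(
		0, block, rec, index, LOCK_S, LOCK_ORDINARY, thr);

	if (err == DB_LOCK_WAIT) {
		/* the request was enqueued as a waiter; the caller
		suspends the query thread and retries later */
	}
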
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return TRUE if the record is seen, or FALSE if an earlier version of
+the record should be retrieved */
+UNIV_INTERN
+ibool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ read_view_t* view); /*!< in: consistent read view */
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that even when FALSE is returned, the present version
+of rec may still be the right one; this must be checked from the
+clustered index record.
+
+@return TRUE if certainly sees, or FALSE if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+ulint
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const read_view_t* view); /*!< in: consistent read view */
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_table(
+/*=======*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in: database table in dictionary cache */
+ enum lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr); /*!< in: query thread */
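
Similarly, a small sketch of the table-level entry point just declared:
before modifying rows, a caller would typically request an
intention-exclusive lock (table and thr assumed from the calling context):

	ulint	err = lock_table(0, table, LOCK_IX, thr);

	if (err != DB_SUCCESS) {
		/* DB_LOCK_WAIT, DB_DEADLOCK or DB_QUE_THR_SUSPENDED;
		handled by waiting, rolling back, etc. */
	}
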
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ enum lock_mode lock_mode);/*!< in: LOCK_S or LOCK_X */
+/*********************************************************************//**
+Releases transaction locks, and releases possible other transactions waiting
+because of these locks. */
+UNIV_INTERN
+void
+lock_release_off_kernel(
+/*====================*/
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock); /*!< in: waiting lock request */
+
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+ dict_table_t* table, /*!< in: table to be dropped
+ or truncated */
+ ibool remove_also_table_sx_locks);/*!< in: also removes
+ table S and X locks */
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+ __attribute__((const));
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no);/*!< in: page number */
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED
+if none is found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock); /*!< in: record lock with at least one
+ bit set */
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction. The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* dest, /*!< in: destination of ALTER TABLE */
+ enum lock_mode* mode); /*!< out: lock mode of the source table */
+/*********************************************************************//**
+Determine if the given table is exclusively "owned" by the given
+transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC
+on the table.
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2); /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ibool has_kernel_mutex);/*!< in: TRUE if the caller owns the
+ kernel mutex */
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: table type lock */
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock); /*!< in: record type lock */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if the kernel mutex could not be obtained; in that case
+the function exits without printing the info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+	ibool	nowait);/*!< in: TRUE if we should not wait
+			for the kernel mutex */
+/*************************************************************************//**
+Prints info of locks for each transaction. */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file); /*!< in: file where to print */
+/*********************************************************************//**
+Return the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+UNIV_INTERN
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+ const trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Release all the transaction's autoinc locks. */
+UNIV_INTERN
+void
+lock_release_autoinc_locks(
+/*=======================*/
+ trx_t* trx); /*!< in/out: transaction */
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for use outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */
+UNIV_INTERN
+ullint
+lock_get_trx_id(
+/*============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the type of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+UNIV_INTERN
+ullint
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+Gets the name of the table on which the lock is.
+The string should not be free()'d or modified.
+@return name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+ const lock_t* lock); /*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+ const lock_t* lock); /*!< in: lock */
+
+/** Lock modes and types */
+/* @{ */
+#define LOCK_MODE_MASK 0xFUL /*!< mask used to extract mode from the
+ type_mode field in a lock */
+/** Lock types */
+/* @{ */
+#define LOCK_TABLE 16 /*!< table lock */
+#define LOCK_REC 32 /*!< record lock */
+#define LOCK_TYPE_MASK 0xF0UL /*!< mask used to extract lock type from the
+ type_mode field in a lock */
+#if LOCK_MODE_MASK & LOCK_TYPE_MASK
+# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
+#endif
+
+#define LOCK_WAIT 256 /*!< Waiting lock flag; when set, it
+ means that the lock has not yet been
+ granted, it is just waiting for its
+ turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY 0 /*!< this flag denotes an ordinary
+ next-key lock in contrast to LOCK_GAP
+ or LOCK_REC_NOT_GAP */
+#define LOCK_GAP 512 /*!< when this bit is set, it means that the
+ lock holds only on the gap before the record;
+ for instance, an x-lock on the gap does not
+ give permission to modify the record on which
+ the bit is set; locks of this type are created
+ when records are removed from the index chain
+ of records */
+#define LOCK_REC_NOT_GAP 1024 /*!< this bit means that the lock is only on
+ the index record and does NOT block inserts
+ to the gap before the index record; this is
+ used in the case when we retrieve a record
+ with a unique key, and is also used in
+ locking plain SELECTs (not part of UPDATE
+ or DELETE) when the user has set the READ
+ COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048 /*!< this bit is set when we place a waiting
+ gap type record lock request in order to let
+				an insert of an index record wait until
+ there are no conflicting locks by other
+ transactions on the gap; note that this flag
+ remains set when the waiting lock is granted,
+ or if the lock is inherited to a neighboring
+ record */
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
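
To make the bit layout above concrete, a short sketch of composing and
decomposing a type_mode value; this mirrors what lock_get_type_low() and
the mode accessors in the lock module do with the masks:

	ulint	type_mode = LOCK_REC | LOCK_X | LOCK_GAP | LOCK_WAIT;

	ulint	mode	 = type_mode & LOCK_MODE_MASK;	/* LOCK_X */
	ulint	type	 = type_mode & LOCK_TYPE_MASK;	/* LOCK_REC */
	ibool	waiting	 = (type_mode & LOCK_WAIT) != 0;	/* TRUE */
	ibool	gap_only = (type_mode & LOCK_GAP) != 0;	/* TRUE */
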
+
+/** Lock operation struct */
+typedef struct lock_op_struct lock_op_t;
+/** Lock operation struct */
+struct lock_op_struct{
+ dict_table_t* table; /*!< table to be locked */
+ enum lock_mode mode; /*!< lock mode */
+};
+
+/** The lock system struct */
+struct lock_sys_struct{
+ hash_table_t* rec_hash; /*!< hash table of the record locks */
+};
+
+/** The lock system */
+extern lock_sys_t* lock_sys;
+
+
+#ifndef UNIV_NONINL
+#include "lock0lock.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/lock0lock.ic b/storage/xtradb/include/lock0lock.ic
new file mode 100644
index 00000000000..014722f51c4
--- /dev/null
+++ b/storage/xtradb/include/lock0lock.ic
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "row0row.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "row0vers.h"
+#include "que0que.h"
+#include "btr0cur.h"
+#include "read0read.h"
+#include "log0recv.h"
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(ut_fold_ulint_pair(space, page_no));
+}
+
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(hash_calc_hash(lock_rec_fold(space, page_no),
+ lock_sys->rec_hash));
+}
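
For illustration, the fold and the hash cell relate as follows; space and
page_no identify a page, and the assertion is only a sketch of the
invariant, not code from this file:

	ulint	fold = lock_rec_fold(space, page_no);
	ulint	cell = hash_calc_hash(fold, lock_sys->rec_hash);

	/* lock_rec_hash() computes the same cell number in one step */
	ut_ad(cell == lock_rec_hash(space, page_no));
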
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return transaction which has the x-lock, or NULL */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_id_t trx_id;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+
+ trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (trx_is_active(trx_id)) {
+ /* The modifying or inserting transaction is active */
+
+ return(trx_get_on_id(trx_id));
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ const page_t* page = block->frame;
+
+ if (page_is_comp(page)) {
+ return(rec_get_heap_no_new(
+ page
+ + rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE)));
+ } else {
+ return(rec_get_heap_no_old(
+ page
+ + rec_get_next_offs(page + PAGE_OLD_INFIMUM,
+ FALSE)));
+ }
+}
diff --git a/storage/xtradb/include/lock0priv.h b/storage/xtradb/include/lock0priv.h
new file mode 100644
index 00000000000..287c151b19f
--- /dev/null
+++ b/storage/xtradb/include/lock0priv.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "univ.i"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "ut0lst.h"
+
+/** A table lock */
+typedef struct lock_table_struct lock_table_t;
+/** A table lock */
+struct lock_table_struct {
+ dict_table_t* table; /*!< database table in dictionary
+ cache */
+ UT_LIST_NODE_T(lock_t)
+ locks; /*!< list of locks on the same
+ table */
+};
+
+/** Record lock for a page */
+typedef struct lock_rec_struct lock_rec_t;
+/** Record lock for a page */
+struct lock_rec_struct {
+ ulint space; /*!< space id */
+ ulint page_no; /*!< page number */
+ ulint n_bits; /*!< number of bits in the lock
+ bitmap; NOTE: the lock bitmap is
+ placed immediately after the
+ lock struct */
+};
+
+/** Lock struct */
+struct lock_struct {
+ trx_t* trx; /*!< transaction owning the
+ lock */
+ UT_LIST_NODE_T(lock_t)
+ trx_locks; /*!< list of the locks of the
+ transaction */
+ ulint type_mode; /*!< lock type, mode, LOCK_GAP or
+ LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION,
+ wait flag, ORed */
+ hash_node_t hash; /*!< hash chain node for a record
+ lock */
+ dict_index_t* index; /*!< index for a record lock */
+ union {
+ lock_table_t tab_lock;/*!< table lock */
+ lock_rec_t rec_lock;/*!< record lock */
+ } un_member; /*!< lock details */
+};
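
As the n_bits comment above hints, for a record lock the bitmap is stored
directly after the lock_t struct; a hedged sketch of how a bit for a given
heap_no would be located (the module's own accessors in lock0lock.c are the
authoritative way to do this):

	const byte*	bitmap	= (const byte*) &lock[1];
	ulint		n_bytes	= (lock->un_member.rec_lock.n_bits + 7) / 8;
					/* size of the bitmap in bytes */
	ibool		is_set	= 1 & (bitmap[heap_no / 8] >> (heap_no % 8));
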
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock); /*!< in: lock */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no);/*!< in: heap number of the record */
+
+#ifndef UNIV_NONINL
+#include "lock0priv.ic"
+#endif
+
+#endif /* lock0priv_h */
diff --git a/storage/xtradb/include/lock0priv.ic b/storage/xtradb/include/lock0priv.ic
new file mode 100644
index 00000000000..30447c99848
--- /dev/null
+++ b/storage/xtradb/include/lock0priv.ic
@@ -0,0 +1,49 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.c.
+I.e. lock/lock0lock.c contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_TYPE_MASK);
+}
+
+/* vim: set filetype=c: */
diff --git a/storage/xtradb/include/lock0types.h b/storage/xtradb/include/lock0types.h
new file mode 100644
index 00000000000..45f29e90fe9
--- /dev/null
+++ b/storage/xtradb/include/lock0types.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+typedef struct lock_struct lock_t;
+typedef struct lock_sys_struct lock_sys_t;
+
+/* Basic lock modes */
+enum lock_mode {
+ LOCK_IS = 0, /* intention shared */
+ LOCK_IX, /* intention exclusive */
+ LOCK_S, /* shared */
+ LOCK_X, /* exclusive */
+ LOCK_AUTO_INC, /* locks the auto-inc counter of a table
+ in an exclusive mode */
+ LOCK_NONE, /* this is used elsewhere to note consistent read */
+ LOCK_NUM = LOCK_NONE/* number of lock modes */
+};
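
For reference, an illustrative recreation of the usual InnoDB compatibility
rules among these basic modes (the authoritative matrix lives in
lock0lock.c; rows are the held mode, columns the requested mode):

	static const ibool lock_compat_sketch[LOCK_NUM][LOCK_NUM] = {
		/*	    IS	   IX	  S	 X	AI   */
		/* IS */ { TRUE,  TRUE,  TRUE,  FALSE, TRUE  },
		/* IX */ { TRUE,  TRUE,  FALSE, FALSE, TRUE  },
		/* S  */ { TRUE,  FALSE, TRUE,  FALSE, FALSE },
		/* X  */ { FALSE, FALSE, FALSE, FALSE, FALSE },
		/* AI */ { TRUE,  TRUE,  FALSE, FALSE, FALSE }
	};
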
+
+#endif
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
new file mode 100644
index 00000000000..8fce4ef96bc
--- /dev/null
+++ b/storage/xtradb/include/log0log.h
@@ -0,0 +1,969 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0log_h
+#define log0log_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/** Redo log buffer */
+typedef struct log_struct log_t;
+/** Redo log group */
+typedef struct log_group_struct log_group_t;
+
+#ifdef UNIV_DEBUG
+/** Flag: write to log file? */
+extern ibool log_do_write;
+/** Flag: enable debug output when writing to the log? */
+extern ibool log_debug_writes;
+#else /* UNIV_DEBUG */
+/** Write to log */
+# define log_do_write TRUE
+#endif /* UNIV_DEBUG */
+
+/** Wait modes for log_write_up_to @{ */
+#define LOG_NO_WAIT 91
+#define LOG_WAIT_ONE_GROUP 92
+#define LOG_WAIT_ALL_GROUPS 93
+/* @} */
+/** Maximum number of log groups in log_group_struct::checkpoint_buf */
+#define LOG_MAX_N_GROUPS 32
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
+so that we know that the limit has been written to a log checkpoint field
+on disk. */
+UNIV_INTERN
+void
+log_fsp_current_free_limit_set_and_checkpoint(
+/*==========================================*/
+ ulint limit); /*!< in: limit to set */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Calculates where in log files we find a specified lsn.
+@return log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+ ib_int64_t* log_file_offset, /*!< out: offset in that file
+ (including the header) */
+ ib_uint64_t first_header_lsn, /*!< in: first log file start
+ lsn */
+ ib_uint64_t lsn, /*!< in: lsn whose position to
+ determine */
+ ulint n_log_files, /*!< in: total number of log
+ files */
+ ib_int64_t log_file_size); /*!< in: log file size
+ (including the header) */
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes to the log the string given. The log must be released with
+log_release.
+@return end lsn of the log record, or zero if the write did not succeed */
+UNIV_INLINE
+ib_uint64_t
+log_reserve_and_write_fast(
+/*=======================*/
+ const void* str, /*!< in: string */
+ ulint len, /*!< in: string length */
+ ib_uint64_t* start_lsn);/*!< out: start lsn of the log record */
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void);
+/*=============*/
+/***********************************************************************//**
+Checks if there is a need for a log buffer flush or a new checkpoint, and
+performs one if so. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void);
+/*================*/
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release.
+@return start lsn of the log record */
+UNIV_INTERN
+ib_uint64_t
+log_reserve_and_open(
+/*=================*/
+ ulint len); /*!< in: length of data to be catenated */
+/************************************************************//**
+Writes to the log the string given. It is assumed that the caller holds the
+log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+ byte* str, /*!< in: string */
+ ulint str_len); /*!< in: string length */
+/************************************************************//**
+Closes the log.
+@return lsn */
+UNIV_INTERN
+ib_uint64_t
+log_close(void);
+/*===========*/
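
The functions above suggest the following fast-path/slow-path pattern for
appending a prepared redo record of len bytes at str (a byte pointer); this
is only a sketch of the calling convention described in the comments, the
real caller being the mini-transaction commit code:

	ib_uint64_t	start_lsn;
	ib_uint64_t	end_lsn;

	end_lsn = log_reserve_and_write_fast(str, len, &start_lsn);

	if (end_lsn == 0) {
		/* the fast write did not succeed: take the slower path */
		start_lsn = log_reserve_and_open(len);
		log_write_low(str, len);
		end_lsn = log_close();
	}

	log_release();
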
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_lsn(void);
+/*=============*/
+/****************************************************************//**
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+ulint
+log_get_capacity(void);
+/*==================*/
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void);
+/*==========*/
+/******************************************************************//**
+Inits a log group to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+ ulint id, /*!< in: group id */
+ ulint n_files, /*!< in: number of log files */
+ ulint file_size, /*!< in: log file size in bytes */
+ ulint space_id, /*!< in: space id of the file space
+ which contains the log files of this
+ group */
+ ulint archive_space_id); /*!< in: space id of the file space
+ which contains some archived log
+ files for this group; currently, only
+ for the first log group this is
+ used */
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+ log_group_t* group); /*!< in: log group */
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, it starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+ ib_uint64_t lsn, /*!< in: log sequence number up to which
+ the log should be written,
+ IB_ULONGLONG_MAX if not specified */
+ ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk);
+ /*!< in: TRUE if we want the written log
+ also to be flushed to disk */
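
A one-line usage sketch: to make a transaction's redo durable at commit,
the commit path would flush up to the commit record's lsn (commit_lsn is
assumed to be the lsn returned when that record was written):

	log_write_up_to(commit_lsn, LOG_WAIT_ONE_GROUP, TRUE);
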
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void);
+/*==========================*/
+/****************************************************************//**
+This function writes the log buffer to the log file and, if 'flush'
+is set, it forces a flush of the log file as well. This is meant to be
+called from the background master thread only, as it does not wait for
+the write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+	ibool	flush);	/*!< in: flush the logs to disk */
+/****************************************************************//**
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool and also may make a new checkpoint. NOTE: this function may only
+be called if the calling thread owns no synchronization objects!
+@return FALSE if there was a flush batch of the same type running,
+which means that we could not start this flush batch */
+UNIV_INTERN
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+ ib_uint64_t new_oldest, /*!< in: try to advance
+ oldest_modified_lsn at least
+ to this lsn */
+ ibool sync); /*!< in: TRUE if synchronous
+ operation is desired */
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks the lsn of the oldest
+modification in the pool, and writes information about that lsn to the
+log files. Use log_make_checkpoint_at to also flush the pool.
+@return TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is
+ desired */
+ ibool write_always); /*!< in: the function normally checks if the
+				new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+ ib_uint64_t lsn, /*!< in: make a checkpoint at this or a
+ later lsn, if IB_ULONGLONG_MAX, makes
+ a checkpoint at the latest lsn */
+ ibool write_always); /*!< in: the function normally checks if
+ the new checkpoint would have a
+ greater lsn than the previous one: if
+ not, then no physical write is done;
+ by setting this parameter TRUE, a
+ physical write will always be made to
+ log files */
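
Correspondingly, a hedged sketch of forcing a checkpoint at the latest lsn
with a guaranteed physical write, e.g. as part of a clean shutdown:

	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
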
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to the first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/******************************************************//**
+Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint field); /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ const byte* buf, /*!< in: buffer containing checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint* file_no,/*!< out: archived file number */
+ ulint* offset);/*!< out: archived file offset */
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+/********************************************************************//**
+Starts an archiving operation.
+@return TRUE if succeeded, FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is desired */
+ ulint* n_bytes);/*!< out: archive log buffer size, 0 if nothing to
+ archive */
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void);
+/*==================*/
+/****************************************************************//**
+Starts again archiving which has been stopped.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void);
+/*===================*/
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+/****************************************************************//**
+Start archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void);
+/*========================*/
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+ char* buf, /*!< in: buffer where to write */
+ ulint id, /*!< in: group id */
+ ulint file_no);/*!< in: file number */
+#else /* !UNIV_HOTBACKUP */
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+ byte* hdr_buf,/*!< in: buffer which will be written to the
+ start of the first log file */
+ ib_uint64_t start); /*!< in: lsn of the start of the first log file;
+ we pretend that there is a checkpoint at
+ start + LOG_BLOCK_HDR_SIZE */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+ ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */
+ byte* buf, /*!< in: buffer where to read */
+ log_group_t* group, /*!< in: log group */
+ ib_uint64_t start_lsn, /*!< in: read area start */
+ ib_uint64_t end_lsn); /*!< in: read area end */
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ byte* buf, /*!< in: buffer */
+ ulint len, /*!< in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset);/*!< in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+ log_group_t* group, /*!< in/out: group */
+ ib_uint64_t lsn); /*!< in: lsn for which the values should be
+ set */
+/******************************************************//**
+Calculates the data capacity of a log group, when the log file headers are not
+included.
+@return capacity in bytes */
+UNIV_INTERN
+ulint
+log_group_get_capacity(
+/*===================*/
+ const log_group_t* group); /*!< in: log group */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len); /*!< in: data length */
+/************************************************************//**
+Calculates the checksum for a log block.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+ const byte* block); /*!< in: log block */
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum); /*!< in: checksum */
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset); /*!< in: offset, 0 if none */
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block); /*!< in: log block */
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Initializes a log block in the log buffer in the old, < 3.23.52 format, where
+there was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn); /*!< in: lsn within the log block */
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ ib_uint64_t lsn); /*!< in: lsn of a byte within the block */
+/******************************************************//**
+Prints info of the log. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+ FILE* file); /*!< in: file where to print */
+/******************************************************//**
+Peeks the current lsn.
+@return TRUE if success, FALSE if could not get the log system mutex */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+ ib_uint64_t* lsn); /*!< out: if returns TRUE, current lsn is here */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void);
+/*===================*/
+/**********************************************************//**
+Shutdown the log system but do not release all the memory. */
+UNIV_INTERN
+void
+log_shutdown(void);
+/*==============*/
+/**********************************************************//**
+Free the log system data structures. */
+UNIV_INTERN
+void
+log_mem_free(void);
+/*==============*/
+
+extern log_t* log_sys;
+
+/* Values used as flags */
+#define LOG_FLUSH 7652559
+#define LOG_CHECKPOINT 78656949
+#ifdef UNIV_LOG_ARCHIVE
+# define LOG_ARCHIVE 11122331
+#endif /* UNIV_LOG_ARCHIVE */
+#define LOG_RECOVER 98887331
+
+/* The counting of lsn's starts from this value: this must be non-zero */
+#define LOG_START_LSN ((ib_uint64_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+
+#define LOG_BUFFER_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE)
+#define LOG_ARCHIVE_BUF_SIZE (srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
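+/* Worked example: srv_log_buffer_size is counted in pages, so with the
+default 16 kB UNIV_PAGE_SIZE and a hypothetical srv_log_buffer_size of 8,
+LOG_BUFFER_SIZE is 8 * 16384 = 128 kB and LOG_ARCHIVE_BUF_SIZE is a quarter
+of that, 32 kB. */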
+
+/* Offsets of a log block header */
+#define LOG_BLOCK_HDR_NO 0 /* block number which must be > 0 and
+ is allowed to wrap around at 2G; the
+ highest bit is set to 1 if this is the
+ first log block in a log flush write
+ segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+ /* mask used to get the highest bit in
+ the preceding field */
+#define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to
+ this block */
+#define LOG_BLOCK_FIRST_REC_GROUP 6 /* offset of the first start of an
+ mtr log record group in this log block,
+ 0 if none; if the value is the same
+ as LOG_BLOCK_HDR_DATA_LEN, it means
+ that the first rec group has not yet
+ been catenated to this log block; if
+ one is, it will start at this
+ offset; an archive recovery can
+ start parsing the log records starting
+ from this offset in this log block,
+ if value not 0 */
+#define LOG_BLOCK_CHECKPOINT_NO 8 /* 4 lower bytes of the value of
+ log_sys->next_checkpoint_no when the
+ log block was last written to: if the
+ block has not yet been written full,
+ this value is only updated before a
+ log buffer flush */
+#define LOG_BLOCK_HDR_SIZE 12 /* size of the log block header in
+ bytes */
+
+/* Offsets of a log block trailer from the end of the block */
+#define LOG_BLOCK_CHECKSUM 4 /* 4 byte checksum of the log block
+ contents; in InnoDB versions
+ < 3.23.52 this did not contain the
+ checksum but the same value as
+ .._HDR_NO */
+#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */
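+/* Worked example of the block layout implied by the constants above:
+assuming the usual OS_FILE_LOG_BLOCK_SIZE of 512 bytes, each block carries
+512 - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_TRL_SIZE = 512 - 12 - 4 = 496 bytes of
+log record payload. A hypothetical helper expressing this: */
+#define LOG_BLOCK_PAYLOAD_SIZE_SKETCH (OS_FILE_LOG_BLOCK_SIZE \
+ - LOG_BLOCK_HDR_SIZE - LOG_BLOCK_TRL_SIZE)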
+
+/* Offsets for a checkpoint field */
+#define LOG_CHECKPOINT_NO 0
+#define LOG_CHECKPOINT_LSN 8
+#define LOG_CHECKPOINT_OFFSET 16
+#define LOG_CHECKPOINT_LOG_BUF_SIZE 20
+#define LOG_CHECKPOINT_ARCHIVED_LSN 24
+#define LOG_CHECKPOINT_GROUP_ARRAY 32
+
+/* For each value smaller than LOG_MAX_N_GROUPS the following 8 bytes: */
+
+#define LOG_CHECKPOINT_ARCHIVED_FILE_NO 0
+#define LOG_CHECKPOINT_ARCHIVED_OFFSET 4
+
+#define LOG_CHECKPOINT_ARRAY_END (LOG_CHECKPOINT_GROUP_ARRAY\
+ + LOG_MAX_N_GROUPS * 8)
+#define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END
+#define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END)
+ /* current fsp free limit in
+ tablespace 0, in units of one
+ megabyte; this information is only used
+ by ibbackup to decide if it can
+ truncate unused ends of
+ non-auto-extending data files in space
+ 0 */
+#define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END)
+ /* this magic number tells if the
+ checkpoint contains the above field:
+ the field was added to
+ InnoDB-3.23.50 */
+#define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END)
+
+#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243
+
+/* Offsets of a log file header */
+#define LOG_GROUP_ID 0 /* log group number */
+#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this
+ log file */
+#define LOG_FILE_NO 12 /* 4-byte archived log file number;
+ this field is only defined in an
+ archived log file */
+#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16
+ /* a 32-byte field which contains
+ the string 'ibbackup' and the
+ creation time if the log file was
+ created by ibbackup --restore;
+ when mysqld is first time started
+ on the restored database, it can
+ print helpful info for the user */
+#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE
+ /* this 4-byte field is TRUE when
+ the writing of an archived log file
+ has been completed; this field is
+ only defined in an archived log file */
+#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4)
+ /* lsn where the archived log file
+ at least extends: actually the
+ archived log file may extend to a
+ later lsn, as long as it is within the
+ same log block as this lsn; this field
+ is defined only when an archived log
+ file has been completely written */
+#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE
+ /* first checkpoint field in the log
+ header; we write alternately to the
+ checkpoint fields when we make new
+ checkpoints; this field is only defined
+ in the first log file of a log group */
+#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE)
+ /* second checkpoint field in the log
+ header */
+#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+#define LOG_GROUP_OK 301
+#define LOG_GROUP_CORRUPTED 302
+
+/** A log group consists of a number of log files, each of the same size; a log
+group is implemented as a space in the sense of the module fil0fil. */
+struct log_group_struct{
+ /* The following fields are protected by log_sys->mutex */
+ ulint id; /*!< log group id */
+ ulint n_files; /*!< number of files in the group */
+ ulint file_size; /*!< individual log file size in bytes,
+ including the log file header */
+ ulint space_id; /*!< file space which implements the log
+ group */
+ ulint state; /*!< LOG_GROUP_OK or
+ LOG_GROUP_CORRUPTED */
+ ib_uint64_t lsn; /*!< lsn used to fix coordinates within
+ the log group */
+ ulint lsn_offset; /*!< the offset of the above lsn */
+ ulint n_pending_writes;/*!< number of currently pending flush
+ writes for this log group */
+ byte** file_header_bufs_ptr;/*!< unaligned buffers */
+ byte** file_header_bufs;/*!< buffers for each file
+ header in the group */
+#ifdef UNIV_LOG_ARCHIVE
+ /*-----------------------------*/
+ byte** archive_file_header_bufs_ptr;/*!< unaligned buffers */
+ byte** archive_file_header_bufs;/*!< buffers for each file
+ header in the group */
+ ulint archive_space_id;/*!< file space which
+ implements the log group
+ archive */
+ ulint archived_file_no;/*!< file number corresponding to
+ log_sys->archived_lsn */
+ ulint archived_offset;/*!< file offset corresponding to
+ log_sys->archived_lsn, 0 if we have
+ not yet written to the archive file
+ number archived_file_no */
+ ulint next_archived_file_no;/*!< during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_file_no here: the write
+ completion function then sets the new
+ value to ..._file_no */
+ ulint next_archived_offset; /*!< like the preceding field */
+#endif /* UNIV_LOG_ARCHIVE */
+ /*-----------------------------*/
+ ib_uint64_t scanned_lsn; /*!< used only in recovery: recovery scan
+ succeeded up to this lsn in this log
+ group */
+ byte* checkpoint_buf_ptr;/*!< unaligned checkpoint header */
+ byte* checkpoint_buf; /*!< checkpoint header is written from
+ this buffer to the group */
+ UT_LIST_NODE_T(log_group_t)
+ log_groups; /*!< list of log groups */
+};
+
+/** Redo log buffer */
+struct log_struct{
+ byte pad[64]; /*!< padding to prevent other memory
+ update hotspots from residing on the
+ same memory cache line */
+ ib_uint64_t lsn; /*!< log sequence number */
+ ulint buf_free; /*!< first free offset within the log
+ buffer */
+#ifndef UNIV_HOTBACKUP
+ mutex_t mutex; /*!< mutex protecting the log */
+#endif /* !UNIV_HOTBACKUP */
+ byte* buf_ptr; /*!< unaligned log buffer */
+ byte* buf; /*!< log buffer */
+ ulint buf_size; /*!< log buffer size in bytes */
+ ulint max_buf_free; /*!< recommended maximum value of
+ buf_free, after which the buffer is
+ flushed */
+ ulint old_buf_free; /*!< value of buf free when log was
+ last time opened; only in the debug
+ version */
+ ib_uint64_t old_lsn; /*!< value of lsn when log was
+ last time opened; only in the
+ debug version */
+ ibool check_flush_or_checkpoint;
+ /*!< this is set to TRUE when there may
+ be need to flush the log buffer, or
+ preflush buffer pool pages, or make
+ a checkpoint; this MUST be TRUE when
+ lsn - last_checkpoint_lsn >
+ max_checkpoint_age; this flag is
+ peeked at by log_free_check(), which
+ does not reserve the log mutex */
+ UT_LIST_BASE_NODE_T(log_group_t)
+ log_groups; /*!< log groups */
+
+#ifndef UNIV_HOTBACKUP
+ /** The fields involved in the log buffer flush @{ */
+
+ ulint buf_next_to_write;/*!< first offset in the log buffer
+ whose byte content may not yet have
+ been written to file, e.g., the start
+ offset of a log record catenated
+ later; this is advanced when a flush
+ operation is completed to all the log
+ groups */
+ ib_uint64_t written_to_some_lsn;
+ /*!< first log sequence number not yet
+ written to any log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for any
+ one log group */
+ ib_uint64_t written_to_all_lsn;
+ /*!< first log sequence number not yet
+ written to some log group; for this to
+ be advanced, it is enough that the
+ write i/o has been completed for all
+ log groups.
+ Note that since InnoDB currently
+ has only one log group, this
+ value is redundant. It is also
+ possible that this value
+ transiently falls behind
+ flushed_to_disk_lsn.
+ It is appropriate to use either
+ flushed_to_disk_lsn or
+ write_lsn, which are always
+ up-to-date and accurate. */
+ ib_uint64_t write_lsn; /*!< end lsn for the current running
+ write */
+ ulint write_end_offset;/*!< the data in buffer has
+ been written up to this offset
+ when the current write ends:
+ this field will then be copied
+ to buf_next_to_write */
+ ib_uint64_t current_flush_lsn;/*!< end lsn for the current running
+ write + flush operation */
+ ib_uint64_t flushed_to_disk_lsn;
+ /*!< how far we have written the log
+ AND flushed to disk */
+ ulint n_pending_writes;/*!< number of currently
+ pending flushes or writes */
+ /* NOTE on the 'flush' in names of the fields below: starting from
+ 4.0.14, we separate the write of the log file and the actual fsync()
+ or other method to flush it to disk. The names below should really
+ be 'flush_or_write'! */
+ os_event_t no_flush_event; /*!< this event is in the reset state
+ when a flush or a write is running;
+ a thread should wait for this without
+ owning the log mutex, but NOTE that
+ to set or reset this event, the
+ thread MUST own the log mutex! */
+ ibool one_flushed; /*!< during a flush, this is
+ first FALSE and becomes TRUE
+ when one log group has been
+ written or flushed */
+ os_event_t one_flushed_event;/*!< this event is reset when the
+ flush or write has not yet completed
+ for any log group; e.g., this means
+ that a transaction has been committed
+ when this is set; a thread should wait
+ for this without owning the log mutex,
+ but NOTE that to set or reset this
+ event, the thread MUST own the log
+ mutex! */
+ ulint n_log_ios; /*!< number of log i/os initiated thus
+ far */
+ ulint n_log_ios_old; /*!< number of log i/o's at the
+ previous printout */
+ time_t last_printout_time;/*!< when log_print was last time
+ called */
+ /* @} */
+
+ /** Fields involved in checkpoints @{ */
+ ulint log_group_capacity; /*!< capacity of the log group; if
+ the checkpoint age exceeds this, it is
+ a serious error because it is possible
+ we will then overwrite log and spoil
+ crash recovery */
+ ulint max_modified_age_async;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool_get_oldest_modification()
+ is exceeded, we start an
+ asynchronous preflush of pool pages */
+ ulint max_modified_age_sync;
+ /*!< when this recommended
+ value for lsn -
+ buf_pool_get_oldest_modification()
+ is exceeded, we start a
+ synchronous preflush of pool pages */
+ ulint adm_checkpoint_interval;
+ /*!< administrator-specified checkpoint
+ interval in terms of log growth in
+ bytes; the interval actually used by
+ the database can be smaller */
+ ulint max_checkpoint_age_async;
+ /*!< when this checkpoint age
+ is exceeded we start an
+ asynchronous writing of a new
+ checkpoint */
+ ulint max_checkpoint_age;
+ /*!< this is the maximum allowed value
+ for lsn - last_checkpoint_lsn when a
+ new query step is started */
+ ib_uint64_t next_checkpoint_no;
+ /*!< next checkpoint number */
+ ib_uint64_t last_checkpoint_lsn;
+ /*!< latest checkpoint lsn */
+ ib_uint64_t next_checkpoint_lsn;
+ /*!< next checkpoint lsn */
+ ulint n_pending_checkpoint_writes;
+ /*!< number of currently pending
+ checkpoint writes */
+ rw_lock_t checkpoint_lock;/*!< this latch is x-locked when a
+ checkpoint write is running; a thread
+ should wait for this without owning
+ the log mutex */
+#endif /* !UNIV_HOTBACKUP */
+ byte* checkpoint_buf_ptr;/*!< unaligned checkpoint header */
+ byte* checkpoint_buf; /*!< checkpoint header is read to this
+ buffer */
+ /* @} */
+#ifdef UNIV_LOG_ARCHIVE
+ /** Fields involved in archiving @{ */
+ ulint archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING,
+ LOG_ARCH_STOPPED, or LOG_ARCH_OFF */
+ ib_uint64_t archived_lsn; /*!< archiving has advanced to this
+ lsn */
+ ulint max_archived_lsn_age_async;
+ /*!< recommended maximum age of
+ archived_lsn, before we start
+ asynchronous copying to the archive */
+ ulint max_archived_lsn_age;
+ /*!< maximum allowed age for
+ archived_lsn */
+ ib_uint64_t next_archived_lsn;/*!< during an archive write,
+ until the write is completed, we
+ store the next value for
+ archived_lsn here: the write
+ completion function then sets the new
+ value to archived_lsn */
+ ulint archiving_phase;/*!< LOG_ARCHIVE_READ or
+ LOG_ARCHIVE_WRITE */
+ ulint n_pending_archive_ios;
+ /*!< number of currently pending reads
+ or writes in archiving */
+ rw_lock_t archive_lock; /*!< this latch is x-locked when an
+ archive write is running; a thread
+ should wait for this without owning
+ the log mutex */
+ ulint archive_buf_size;/*!< size of archive_buf */
+ byte* archive_buf; /*!< log segment is written to the
+ archive from this buffer */
+ os_event_t archiving_on; /*!< if archiving has been stopped,
+ a thread can wait for this event to
+ become signaled */
+ /* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+};
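+/* A minimal sketch (the helper name is hypothetical) tying together the
+checkpoint age fields above: the age is the distance from the last
+checkpoint to the current lsn, and exceeding max_checkpoint_age is the
+condition that log_free_check()/log_check_margins() must prevent. The
+caller is assumed to hold log_sys->mutex. */
+UNIV_INLINE
+ibool
+log_checkpoint_age_exceeded_sketch(void)
+/*====================================*/
+{
+ ib_uint64_t age = log_sys->lsn - log_sys->last_checkpoint_lsn;
+
+ return(age > (ib_uint64_t) log_sys->max_checkpoint_age);
+}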
+
+#ifdef UNIV_LOG_ARCHIVE
+/** Archiving state @{ */
+#define LOG_ARCH_ON 71
+#define LOG_ARCH_STOPPING 72
+#define LOG_ARCH_STOPPING2 73
+#define LOG_ARCH_STOPPED 74
+#define LOG_ARCH_OFF 75
+/* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic
new file mode 100644
index 00000000000..1ce00fd7313
--- /dev/null
+++ b/storage/xtradb/include/log0log.ic
@@ -0,0 +1,446 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.ic
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "mach0data.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_LOG_DEBUG
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+ const byte* buf, /*!< in: pointer to the start of
+ the log segment in the
+ log_sys->buf log buffer */
+ ulint len, /*!< in: segment length in bytes */
+ ib_uint64_t buf_start_lsn); /*!< in: buffer start lsn */
+#endif /* UNIV_LOG_DEBUG */
+
+/************************************************************//**
+Gets a log block flush bit.
+@return TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+ const byte* log_block) /*!< in: log block */
+{
+ if (LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/************************************************************//**
+Sets the log block flush bit. */
+UNIV_INLINE
+void
+log_block_set_flush_bit(
+/*====================*/
+ byte* log_block, /*!< in/out: log block */
+ ibool val) /*!< in: value to set */
+{
+ ulint field;
+
+ field = mach_read_from_4(log_block + LOG_BLOCK_HDR_NO);
+
+ if (val) {
+ field = field | LOG_BLOCK_FLUSH_BIT_MASK;
+ } else {
+ field = field & ~LOG_BLOCK_FLUSH_BIT_MASK;
+ }
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, field);
+}
+
+/************************************************************//**
+Gets a log block number stored in the header.
+@return log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(~LOG_BLOCK_FLUSH_BIT_MASK
+ & mach_read_from_4(log_block + LOG_BLOCK_HDR_NO));
+}
+
+/************************************************************//**
+Sets the log block number stored in the header; NOTE that this must be set
+before the flush bit! */
+UNIV_INLINE
+void
+log_block_set_hdr_no(
+/*=================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint n) /*!< in: log block number: must be > 0 and
+ < LOG_BLOCK_FLUSH_BIT_MASK */
+{
+ ut_ad(n > 0);
+ ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+
+ mach_write_to_4(log_block + LOG_BLOCK_HDR_NO, n);
+}
+
+/************************************************************//**
+Gets a log block data length.
+@return log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_HDR_DATA_LEN));
+}
+
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint len) /*!< in: data length */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_HDR_DATA_LEN, len);
+}
+
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_2(log_block + LOG_BLOCK_FIRST_REC_GROUP));
+}
+
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint offset) /*!< in: offset, 0 if none */
+{
+ mach_write_to_2(log_block + LOG_BLOCK_FIRST_REC_GROUP, offset);
+}
+
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_4(log_block + LOG_BLOCK_CHECKPOINT_NO));
+}
+
+/************************************************************//**
+Sets a log block checkpoint number field (4 lowest bytes). */
+UNIV_INLINE
+void
+log_block_set_checkpoint_no(
+/*========================*/
+ byte* log_block, /*!< in/out: log block */
+ ib_uint64_t no) /*!< in: checkpoint no */
+{
+ mach_write_to_4(log_block + LOG_BLOCK_CHECKPOINT_NO, (ulint) no);
+}
+
+/************************************************************//**
+Converts a lsn to a log block number.
+@return log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+ ib_uint64_t lsn) /*!< in: lsn of a byte within the block */
+{
+ return(((ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE) & 0x3FFFFFFFUL) + 1);
+}
+
+/************************************************************//**
+Calculates the checksum for a log block.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+ const byte* block) /*!< in: log block */
+{
+ ulint sum;
+ ulint sh;
+ ulint i;
+
+ sum = 1;
+ sh = 0;
+
+ for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) {
+ ulint b = (ulint) block[i];
+ sum &= 0x7FFFFFFFUL;
+ sum += b;
+ sum += b << sh;
+ sh++;
+ if (sh > 24) {
+ sh = 0;
+ }
+ }
+
+ return(sum);
+}
+
+/************************************************************//**
+Gets a log block checksum field value.
+@return checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+ const byte* log_block) /*!< in: log block */
+{
+ return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM));
+}
+
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+ byte* log_block, /*!< in/out: log block */
+ ulint checksum) /*!< in: checksum */
+{
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM,
+ checksum);
+}
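+/************************************************************//**
+A minimal verification sketch (the helper name is hypothetical): a block
+read back from a log file is considered intact when its stored checksum
+matches the recomputed one; recovery additionally has to tolerate the old
+format initialized by log_block_init_in_old_format() below, where the field
+held the block number instead.
+@return TRUE if the stored and computed checksums match */
+UNIV_INLINE
+ibool
+log_block_checksum_is_ok_sketch(
+/*============================*/
+ const byte* log_block) /*!< in: log block read from a log file */
+{
+ return(log_block_calc_checksum(log_block)
+ == log_block_get_checksum(log_block));
+}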
+
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer in the old format, where there
+was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+ byte* log_block, /*!< in: pointer to the log buffer */
+ ib_uint64_t lsn) /*!< in: lsn within the log block */
+{
+ ulint no;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ no = log_block_convert_lsn_to_no(lsn);
+
+ log_block_set_hdr_no(log_block, no);
+ mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE
+ - LOG_BLOCK_CHECKSUM, no);
+ log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+ log_block_set_first_rec_group(log_block, 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes the given string to the log. The log must be released with
+log_release().
+@return end lsn of the log record, zero if it did not succeed */
+UNIV_INLINE
+ib_uint64_t
+log_reserve_and_write_fast(
+/*=======================*/
+ const void* str, /*!< in: string */
+ ulint len, /*!< in: string length */
+ ib_uint64_t* start_lsn)/*!< out: start lsn of the log record */
+{
+ ulint data_len;
+#ifdef UNIV_LOG_LSN_DEBUG
+ /* length of the LSN pseudo-record */
+ ulint lsn_len;
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ mutex_enter(&log_sys->mutex);
+#ifdef UNIV_LOG_LSN_DEBUG
+ lsn_len = 1
+ + mach_get_compressed_size(log_sys->lsn >> 32)
+ + mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ data_len = len
+#ifdef UNIV_LOG_LSN_DEBUG
+ + lsn_len
+#endif /* UNIV_LOG_LSN_DEBUG */
+ + log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+ if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+ /* The string does not fit within the current log block
+ or the log block would become full */
+
+ mutex_exit(&log_sys->mutex);
+
+ return(0);
+ }
+
+ *start_lsn = log_sys->lsn;
+
+#ifdef UNIV_LOG_LSN_DEBUG
+ {
+ /* Write the LSN pseudo-record. */
+ byte* b = &log_sys->buf[log_sys->buf_free];
+ *b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str);
+ /* Write the LSN in two parts,
+ as a pseudo page number and space id. */
+ b += mach_write_compressed(b, log_sys->lsn >> 32);
+ b += mach_write_compressed(b, log_sys->lsn & 0xFFFFFFFFUL);
+ ut_a(b - lsn_len == &log_sys->buf[log_sys->buf_free]);
+
+ memcpy(b, str, len);
+ len += lsn_len;
+ }
+#else /* UNIV_LOG_LSN_DEBUG */
+ memcpy(log_sys->buf + log_sys->buf_free, str, len);
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ log_block_set_data_len((byte*) ut_align_down(log_sys->buf
+ + log_sys->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE),
+ data_len);
+#ifdef UNIV_LOG_DEBUG
+ log_sys->old_buf_free = log_sys->buf_free;
+ log_sys->old_lsn = log_sys->lsn;
+#endif
+ log_sys->buf_free += len;
+
+ ut_ad(log_sys->buf_free <= log_sys->buf_size);
+
+ log_sys->lsn += len;
+
+#ifdef UNIV_LOG_DEBUG
+ log_check_log_recs(log_sys->buf + log_sys->old_buf_free,
+ log_sys->buf_free - log_sys->old_buf_free,
+ log_sys->old_lsn);
+#endif
+ return(log_sys->lsn);
+}
+
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void)
+/*=============*/
+{
+ mutex_exit(&(log_sys->mutex));
+}
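+/***********************************************************************//**
+A minimal usage sketch of the fast path above (the wrapper name and variable
+names are hypothetical): if the record fits in the current log block the end
+lsn is returned and the caller must release the log mutex with
+log_release(); a zero return means log_reserve_and_write_fast() has already
+released the mutex and the caller must fall back to the ordinary, slower
+reservation path.
+@return end lsn of the written record, zero if the fast path failed */
+UNIV_INLINE
+ib_uint64_t
+log_write_record_fast_sketch(
+/*=========================*/
+ const byte* rec, /*!< in: log record bytes */
+ ulint rec_len) /*!< in: length of rec in bytes */
+{
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+
+ end_lsn = log_reserve_and_write_fast(rec, rec_len, &start_lsn);
+
+ if (end_lsn != 0) {
+ /* Success: the log mutex is still held and must be released. */
+ log_release();
+ }
+
+ return(end_lsn);
+}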
+
+/************************************************************//**
+Gets the current lsn.
+@return current lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_lsn(void)
+/*=============*/
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(lsn);
+}
+
+/****************************************************************//**
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return log group capacity */
+UNIV_INLINE
+ulint
+log_get_capacity(void)
+/*==================*/
+{
+ return(log_sys->log_group_capacity);
+}
+
+/***********************************************************************//**
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void)
+/*================*/
+{
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (log_sys->check_flush_or_checkpoint) {
+
+ log_check_margins();
+ }
+}
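+/* A minimal usage sketch: a routine that dirties pages in several steps
+calls log_free_check() before each step, while it owns no latches (except
+possibly the dictionary mutex, per the comment above), so that the log can
+be flushed or a checkpoint made once the margins are consumed. The loop
+body is purely illustrative. */
+UNIV_INLINE
+void
+log_free_check_loop_sketch(
+/*=======================*/
+ ulint n_steps) /*!< in: number of work units to perform */
+{
+ ulint i;
+
+ for (i = 0; i < n_steps; i++) {
+ log_free_check();
+
+ /* ... modify a handful of pages within a mini-transaction ... */
+ }
+}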
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h
new file mode 100644
index 00000000000..15065267250
--- /dev/null
+++ b/storage/xtradb/include/log0recv.h
@@ -0,0 +1,530 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0recv_h
+#define log0recv_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "log0log.h"
+
+#ifdef UNIV_HOTBACKUP
+extern ibool recv_replay_file_ops;
+
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+recv_read_cp_info_for_backup(
+/*=========================*/
+ const byte* hdr, /*!< in: buffer containing the log group
+ header */
+ ib_uint64_t* lsn, /*!< out: checkpoint lsn */
+ ulint* offset, /*!< out: checkpoint offset in the log group */
+ ulint* fsp_limit,/*!< out: fsp limit of space 0,
+ 1000000000 if the database is running
+ with < version 3.23.50 of InnoDB */
+ ib_uint64_t* cp_no, /*!< out: checkpoint number */
+ ib_uint64_t* first_header_lsn);
+ /*!< out: lsn of the start of the
+ first log file */
+/*******************************************************************//**
+Scans the log segment and n_bytes_scanned is set to the length of valid
+log scanned. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+ byte* buf, /*!< in: buffer containing log data */
+ ulint buf_len, /*!< in: data length in that buffer */
+ ib_uint64_t* scanned_lsn, /*!< in/out: lsn of buffer start,
+ we return scanned lsn */
+ ulint* scanned_checkpoint_no,
+ /*!< in/out: 4 lowest bytes of the
+ highest scanned checkpoint number so
+ far */
+ ulint* n_bytes_scanned);/*!< out: how much we were able to
+ scan, smaller than buf_len if log
+ data ended here */
+#endif /* UNIV_HOTBACKUP */
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void);
+/*=====================*/
+#ifdef UNIV_LOG_ARCHIVE
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void);
+/*=================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+ ibool just_read_in,
+ /*!< in: TRUE if the i/o handler calls
+ this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+ buf_block_t* block); /*!< in/out: buffer block */
+#ifndef UNIV_HOTBACKUP
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri in: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block in/out: the buffer block
+*/
+# define recv_recover_page(jri, block) recv_recover_page_func(jri, block)
+#else /* !UNIV_HOTBACKUP */
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri in: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block in/out: the buffer block
+*/
+# define recv_recover_page(jri, block) recv_recover_page_func(block)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint type, /*!< in: LOG_CHECKPOINT or
+ LOG_ARCHIVE */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn
+ if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn from
+ data files */
+ ib_uint64_t max_flushed_lsn);/*!< in: max flushed lsn from
+ data files */
+#ifdef UNIV_LOG_ARCHIVE
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type in: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim in: recover up to this log sequence number if possible
+@param min in: minimum flushed log sequence number from data files
+@param max in: maximum flushed log sequence number from data files
+@return error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max) \
+ recv_recovery_from_checkpoint_start_func(type,lim,min,max)
+#else /* UNIV_LOG_ARCHIVE */
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type ignored: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim ignored: recover up to this log sequence number if possible
+@param min in: minimum flushed log sequence number from data files
+@param max in: maximum flushed log sequence number from data files
+@return error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max) \
+ recv_recovery_from_checkpoint_start_func(min,max)
+#endif /* UNIV_LOG_ARCHIVE */
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void);
+/*======================================*/
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void);
+/*===============================*/
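+/********************************************************//**
+A plausible driver sketch (the function name is hypothetical; the real call
+sites are in the server startup code and may differ in detail): recovery is
+started from the flushed-lsn range found in the data files, the server can
+then begin processing new transactions, and the finish and rollback steps
+are run later to free recovery resources and roll back incomplete
+transactions.
+@return error code or DB_SUCCESS */
+UNIV_INLINE
+ulint
+recv_run_crash_recovery_sketch(
+/*===========================*/
+ ib_uint64_t min_flushed_lsn, /*!< in: min flushed lsn from data files */
+ ib_uint64_t max_flushed_lsn) /*!< in: max flushed lsn from data files */
+{
+ ulint err;
+
+ err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT,
+ IB_ULONGLONG_MAX,
+ min_flushed_lsn,
+ max_flushed_lsn);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* ... normal processing may begin while the finish step is pending ... */
+
+ recv_recovery_from_checkpoint_finish();
+ recv_recovery_rollback_active();
+
+ return(DB_SUCCESS);
+}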
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data found. Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or not able to scan any
+more in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+ ulint available_memory,/*!< in: we let the hash table of recs
+ to grow to this size, at the maximum */
+ ibool store_to_hash, /*!< in: TRUE if the records should be
+ stored to the hash table; this is set
+ to FALSE if just debug checking is
+ needed */
+ const byte* buf, /*!< in: buffer containing a log
+ segment or garbage */
+ ulint len, /*!< in: buffer length */
+ ib_uint64_t start_lsn, /*!< in: buffer start lsn */
+ ib_uint64_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ ib_uint64_t* group_scanned_lsn);/*!< out: scanning succeeded up to
+ this lsn */
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+ ib_uint64_t lsn, /*!< in: reset to this lsn
+ rounded up to be divisible by
+ OS_FILE_LOG_BLOCK_SIZE, after
+ which we add
+ LOG_BLOCK_HDR_SIZE */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint arch_log_no, /*!< in: next archived log file number */
+#endif /* UNIV_LOG_ARCHIVE */
+ ibool new_logs_created);/*!< in: TRUE if resetting logs
+ is done at the log creation;
+ FALSE if it is done after
+ archive recovery */
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+ const char* log_dir, /*!< in: log file directory path */
+ ulint n_log_files, /*!< in: number of log files */
+ ulint log_file_size, /*!< in: log file size */
+ ib_uint64_t lsn); /*!< in: new start lsn, must be
+ divisible by OS_FILE_LOG_BLOCK_SIZE */
+#endif /* UNIV_HOTBACKUP */
+/********************************************************//**
+Creates the recovery system. */
+UNIV_INTERN
+void
+recv_sys_create(void);
+/*=================*/
+/**********************************************************//**
+Release recovery system mutexes. */
+UNIV_INTERN
+void
+recv_sys_close(void);
+/*================*/
+/********************************************************//**
+Frees the recovery system memory. */
+UNIV_INTERN
+void
+recv_sys_mem_free(void);
+/*===================*/
+/********************************************************//**
+Inits the recovery system for a recovery operation. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+ ulint available_memory); /*!< in: available memory in bytes */
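+/********************************************************//**
+A minimal initialization sketch (the wrapper name is hypothetical): the
+recovery system is created once and then sized with the amount of memory it
+may use for the hash table of stored log records, typically derived from
+the buffer pool size. */
+UNIV_INLINE
+void
+recv_sys_setup_sketch(
+/*==================*/
+ ulint available_mem) /*!< in: bytes the recovery hash table may grow to */
+{
+ recv_sys_create();
+ recv_sys_init(available_mem);
+}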
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Reset the state of the recovery system variables. */
+UNIV_INTERN
+void
+recv_sys_var_init(void);
+/*===================*/
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+ ibool allow_ibuf); /*!< in: if TRUE, also ibuf operations are
+ allowed during the application; if FALSE,
+ no ibuf operations are allowed, and after
+ the application all file pages are flushed to
+ disk and invalidated in buffer pool: this
+ alternative means that no new log records
+ can be generated during the application */
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Applies log records in the hash table to a backup. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void);
+/*================================*/
+#endif
+#ifdef UNIV_LOG_ARCHIVE
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn field from the
+ data files */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn if
+ possible */
+ ulint first_log_no); /*!< in: number of the first archived
+ log file to use in the recovery; the
+ file will be searched from
+ INNOBASE_LOG_ARCH_DIR specified in
+ server config file */
+/********************************************************//**
+Completes recovery from archive. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void);
+/*===================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Block of log record data */
+typedef struct recv_data_struct recv_data_t;
+/** Block of log record data */
+struct recv_data_struct{
+ recv_data_t* next; /*!< pointer to the next block or NULL */
+ /*!< the log record data is stored physically
+ immediately after this struct, max amount
+ RECV_DATA_BLOCK_SIZE bytes of it */
+};
+
+/** Stored log record struct */
+typedef struct recv_struct recv_t;
+/** Stored log record struct */
+struct recv_struct{
+ byte type; /*!< log record type */
+ ulint len; /*!< log record body length in bytes */
+ recv_data_t* data; /*!< chain of blocks containing the log record
+ body */
+ ib_uint64_t start_lsn;/*!< start lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the start lsn of
+ this log record */
+ ib_uint64_t end_lsn;/*!< end lsn of the log segment written by
+ the mtr which generated this log record: NOTE
+ that this is not necessarily the end lsn of
+ this log record */
+ UT_LIST_NODE_T(recv_t)
+ rec_list;/*!< list of log records for this page */
+};
+
+/** States of recv_addr_struct */
+enum recv_addr_state {
+ /** not yet processed */
+ RECV_NOT_PROCESSED,
+ /** page is being read */
+ RECV_BEING_READ,
+ /** log records are being applied on the page */
+ RECV_BEING_PROCESSED,
+ /** log records have been applied on the page, or they have
+ been discarded because the tablespace does not exist */
+ RECV_PROCESSED
+};
+
+/** Hashed page file address struct */
+typedef struct recv_addr_struct recv_addr_t;
+/** Hashed page file address struct */
+struct recv_addr_struct{
+ enum recv_addr_state state;
+ /*!< recovery state of the page */
+ unsigned space:32;/*!< space id */
+ unsigned page_no:32;/*!< page number */
+ UT_LIST_BASE_NODE_T(recv_t)
+ rec_list;/*!< list of log records for this page */
+ hash_node_t addr_hash;/*!< hash node in the hash bucket chain */
+};
+
+/** Recovery system data structure */
+typedef struct recv_sys_struct recv_sys_t;
+/** Recovery system data structure */
+struct recv_sys_struct{
+#ifndef UNIV_HOTBACKUP
+ mutex_t mutex; /*!< mutex protecting the fields apply_log_recs,
+ n_addrs, and the state field in each recv_addr
+ struct */
+#endif /* !UNIV_HOTBACKUP */
+ ibool apply_log_recs;
+ /*!< this is TRUE when log rec application to
+ pages is allowed; this flag tells the
+ i/o-handler if it should do log record
+ application */
+ ibool apply_batch_on;
+ /*!< this is TRUE when a log rec application
+ batch is running */
+ ib_uint64_t lsn; /*!< log sequence number */
+ ulint last_log_buf_size;
+ /*!< size of the log buffer when the database
+ last time wrote to the log */
+ byte* last_block;
+ /*!< possible incomplete last recovered log
+ block */
+ byte* last_block_buf_start;
+ /*!< the nonaligned start address of the
+ preceding buffer */
+ byte* buf; /*!< buffer for parsing log records */
+ ulint len; /*!< amount of data in buf */
+ ib_uint64_t parse_start_lsn;
+ /*!< this is the lsn from which we were able to
+ start parsing log records and adding them to
+ the hash table; zero if a suitable
+ start point not found yet */
+ ib_uint64_t scanned_lsn;
+ /*!< the log data has been scanned up to this
+ lsn */
+ ulint scanned_checkpoint_no;
+ /*!< the log data has been scanned up to this
+ checkpoint number (lowest 4 bytes) */
+ ulint recovered_offset;
+ /*!< start offset of non-parsed log records in
+ buf */
+ ib_uint64_t recovered_lsn;
+ /*!< the log records have been parsed up to
+ this lsn */
+ ib_uint64_t limit_lsn;/*!< recovery should be made at most
+ up to this lsn */
+ ibool found_corrupt_log;
+ /*!< this is set to TRUE if we during log
+ scan find a corrupt log block, or a corrupt
+ log record, or there is a log parsing
+ buffer overflow */
+#ifdef UNIV_LOG_ARCHIVE
+ log_group_t* archive_group;
+ /*!< in archive recovery: the log group whose
+ archive is read */
+#endif /* UNIV_LOG_ARCHIVE */
+ mem_heap_t* heap; /*!< memory heap of log records and file
+ addresses*/
+ hash_table_t* addr_hash;/*!< hash table of file addresses of pages */
+ ulint n_addrs;/*!< number of not processed hashed file
+ addresses in the hash table */
+
+/* If you modify the following defines in their original files,
+ you should also modify them here. */
+/* defined in os0file.c */
+#define OS_AIO_MERGE_N_CONSECUTIVE 64
+/* defined in log0recv.c */
+#define RECV_READ_AHEAD_AREA 32
+ time_t stats_recv_start_time;
+ ulint stats_recv_turns;
+
+ ulint stats_read_requested_pages;
+ ulint stats_read_in_area[RECV_READ_AHEAD_AREA];
+
+ ulint stats_read_io_pages;
+ ulint stats_read_io_consecutive[OS_AIO_MERGE_N_CONSECUTIVE];
+ ulint stats_write_io_pages;
+ ulint stats_write_io_consecutive[OS_AIO_MERGE_N_CONSECUTIVE];
+
+ ulint stats_doublewrite_check_pages;
+ ulint stats_doublewrite_overwrite_pages;
+
+ ulint stats_recover_pages_with_read;
+ ulint stats_recover_pages_without_read;
+
+ ulint stats_log_recs;
+ ulint stats_log_len_sum;
+
+ ulint stats_applied_log_recs;
+ ulint stats_applied_log_len_sum;
+ ulint stats_pages_already_new;
+
+ ib_uint64_t stats_oldest_modified_lsn;
+ ib_uint64_t stats_newest_modified_lsn;
+};
+
+/** The recovery system */
+extern recv_sys_t* recv_sys;
+
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise. Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+extern ibool recv_recovery_on;
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+extern ibool recv_no_ibuf_operations;
+/** TRUE when recv_init_crash_recovery() has been called. */
+extern ibool recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys->mutex. */
+extern ibool recv_no_log_write;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+extern ibool recv_lsn_checks_on;
+#ifdef UNIV_HOTBACKUP
+/** TRUE when the redo log is being backed up */
+extern ibool recv_is_making_a_backup;
+#endif /* UNIV_HOTBACKUP */
+/** Maximum page number encountered in the redo log */
+extern ulint recv_max_parsed_page_no;
+
+/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many
+times! */
+#define RECV_PARSING_BUF_SIZE (2 * 1024 * 1024)
+
+/** Size of block reads when the log groups are scanned forward to do a
+roll-forward */
+#define RECV_SCAN_SIZE (4 * UNIV_PAGE_SIZE)
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database. */
+extern ulint recv_n_pool_free_frames;
+
+#ifndef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/log0recv.ic b/storage/xtradb/include/log0recv.ic
new file mode 100644
index 00000000000..0a8e55b96fa
--- /dev/null
+++ b/storage/xtradb/include/log0recv.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.ic
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void)
+/*=====================*/
+{
+ return(UNIV_UNLIKELY(recv_recovery_on));
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+extern ibool recv_recovery_from_backup_on;
+
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void)
+/*=================================*/
+{
+ return(recv_recovery_from_backup_on);
+}
+#endif /* UNIV_LOG_ARCHIVE */
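
recv_recovery_is_on() is the cheap guard other subsystems use to avoid touching pages while redo records are still being applied. The standalone sketch below (stand-in names only, not the server code path) shows that guard pattern; inside InnoDB the flag consulted is recv_recovery_on from log0recv.h.

    #include <stdio.h>

    typedef int ibool;

    static ibool recovery_on = 1;          /* stand-in for recv_recovery_on */

    static ibool
    recovery_is_on(void)                   /* stand-in for recv_recovery_is_on() */
    {
            return(recovery_on);
    }

    static void
    background_task(void)
    {
            if (recovery_is_on()) {
                    /* Redo records are still being applied: defer work
                    that would read pages not yet brought up to date. */
                    printf("recovery running, task deferred\n");
                    return;
            }

            printf("task executed\n");
    }

    int
    main(void)
    {
            background_task();             /* deferred */
            recovery_on = 0;               /* recovery has finished */
            background_task();             /* executed */
            return(0);
    }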
diff --git a/storage/xtradb/include/mach0data.h b/storage/xtradb/include/mach0data.h
new file mode 100644
index 00000000000..44ee3df22ce
--- /dev/null
+++ b/storage/xtradb/include/mach0data.h
@@ -0,0 +1,400 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/* The data and all fields are always stored in a database file
+in the same format: ascii, big-endian, ... .
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 256 */
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ const byte* b) /*!< in: pointer to byte */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lower address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored, >= 0, < 64k */
+/********************************************************//**
+The following function is used to fetch data from two consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer, >= 0, < 64k */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ const byte* b) /*!< in: pointer to two bytes */
+ __attribute__((nonnull, pure));
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+ __attribute__((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+ __attribute__((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ const byte* b) /*!< in: pointer to 3 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ const byte* b) /*!< in: pointer to four bytes */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n); /*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of an ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer to be stored */
+ __attribute__((const));
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+ __attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_ull(
+/*===========*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n); /*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ __attribute__((nonnull, pure));
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_ull(
+/*==========*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a dulint in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+ dulint n); /*!< in: dulint integer to be stored */
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a dulint in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n); /*!< in: dulint integer to be stored */
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+ dulint n) /*!< in: dulint integer to be stored */
+ __attribute__((const));
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ ulint* val); /*!< out: read value */
+/*********************************************************//**
+Reads a dulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_dulint_parse_compressed(
+/*=========================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ dulint* val); /*!< out: read value */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d); /*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d); /*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n); /*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+ __attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n); /*!< in: unsigned long int to write */
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ullint
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type); /*!< in: signed or unsigned flag */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mach0data.ic b/storage/xtradb/include/mach0data.ic
new file mode 100644
index 00000000000..96d2417ac81
--- /dev/null
+++ b/storage/xtradb/include/mach0data.ic
@@ -0,0 +1,783 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.ic
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "ut0mem.h"
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+ byte* b, /*!< in: pointer to byte where to store */
+ ulint n) /*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+ ut_ad(b);
+ ut_ad((n | 0xFFUL) <= 0xFFUL);
+
+ b[0] = (byte)n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+ const byte* b) /*!< in: pointer to byte */
+{
+ ut_ad(b);
+ return((ulint)(b[0]));
+}
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+ byte* b, /*!< in: pointer to two bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad((n | 0xFFFFUL) <= 0xFFFFUL);
+
+ b[0] = (byte)(n >> 8);
+ b[1] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+ const byte* b) /*!< in: pointer to 2 bytes */
+{
+ return(((ulint)(b[0]) << 8) | (ulint)(b[1]));
+}
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return 16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+ ulint n) /*!< in: integer in machine-dependent format */
+{
+ uint16 ret;
+ ut_ad(2 == sizeof ret);
+ mach_write_to_2((byte*) &ret, n);
+ return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+ uint16 n) /*!< in: 16-bit integer in canonical format */
+{
+ ut_ad(2 == sizeof n);
+ return(mach_read_from_2((const byte*) &n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+ byte* b, /*!< in: pointer to 3 bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+ ut_ad((n | 0xFFFFFFUL) <= 0xFFFFFFUL);
+
+ b[0] = (byte)(n >> 16);
+ b[1] = (byte)(n >> 8);
+ b[2] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+ const byte* b) /*!< in: pointer to 3 bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 16)
+ | ((ulint)(b[1]) << 8)
+ | (ulint)(b[2])
+ );
+}
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+ byte* b, /*!< in: pointer to four bytes where to store */
+ ulint n) /*!< in: ulint integer to be stored */
+{
+ ut_ad(b);
+
+ b[0] = (byte)(n >> 24);
+ b[1] = (byte)(n >> 16);
+ b[2] = (byte)(n >> 8);
+ b[3] = (byte)n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+ const byte* b) /*!< in: pointer to four bytes */
+{
+ ut_ad(b);
+ return( ((ulint)(b[0]) << 24)
+ | ((ulint)(b[1]) << 16)
+ | ((ulint)(b[2]) << 8)
+ | (ulint)(b[3])
+ );
+}
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ ut_ad(b);
+
+ if (n < 0x80UL) {
+ mach_write_to_1(b, n);
+ return(1);
+ } else if (n < 0x4000UL) {
+ mach_write_to_2(b, n | 0x8000UL);
+ return(2);
+ } else if (n < 0x200000UL) {
+ mach_write_to_3(b, n | 0xC00000UL);
+ return(3);
+ } else if (n < 0x10000000UL) {
+ mach_write_to_4(b, n | 0xE0000000UL);
+ return(4);
+ } else {
+ mach_write_to_1(b, 0xF0UL);
+ mach_write_to_4(b + 1, n);
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+ ulint n) /*!< in: ulint integer (< 2^32) to be stored */
+{
+ if (n < 0x80UL) {
+ return(1);
+ } else if (n < 0x4000UL) {
+ return(2);
+ } else if (n < 0x200000UL) {
+ return(3);
+ } else if (n < 0x10000000UL) {
+ return(4);
+ } else {
+ return(5);
+ }
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint flag;
+
+ ut_ad(b);
+
+ flag = mach_read_from_1(b);
+
+ if (flag < 0x80UL) {
+ return(flag);
+ } else if (flag < 0xC0UL) {
+ return(mach_read_from_2(b) & 0x7FFFUL);
+ } else if (flag < 0xE0UL) {
+ return(mach_read_from_3(b) & 0x3FFFFFUL);
+ } else if (flag < 0xF0UL) {
+ return(mach_read_from_4(b) & 0x1FFFFFFFUL);
+ } else {
+ ut_ad(flag == 0xF0UL);
+ return(mach_read_from_4(b + 1));
+ }
+}
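
Taken together, mach_write_compressed() and mach_read_compressed() form a variable-length encoding in which the high bits of the first byte encode the total length (1 to 5 bytes). The following standalone sketch mirrors the same scheme with plain C99 types, purely to illustrate the byte counts and the round trip; it is not the server code path.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of mach_write_compressed(): 1..5 bytes, flag bits in byte 0. */
    static size_t
    write_compressed(uint8_t* b, uint32_t n)
    {
            if (n < 0x80) {                 /* 0xxxxxxx */
                    b[0] = (uint8_t) n;
                    return(1);
            } else if (n < 0x4000) {        /* 10xxxxxx xxxxxxxx */
                    b[0] = (uint8_t) ((n >> 8) | 0x80);
                    b[1] = (uint8_t) n;
                    return(2);
            } else if (n < 0x200000) {      /* 110xxxxx + 2 bytes */
                    b[0] = (uint8_t) ((n >> 16) | 0xC0);
                    b[1] = (uint8_t) (n >> 8);
                    b[2] = (uint8_t) n;
                    return(3);
            } else if (n < 0x10000000) {    /* 1110xxxx + 3 bytes */
                    b[0] = (uint8_t) ((n >> 24) | 0xE0);
                    b[1] = (uint8_t) (n >> 16);
                    b[2] = (uint8_t) (n >> 8);
                    b[3] = (uint8_t) n;
                    return(4);
            } else {                        /* 0xF0 marker + full 4 bytes */
                    b[0] = 0xF0;
                    b[1] = (uint8_t) (n >> 24);
                    b[2] = (uint8_t) (n >> 16);
                    b[3] = (uint8_t) (n >> 8);
                    b[4] = (uint8_t) n;
                    return(5);
            }
    }

    /* Mirror of mach_read_compressed(): strips the flag bits again. */
    static uint32_t
    read_compressed(const uint8_t* b, size_t* size)
    {
            if (b[0] < 0x80) {
                    *size = 1;
                    return(b[0]);
            } else if (b[0] < 0xC0) {
                    *size = 2;
                    return(((uint32_t) (b[0] & 0x3F) << 8) | b[1]);
            } else if (b[0] < 0xE0) {
                    *size = 3;
                    return(((uint32_t) (b[0] & 0x1F) << 16)
                           | ((uint32_t) b[1] << 8) | b[2]);
            } else if (b[0] < 0xF0) {
                    *size = 4;
                    return(((uint32_t) (b[0] & 0x0F) << 24)
                           | ((uint32_t) b[1] << 16)
                           | ((uint32_t) b[2] << 8) | b[3]);
            } else {
                    *size = 5;
                    return(((uint32_t) b[1] << 24) | ((uint32_t) b[2] << 16)
                           | ((uint32_t) b[3] << 8) | b[4]);
            }
    }

    int
    main(void)
    {
            uint32_t        samples[] = {5, 300, 70000, 20000000, 3000000000U};
            uint8_t         buf[5];
            size_t          i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    size_t          wlen = write_compressed(buf, samples[i]);
                    size_t          rlen;
                    uint32_t        back = read_compressed(buf, &rlen);

                    assert(wlen == rlen && back == samples[i]);
                    printf("%10u -> %u byte(s)\n", samples[i], (unsigned) wlen);
            }
            return(0);
    }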
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_4(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 4, ut_dulint_get_low(n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_ull(
+/*===========*/
+ byte* b, /*!< in: pointer to 8 bytes where to store */
+ ib_uint64_t n) /*!< in: 64-bit integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_4(b, (ulint) (n >> 32));
+ mach_write_to_4(b + 4, (ulint) n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_4(b);
+ low = mach_read_from_4(b + 4);
+
+ return(ut_dulint_create(high, low));
+}
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return 64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_ull(
+/*==========*/
+ const byte* b) /*!< in: pointer to 8 bytes */
+{
+ ib_uint64_t ull;
+
+ ull = ((ib_uint64_t) mach_read_from_4(b)) << 32;
+ ull |= (ib_uint64_t) mach_read_from_4(b + 4);
+
+ return(ull);
+}
+
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+ byte* b, /*!< in: pointer to 7 bytes where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_3(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 3, ut_dulint_get_low(n));
+}
+
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+ const byte* b) /*!< in: pointer to 7 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_3(b);
+ low = mach_read_from_4(b + 3);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+ byte* b, /*!< in: pointer to 6 bytes where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ut_ad(b);
+
+ mach_write_to_2(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + 2, ut_dulint_get_low(n));
+}
+
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+ const byte* b) /*!< in: pointer to 6 bytes */
+{
+ ulint high;
+ ulint low;
+
+ ut_ad(b);
+
+ high = mach_read_from_2(b);
+ low = mach_read_from_4(b + 2);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*********************************************************//**
+Writes a dulint in a compressed form (5..9 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ size = mach_write_compressed(b, ut_dulint_get_high(n));
+ mach_write_to_4(b + size, ut_dulint_get_low(n));
+
+ return(size + 4);
+}
+
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ return(4 + mach_get_compressed_size(ut_dulint_get_high(n)));
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(b);
+
+ high = mach_read_compressed(b);
+
+ size = mach_get_compressed_size(high);
+
+ low = mach_read_from_4(b + size);
+
+ return(ut_dulint_create(high, low));
+}
+
+/*********************************************************//**
+Writes a dulint in a compressed form (1..11 bytes).
+@return size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+ byte* b, /*!< in: pointer to memory where to store */
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ ulint size;
+
+ ut_ad(b);
+
+ if (ut_dulint_get_high(n) == 0) {
+ return(mach_write_compressed(b, ut_dulint_get_low(n)));
+ }
+
+ *b = (byte)0xFF;
+ size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n));
+
+ size += mach_write_compressed(b + size, ut_dulint_get_low(n));
+
+ return(size);
+}
+
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+ dulint n) /*!< in: dulint integer to be stored */
+{
+ if (0 == ut_dulint_get_high(n)) {
+ return(mach_get_compressed_size(ut_dulint_get_low(n)));
+ }
+
+ return(1 + mach_get_compressed_size(ut_dulint_get_high(n))
+ + mach_get_compressed_size(ut_dulint_get_low(n)));
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(b);
+
+ if (*b != (byte)0xFF) {
+ high = 0;
+ size = 0;
+ } else {
+ high = mach_read_compressed(b + 1);
+
+ size = 1 + mach_get_compressed_size(high);
+ }
+
+ low = mach_read_compressed(b + size);
+
+ return(ut_dulint_create(high, low));
+}
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ double d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(double) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ double d) /*!< in: double */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(double); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(double) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+ const byte* b) /*!< in: pointer to memory from where to read */
+{
+ float d;
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ ptr[sizeof(float) - i - 1] = b[i];
+#else
+ ptr[i] = b[i];
+#endif
+ }
+
+ return(d);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+ byte* b, /*!< in: pointer to memory where to write */
+ float d) /*!< in: float */
+{
+ ulint i;
+ byte* ptr;
+
+ ptr = (byte*)&d;
+
+ for (i = 0; i < sizeof(float); i++) {
+#ifdef WORDS_BIGENDIAN
+ b[i] = ptr[sizeof(float) - i - 1];
+#else
+ b[i] = ptr[i];
+#endif
+ }
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+ const byte* buf, /*!< in: from where to read */
+ ulint buf_size) /*!< in: from how many bytes to read */
+{
+ ulint n = 0;
+ const byte* ptr;
+
+ ut_ad(buf_size <= sizeof(ulint));
+ ut_ad(buf_size > 0);
+
+ ptr = buf + buf_size;
+
+ for (;;) {
+ ptr--;
+
+ n = n << 8;
+
+ n += (ulint)(*ptr);
+
+ if (ptr == buf) {
+ break;
+ }
+ }
+
+ return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint dest_size, /*!< in: into how many bytes to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ byte* end;
+
+ ut_ad(dest_size <= sizeof(ulint));
+ ut_ad(dest_size > 0);
+
+ end = dest + dest_size;
+
+ for (;;) {
+ *dest = (byte)(n & 0xFF);
+
+ n = n >> 8;
+
+ dest++;
+
+ if (dest == end) {
+ break;
+ }
+ }
+
+ ut_ad(n == 0);
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+ const byte* buf) /*!< in: from where to read */
+{
+ return((ulint)(buf[0]) | ((ulint)(buf[1]) << 8));
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+ byte* dest, /*!< in: where to write */
+ ulint n) /*!< in: unsigned long int to write */
+{
+ ut_ad(n < 256 * 256);
+
+ *dest = (byte)(n & 0xFFUL);
+
+ n = n >> 8;
+ dest++;
+
+ *dest = (byte)(n & 0xFFUL);
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return integer value */
+UNIV_INLINE
+ullint
+mach_read_int_type(
+/*===============*/
+ const byte* src, /*!< in: where to read from */
+ ulint len, /*!< in: length of src */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ /* XXX this can be optimized on big-endian machines */
+
+ ullint ret;
+ uint i;
+
+ if (unsigned_type || (src[0] & 0x80)) {
+
+ ret = 0x0000000000000000ULL;
+ } else {
+
+ ret = 0xFFFFFFFFFFFFFF00ULL;
+ }
+
+ if (unsigned_type) {
+
+ ret |= src[0];
+ } else {
+
+ ret |= src[0] ^ 0x80;
+ }
+
+ for (i = 1; i < len; i++) {
+ ret <<= 8;
+ ret |= src[i];
+ }
+
+ return(ret);
+}
+#endif /* !UNIV_HOTBACKUP */
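
mach_read_int_type() decodes the on-disk integer format: big-endian bytes, with the sign bit of the first byte inverted for signed types (visible in the `src[0] ^ 0x80` step above) so that plain byte comparison orders values correctly. The standalone sketch below mirrors that decoding with plain C99 types and a hand-built 4-byte value; it is an illustration, not the server code path.

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    read_int_type(const uint8_t* src, unsigned len, int unsigned_type)
    {
            uint64_t        ret;
            unsigned        i;

            /* Pre-fill with 1-bits when the stored value is negative,
            so that the result is sign-extended to 64 bits. */
            ret = (unsigned_type || (src[0] & 0x80))
                    ? 0 : 0xFFFFFFFFFFFFFF00ULL;

            /* The first byte carries an inverted sign bit for signed types. */
            ret |= unsigned_type ? src[0] : (uint8_t) (src[0] ^ 0x80);

            for (i = 1; i < len; i++) {
                    ret = (ret << 8) | src[i];
            }

            return(ret);
    }

    int
    main(void)
    {
            /* -5 as a 4-byte signed column: two's complement 0xFFFFFFFB
            with the sign bit flipped on disk -> 7F FF FF FB. */
            const uint8_t   stored[4] = { 0x7F, 0xFF, 0xFF, 0xFB };

            assert((int64_t) read_int_type(stored, 4, 0) == -5);
            return(0);
    }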
diff --git a/storage/xtradb/include/mem0dbg.h b/storage/xtradb/include/mem0dbg.h
new file mode 100644
index 00000000000..d81e1418b2b
--- /dev/null
+++ b/storage/xtradb/include/mem0dbg.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0dbg.h
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+/* In the debug version each allocated field is surrounded with
+check fields whose sizes are given below */
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which protects in the debug version the hash table
+containing the list of live memory heaps, and also the global
+variables in mem0dbg.c. */
+extern mutex_t mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
+#define MEM_FIELD_HEADER_SIZE ut_calc_align(2 * sizeof(ulint),\
+ UNIV_MEM_ALIGNMENT)
+#define MEM_FIELD_TRAILER_SIZE sizeof(ulint)
+#else
+#define MEM_FIELD_HEADER_SIZE 0
+#endif
+
+
+/* Space needed when allocating a field of length N for a user.
+The space is allocated only in multiples of UNIV_MEM_ALIGNMENT.
+In the debug version there are also check fields at both ends
+of the field. */
+#ifdef UNIV_MEM_DEBUG
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\
+ + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT)
+#else
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT)
+#endif
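
As a concrete illustration, and assuming UNIV_MEM_ALIGNMENT is 8 (its usual value in univ.i), a non-debug build rounds a 13-byte request up to 16 bytes; a debug build first adds the header and trailer check fields and then rounds. A minimal standalone check of the rounding rule:

    #include <assert.h>

    /* Round n up to the next multiple of align (what ut_calc_align() does
    for the power-of-two alignments used here). */
    #define ALIGN_UP(n, align)      ((((n) + (align) - 1) / (align)) * (align))

    int
    main(void)
    {
            assert(ALIGN_UP(13, 8) == 16);  /* non-debug MEM_SPACE_NEEDED(13) */
            assert(ALIGN_UP(16, 8) == 16);  /* already aligned: unchanged */
            return(0);
    }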
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ byte* top, /*!< in: calculate and validate only until
+ this top pointer in the heap is reached,
+ if this pointer is NULL, ignored */
+ ibool print, /*!< in: if TRUE, prints the contents
+ of the heap; works only in
+ the debug version */
+ ibool* error, /*!< out: TRUE if error */
+ ulint* us_size,/*!< out: allocated memory
+ (for the user) in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored; in the
+ non-debug version this is always -1 */
+ ulint* ph_size,/*!< out: physical size of the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+ ulint* n_blocks); /*!< out: number of blocks in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+/**************************************************************//**
+Validates the contents of a memory heap.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it)
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void);
+/*===============*/
+/*****************************************************************//**
+Validates the dynamic memory
+@return TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void);
+/*=========================*/
+/************************************************************//**
+Validates the dynamic memory
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void);
+/*===============*/
+#endif /* UNIV_MEM_DEBUG */
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+ void* ptr); /*!< in: pointer to place of possible corruption */
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void);
+/*================*/
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void);
+/*====================*/
diff --git a/storage/xtradb/include/mem0dbg.ic b/storage/xtradb/include/mem0dbg.ic
new file mode 100644
index 00000000000..b0c8178a623
--- /dev/null
+++ b/storage/xtradb/include/mem0dbg.ic
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0dbg.ic
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+extern ulint mem_current_allocated_memory;
+
+/******************************************************************//**
+Initializes an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+ byte* buf, /*!< in: memory field */
+ ulint n); /*!< in: how many bytes the user requested */
+/******************************************************************//**
+Erases an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+ byte* buf, /*!< in: memory field */
+ ulint n); /*!< in: how many bytes the user requested */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n); /*!< in: length of buffer */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n); /*!< in: length of buffer */
+/***************************************************************//**
+Inserts a created memory heap to the hash table of
+current allocated memory heaps.
+Initializes the hash table when first called. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+ mem_heap_t* heap, /*!< in: the created heap */
+ const char* file_name, /*!< in: file name of creation */
+ ulint line); /*!< in: line where created */
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps. Returns the size of the heap
+in terms of how much memory in bytes was allocated for the user of
+the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+ mem_heap_t* heap, /*!< in: the heap to be freed */
+ const char* file_name, /*!< in: file name of freeing */
+ ulint line); /*!< in: line where freed */
+
+
+void
+mem_field_header_set_len(byte* field, ulint len);
+
+ulint
+mem_field_header_get_len(byte* field);
+
+void
+mem_field_header_set_check(byte* field, ulint check);
+
+ulint
+mem_field_header_get_check(byte* field);
+
+void
+mem_field_trailer_set_check(byte* field, ulint check);
+
+ulint
+mem_field_trailer_get_check(byte* field);
+#endif /* UNIV_MEM_DEBUG */
diff --git a/storage/xtradb/include/mem0mem.h b/storage/xtradb/include/mem0mem.h
new file mode 100644
index 00000000000..ee28cf7b225
--- /dev/null
+++ b/storage/xtradb/include/mem0mem.h
@@ -0,0 +1,402 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "univ.i"
+#include "ut0mem.h"
+#include "ut0byte.h"
+#include "ut0rnd.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "ut0lst.h"
+#include "mach0data.h"
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/* The info structure stored at the beginning of a heap block */
+typedef struct mem_block_info_struct mem_block_info_t;
+
+/* A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef mem_block_info_t mem_block_t;
+
+/* A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t mem_heap_t;
+
+/* Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C compiler, BUFFER means allocation from the
+buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC 0 /* the most common type */
+#define MEM_HEAP_BUFFER 1
+#define MEM_HEAP_BTR_SEARCH 2 /* this flag can optionally be
+ ORed to MEM_HEAP_BUFFER, in which
+ case heap->free_block is used in
+ some cases for memory allocations,
+ and if it's NULL, the memory
+ allocation functions can return
+ NULL. */
+
+/* The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE 64
+#define MEM_BLOCK_STANDARD_SIZE \
+ (UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/* If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF (UNIV_PAGE_SIZE - 200)
+
+/******************************************************************//**
+Initializes the memory system. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+ ulint size); /*!< in: common pool size in bytes */
+/******************************************************************//**
+Closes the memory system. */
+UNIV_INTERN
+void
+mem_close(void);
+/*===========*/
+
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create(N) mem_heap_create_func(\
+ (N), MEM_HEAP_DYNAMIC, __FILE__, __LINE__)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create_in_buffer(N) mem_heap_create_func(\
+ (N), MEM_HEAP_BUFFER, __FILE__, __LINE__)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create_in_btr_search(N) mem_heap_create_func(\
+ (N), MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER,\
+ __FILE__, __LINE__)
+
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap freeing. */
+
+#define mem_heap_free(heap) mem_heap_free_func(\
+ (heap), __FILE__, __LINE__)
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, takes also the file name and line as
+arguments.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ ulint n, /*!< in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block */
+ ulint type, /*!< in: heap type */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap, /*!< in, own: heap to be freed */
+ const char* file_name, /*!< in: file name where freed */
+ ulint line); /*!< in: line where freed */
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
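
The macros above are the public face of the heap allocator: everything taken from a heap is released in one shot when the heap is freed, never buffer by buffer. A typical, hypothetical caller inside InnoDB code (using only the create/alloc/zalloc/free interfaces declared above) would look roughly like this:

    void
    example_heap_usage(void)
    {
            mem_heap_t*     heap;
            void*           buf;
            void*           zeroed;

            heap = mem_heap_create(256);            /* first block >= 256 bytes */

            buf = mem_heap_alloc(heap, 100);        /* plain allocation */
            zeroed = mem_heap_zalloc(heap, 64);     /* zero-filled allocation */

            /* ... use buf and zeroed; no individual free calls ... */
            (void) buf;
            (void) zeroed;

            mem_heap_free(heap);            /* releases every block at once */
    }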
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ mem_heap_t* heap); /*!< in: memory heap */
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /*!< in: heap from which to free */
+ byte* old_top);/*!< in: pointer to old top of heap */
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap); /*!< in: heap to empty */
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap); /*!< in: heap */
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer allocation */
+
+#define mem_zalloc(N) memset(mem_alloc(N), 0, (N));
+
+#define mem_alloc(N) mem_alloc_func((N), NULL, __FILE__, __LINE__)
+#define mem_alloc2(N,S) mem_alloc_func((N), (S), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+@return own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ ulint n, /*!< in: requested size in bytes */
+ ulint* size, /*!< out: allocated size in bytes,
+ or NULL */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer freeing */
+
+#define mem_free(PTR) mem_free_func((PTR), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Frees a single buffer of storage from
+the dynamic memory of C compiler. Similar to free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr, /*!< in, own: buffer to be freed */
+ const char* file_name, /*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string, allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str); /*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str, /*!< in: string to be copied */
+ ulint len); /*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2); /*!< in: string 2 */
+
+/**********************************************************************//**
+Duplicate a block of data, allocated from a memory heap.
+@return own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+ mem_heap_t* heap, /*!< in: memory heap where copy is allocated */
+ const void* data, /*!< in: data to be copied */
+ ulint len); /*!< in: length of data, in bytes */
+
+/****************************************************************//**
+A simple (s)printf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...) __attribute__ ((format (printf, 2, 3)));
+
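+/* Usage sketch (illustrative only; the heap size, table name and id value
+are made up, and the mem_heap_create()/mem_heap_free() macros declared
+earlier in this file are assumed). All of the helpers above allocate from
+the given heap, so a single mem_heap_free() call releases every copy at
+once:
+
+	mem_heap_t*	heap = mem_heap_create(256);
+	char*		name = mem_heap_strdup(heap, "test/t1");
+	char*		msg = mem_heap_printf(heap, "table %s, id %lu",
+					      name, (ulong) 42);
+
+	(use name and msg here)
+
+	mem_heap_free(heap);
+*/
+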
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Goes through the list of all allocated mem blocks, checks their magic
+numbers, and reports possible corruption. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void);
+/*=========================*/
+#endif
+
+/*#######################################################################*/
+
+/* The info header of a block in a memory heap */
+
+struct mem_block_info_struct {
+ ulint magic_n;/* magic number for debugging */
+ char file_name[8];/* file name where the mem heap was created */
+ ulint line; /*!< line number where the mem heap was created */
+ UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in the
+ list this is the base node of the list of blocks;
+ in subsequent blocks this is undefined */
+ UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+ and prev in the list. The first block allocated
+ to the heap is also the first block in this list,
+ though it also contains the base node of the list. */
+ ulint len; /*!< physical length of this block in bytes */
+ ulint total_size; /* physical length in bytes of all blocks
+ in the heap. This is defined only in the base
+ node and is set to ULINT_UNDEFINED in others. */
+ ulint type; /*!< type of heap: MEM_HEAP_DYNAMIC, or
+ MEM_HEAP_BUFFER possibly ORed with MEM_HEAP_BTR_SEARCH */
+ ulint free; /*!< offset in bytes of the first free position for
+ user data in the block */
+ ulint start; /*!< the value of the struct field 'free' at the
+ creation of the block */
+#ifndef UNIV_HOTBACKUP
+ void* free_block;
+ /* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+ and this is the heap root, this can contain an
+ allocated buffer frame, which can be appended as a
+ free block to the heap, if we need more space;
+ otherwise, this is NULL */
+ void* buf_block;
+ /* if this block has been allocated from the buffer
+ pool, this contains the buf_block_t handle;
+ otherwise, this is NULL */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef MEM_PERIODIC_CHECK
+ UT_LIST_NODE_T(mem_block_t) mem_block_list;
+ /* List of all mem blocks allocated; protected
+ by the mem_comm_pool mutex */
+#endif
+};
+
+#define MEM_BLOCK_MAGIC_N 764741555
+#define MEM_FREED_BLOCK_MAGIC_N 547711122
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE ut_calc_align(sizeof(mem_block_info_t),\
+ UNIV_MEM_ALIGNMENT)
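+
+/* Layout sketch (illustrative only): every heap block starts with this
+header, followed by the user data, so a buffer returned by mem_alloc() can
+be mapped back to its single-block heap with plain pointer arithmetic, as
+mem_free_func() does:
+
+	heap = (mem_heap_t*)((byte*) ptr - MEM_BLOCK_HEADER_SIZE
+			     - MEM_FIELD_HEADER_SIZE);
+*/
+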
+#include "mem0dbg.h"
+
+#ifndef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mem0mem.ic b/storage/xtradb/include/mem0mem.ic
new file mode 100644
index 00000000000..cbce2edc661
--- /dev/null
+++ b/storage/xtradb/include/mem0mem.ic
@@ -0,0 +1,640 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+The memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0dbg.ic"
+#ifndef UNIV_HOTBACKUP
+# include "mem0pool.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block(
+/*==================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+ ulint type, /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+ const char* file_name,/*!< in: file name where created */
+ ulint line); /*!< in: line where created */
+/******************************************************************//**
+Frees a block from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block); /*!< in: block to free */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap); /*!< in: heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n); /*!< in: number of bytes user needs */
+
+UNIV_INLINE
+void
+mem_block_set_len(mem_block_t* block, ulint len)
+{
+ ut_ad(len > 0);
+
+ block->len = len;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_len(mem_block_t* block)
+{
+ return(block->len);
+}
+
+UNIV_INLINE
+void
+mem_block_set_type(mem_block_t* block, ulint type)
+{
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ block->type = type;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_type(mem_block_t* block)
+{
+ return(block->type);
+}
+
+UNIV_INLINE
+void
+mem_block_set_free(mem_block_t* block, ulint free)
+{
+ ut_ad(free > 0);
+ ut_ad(free <= mem_block_get_len(block));
+
+ block->free = free;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_free(mem_block_t* block)
+{
+ return(block->free);
+}
+
+UNIV_INLINE
+void
+mem_block_set_start(mem_block_t* block, ulint start)
+{
+ ut_ad(start > 0);
+
+ block->start = start;
+}
+
+UNIV_INLINE
+ulint
+mem_block_get_start(mem_block_t* block)
+{
+ return(block->start);
+}
+
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ ut_ad(heap);
+ ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+ return(memset(mem_heap_alloc(heap, n), 0, n));
+}
+
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes; if the heap is allowed
+ to grow into the buffer pool, this must be
+ <= MEM_MAX_ALLOC_IN_BUF */
+{
+ mem_block_t* block;
+ void* buf;
+ ulint free;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+ /* Check if there is enough space in block. If not, create a new
+ block to the heap */
+
+ if (mem_block_get_len(block)
+ < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+ block = mem_heap_add_block(heap, n);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+ }
+
+ free = mem_block_get_free(block);
+
+ buf = (byte*)block + free;
+
+ mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+#ifdef UNIV_MEM_DEBUG
+ UNIV_MEM_ALLOC(buf,
+ n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE);
+
+ /* In the debug version write debugging info to the field */
+ mem_field_init((byte*)buf, n);
+
+ /* Advance buf to point at the storage which will be given to the
+ caller */
+ buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+#endif
+#ifdef UNIV_SET_MEM_TO_ZERO
+ UNIV_MEM_ALLOC(buf, n);
+ memset(buf, '\0', n);
+#endif
+ UNIV_MEM_ALLOC(buf, n);
+ return(buf);
+}
+
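+/* Usage sketch (illustrative only; btr_search_heap and rec_size are made-up
+names). For an ordinary MEM_HEAP_DYNAMIC heap the return value is never
+NULL; only MEM_HEAP_BTR_SEARCH type heaps can fail, so those callers must
+check the result:
+
+	void*	rec_buf = mem_heap_alloc(btr_search_heap, rec_size);
+
+	if (rec_buf == NULL) {
+		(fall back to another allocation strategy)
+	}
+*/
+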
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ mem_block_t* block;
+ byte* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*)block + mem_block_get_free(block);
+
+ return(buf);
+}
+
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+ mem_heap_t* heap, /*!< in: heap from which to free */
+ byte* old_top)/*!< in: pointer to old top of heap */
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+#ifdef UNIV_MEM_DEBUG
+ ibool error;
+ ulint total_size;
+ ulint size;
+#endif
+
+ ut_ad(mem_heap_check(heap));
+
+#ifdef UNIV_MEM_DEBUG
+
+ /* Validate the heap and get its total allocated size */
+ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size,
+ NULL, NULL);
+ ut_a(!error);
+
+ /* Get the size below top pointer */
+ mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL,
+ NULL);
+ ut_a(!error);
+
+#endif
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ while (block != NULL) {
+ if (((byte*)block + mem_block_get_free(block) >= old_top)
+ && ((byte*)block <= old_top)) {
+ /* Found the right block */
+
+ break;
+ }
+
+ /* Store prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+
+ ut_ad(block);
+
+ /* Set the free field of block */
+ mem_block_set_free(block, old_top - (byte*)block);
+
+#ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version erase block from top up */
+ mem_erase_buf(old_top, (byte*)block + block->len - old_top);
+
+ /* Update allocated memory count */
+ mutex_enter(&mem_hash_mutex);
+ mem_current_allocated_memory -= (total_size - size);
+ mutex_exit(&mem_hash_mutex);
+#else /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ASSERT_W(old_top, (byte*)block + block->len - old_top);
+#endif /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ALLOC(old_top, (byte*)block + block->len - old_top);
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ }
+}
+
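+/* Usage sketch (illustrative only; the sizes are made up). A caller can
+remember the heap top, make temporary allocations, and later release all
+of them in one call; the first block of the heap is kept:
+
+	byte*	old_top = mem_heap_get_heap_top(heap);
+	void*	tmp1 = mem_heap_alloc(heap, 64);
+	void*	tmp2 = mem_heap_alloc(heap, 128);
+
+	(use tmp1 and tmp2 here)
+
+	mem_heap_free_heap_top(heap, old_top);
+*/
+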
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+ mem_heap_t* heap) /*!< in: heap to empty */
+{
+ mem_heap_free_heap_top(heap, (byte*)heap + mem_block_get_start(heap));
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap. The size of the
+element must be given.
+@return pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+ void* buf;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ buf = (byte*)block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+#ifdef UNIV_MEM_DEBUG
+ ut_ad(mem_block_get_start(block) <=(ulint)((byte*)buf - (byte*)block));
+
+ /* In the debug version, advance buf to point at the storage which
+ was given to the caller in the allocation*/
+
+ buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+ /* Check that the field lengths agree */
+ ut_ad(n == (ulint)mem_field_header_get_len(buf));
+#endif
+
+ return(buf);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: size of the topmost element */
+{
+ mem_block_t* block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* Subtract the free field of block */
+ mem_block_set_free(block, mem_block_get_free(block)
+ - MEM_SPACE_NEEDED(n));
+ UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n);
+#ifdef UNIV_MEM_DEBUG
+
+ ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+ /* In the debug version check the consistency, and erase field */
+ mem_field_erase((byte*)block + mem_block_get_free(block), n);
+#endif
+
+ /* If free == start, we may free the block if it is not the first
+ one */
+
+ if ((heap != block) && (mem_block_get_free(block)
+ == mem_block_get_start(block))) {
+ mem_heap_block_free(heap, block);
+ } else {
+ /* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a
+ subsequent invocation of mem_heap_free_top().
+ Originally, this was UNIV_MEM_FREE(), to catch writes
+ to freed memory. */
+ UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n);
+ }
+}
+
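+/* Usage sketch (illustrative only; my_node_t is a made-up type). The two
+functions above treat the heap as a stack: the size passed in must equal
+the size that was passed to mem_heap_alloc() for the topmost element:
+
+	my_node_t*	node;
+
+	node = (my_node_t*) mem_heap_alloc(heap, sizeof(my_node_t));
+
+	(later, while the node is still the topmost element)
+
+	node = (my_node_t*) mem_heap_get_top(heap, sizeof(my_node_t));
+	mem_heap_free_top(heap, sizeof(my_node_t));
+*/
+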
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, takes also the file name and line as
+argument.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+ ulint n, /*!< in: desired start block size,
+ this means that a single user buffer
+ of size n will fit in the block,
+ 0 creates a default size block */
+ ulint type, /*!< in: heap type */
+ const char* file_name, /*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+ mem_block_t* block;
+
+ if (!n) {
+ n = MEM_BLOCK_START_SIZE;
+ }
+
+ block = mem_heap_create_block(NULL, n, type, file_name, line);
+
+ if (block == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_INIT(block->base);
+
+ /* Add the created block itself as the first block in the list */
+ UT_LIST_ADD_FIRST(list, block->base, block);
+
+#ifdef UNIV_MEM_DEBUG
+
+ mem_hash_insert(block, file_name, line);
+
+#endif
+
+ return(block);
+}
+
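+/* Call sketch (illustrative only). The function is normally reached through
+a wrapper macro that supplies the caller's __FILE__ and __LINE__, roughly:
+
+	heap = mem_heap_create_func(0, MEM_HEAP_DYNAMIC, __FILE__, __LINE__);
+
+Passing 0 for n asks for a default-sized (MEM_BLOCK_START_SIZE) first
+block, as handled at the top of the function. */
+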
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+ mem_heap_t* heap, /*!< in, own: heap to be freed */
+ const char* file_name __attribute__((unused)),
+ /*!< in: file name where freed */
+ ulint line __attribute__((unused)))
+{
+ mem_block_t* block;
+ mem_block_t* prev_block;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+#ifdef UNIV_MEM_DEBUG
+
+ /* In the debug version remove the heap from the hash table of heaps
+ and check its consistency */
+
+ mem_hash_remove(heap, file_name, line);
+
+#endif
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ mem_heap_free_block_free(heap);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ while (block != NULL) {
+ /* Store the prev_block value before freeing the current block
+ (the current block will be erased in freeing) */
+
+ prev_block = UT_LIST_GET_PREV(list, block);
+
+ mem_heap_block_free(heap, block);
+
+ block = prev_block;
+ }
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+@return own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+ ulint n, /*!< in: desired number of bytes */
+ ulint* size, /*!< out: allocated size in bytes,
+ or NULL */
+ const char* file_name, /*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+ mem_heap_t* heap;
+ void* buf;
+
+ heap = mem_heap_create_func(n, MEM_HEAP_DYNAMIC, file_name, line);
+
+ /* Note that as we created the first block in the heap big enough
+ for the buffer requested by the caller, the buffer will be in the
+ first block and thus we can calculate the pointer to the heap from
+ the pointer to the buffer when we free the memory buffer. */
+
+ if (UNIV_LIKELY_NULL(size)) {
+ /* Adjust the allocation to the actual size of the
+ memory block. */
+ ulint m = mem_block_get_len(heap)
+ - mem_block_get_free(heap);
+#ifdef UNIV_MEM_DEBUG
+ m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE;
+#endif /* UNIV_MEM_DEBUG */
+ ut_ad(m >= n);
+ *size = n = m;
+ }
+
+ buf = mem_heap_alloc(heap, n);
+
+ ut_a((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ return(buf);
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees a single
+buffer of storage from the dynamic memory of the C compiler. Similar to the
+free of C. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+ void* ptr, /*!< in, own: buffer to be freed */
+ const char* file_name, /*!< in: file name where freed */
+ ulint line) /*!< in: line where freed */
+{
+ mem_heap_t* heap;
+
+ heap = (mem_heap_t*)((byte*)ptr - MEM_BLOCK_HEADER_SIZE
+ - MEM_FIELD_HEADER_SIZE);
+ mem_heap_free_func(heap, file_name, line);
+}
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ ulint size = 0;
+
+ ut_ad(mem_heap_check(heap));
+
+ size = heap->total_size;
+
+#ifndef UNIV_HOTBACKUP
+ if (heap->free_block) {
+ size += UNIV_PAGE_SIZE;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(size);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+ const char* str) /*!< in: string to be copied */
+{
+ ulint len = strlen(str) + 1;
+ return((char*) memcpy(mem_alloc(len), str, len));
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = (char*) mem_alloc(len + 1);
+ s[len] = 0;
+ return((char*) memcpy(s, str, len));
+}
+
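+/* Usage sketch (illustrative only; field_data and field_len are made-up
+names): useful for turning a length-delimited value into a C string:
+
+	char*	name = mem_strdupl((const char*) field_data, field_len);
+
+	(use name here)
+
+	mem_free(name);
+*/
+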
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str, /*!< in: string to be copied */
+ ulint len) /*!< in: length of str, in bytes */
+{
+ char* s = (char*) mem_heap_alloc(heap, len + 1);
+ s[len] = 0;
+ return((char*) memcpy(s, str, len));
+}
diff --git a/storage/xtradb/include/mem0pool.h b/storage/xtradb/include/mem0pool.h
new file mode 100644
index 00000000000..fa8be296ec9
--- /dev/null
+++ b/storage/xtradb/include/mem0pool.h
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0pool.h
+The lowest-level memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0pool_h
+#define mem0pool_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "ut0lst.h"
+
+/** Memory area header */
+typedef struct mem_area_struct mem_area_t;
+/** Memory pool */
+typedef struct mem_pool_struct mem_pool_t;
+
+/** The common memory pool */
+extern mem_pool_t* mem_comm_pool;
+
+/** Memory area header */
+
+struct mem_area_struct{
+ ulint size_and_free; /*!< memory area size is obtained by
+ ANDing with ~MEM_AREA_FREE; the area is
+ in a free list if ANDing with
+ MEM_AREA_FREE results in nonzero */
+ UT_LIST_NODE_T(mem_area_t)
+ free_list; /*!< free list node */
+};
+
+/** Each memory area takes this many extra bytes for control information */
+#define MEM_AREA_EXTRA_SIZE (ut_calc_align(sizeof(struct mem_area_struct),\
+ UNIV_MEM_ALIGNMENT))
+
+/********************************************************************//**
+Creates a memory pool.
+@return memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ ulint size); /*!< in: pool size in bytes */
+/********************************************************************//**
+Frees a memory pool. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+ mem_pool_t* pool); /*!< in, own: memory pool */
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+ ulint* psize, /*!< in: requested size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE;
+ out: allocated size in bytes (greater than
+ or equal to the requested size) */
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /*!< in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool); /*!< in: memory pool */
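+
+/* Usage sketch (illustrative only; the requested size is made up). The size
+is passed by reference so that the caller learns the actually allocated
+(rounded-up) area size, and the same pool must be used when freeing:
+
+	ulint	size = 1000;
+	void*	area = mem_area_alloc(&size, mem_comm_pool);
+
+	(size now holds the allocated number of bytes, >= 1000)
+
+	mem_area_free(area, mem_comm_pool);
+*/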
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Validates a memory pool.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+ mem_pool_t* pool); /*!< in: memory pool */
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/*!< in: output file to write to */
+ mem_pool_t* pool); /*!< in: memory pool */
+
+
+#ifndef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mem0pool.ic b/storage/xtradb/include/mem0pool.ic
new file mode 100644
index 00000000000..b891dd6dea0
--- /dev/null
+++ b/storage/xtradb/include/mem0pool.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0pool.ic
+The lowest-level memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
diff --git a/storage/xtradb/include/mtr0log.h b/storage/xtradb/include/mtr0log.h
new file mode 100644
index 00000000000..6322af2a569
--- /dev/null
+++ b/storage/xtradb/include/mtr0log.h
@@ -0,0 +1,250 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.h
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0log_h
+#define mtr0log_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes 1 - 4 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+ byte* ptr, /*!< in: pointer where to write */
+ ulint val, /*!< in: value to write */
+ byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes 8 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_dulint(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ dulint val, /*!< in: value to write */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ const byte* str, /*!< in: string to write */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+ byte* ptr, /*!< in: pointer written to */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes the initial part of a log record, consisting of the one-byte item
+type and the space id and page number in compressed form. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/*!< in: space id, if applicable */
+ ulint page_no,/*!< in: page number (not relevant currently) */
+ byte* log_ptr,/*!< in: pointer to mtr log which has been opened */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val, /*!< in: value to write */
+ ulint type); /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+/********************************************************//**
+Catenates n bytes to the mtr log. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* str, /*!< in: string to write */
+ ulint len); /*!< in: string length */
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val); /*!< in: value to write */
+/********************************************************//**
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ dulint val); /*!< in: value to write */
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint size); /*!< in: buffer size in bytes; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /*!< in: mtr */
+ byte* ptr); /*!< in: buffer space from ptr up was not used */
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/*!< in: pointer to mtr log which has
+ been opened */
+ mtr_t* mtr); /*!< in: mtr */
+#else /* !UNIV_HOTBACKUP */
+# define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0)
+# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte *) 0)
+#endif /* !UNIV_HOTBACKUP */
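+/* Usage sketch (illustrative only; page and FIELD_OFFSET are made-up names).
+A typical redo-logged write of a 4-byte field inside a mini-transaction:
+
+	mlog_write_ulint(page + FIELD_OFFSET, new_val, MLOG_4BYTES, mtr);
+
+The call both updates the buffered page and appends the corresponding
+MLOG_4BYTES record to the mtr log. */
+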
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* type, /*!< out: log record type: MLOG_1BYTE, ... */
+ ulint* space, /*!< out: space id */
+ ulint* page_no);/*!< out: page number */
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_dulint.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+ ulint type, /*!< in: log record type: MLOG_1BYTE, ... */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip);/*!< in/out: compressed page, or NULL */
+/********************************************************//**
+Parses a log record written by mlog_write_string.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip);/*!< in/out: compressed page, or NULL */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index. Reserves space
+for further log entries. The log entry must be closed with
+mlog_close().
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* rec, /*!< in: index record or page */
+ dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: log item type */
+ ulint size); /*!< in: requested buffer size in bytes
+ (if 0, calls mlog_close() and returns NULL) */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_open_and_write_index.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+ byte* ptr, /*!< in: buffer */
+ const byte* end_ptr,/*!< in: buffer end */
+ ibool comp, /*!< in: TRUE=compact record format */
+ dict_index_t** index); /*!< out, own: dummy index */
+
+#ifndef UNIV_HOTBACKUP
+/* Insert, update, and maybe other functions may use this value to define an
+extra mlog buffer size for variable size data */
+#define MLOG_BUF_MARGIN 256
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mtr0log.ic b/storage/xtradb/include/mtr0log.ic
new file mode 100644
index 00000000000..63af02ba409
--- /dev/null
+++ b/storage/xtradb/include/mtr0log.ic
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.ic
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ut0lst.h"
+#include "buf0buf.h"
+#include "fsp0types.h"
+#include "srv0srv.h"
+#include "trx0sys.h"
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint size) /*!< in: buffer size in bytes; MUST be
+ smaller than DYN_ARRAY_DATA_SIZE! */
+{
+ dyn_array_t* mlog;
+
+ mtr->modifications = TRUE;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return(NULL);
+ }
+
+ mlog = &(mtr->log);
+
+ return(dyn_array_open(mlog, size));
+}
+
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+ mtr_t* mtr, /*!< in: mtr */
+ byte* ptr) /*!< in: buffer space from ptr up was not used */
+{
+ dyn_array_t* mlog;
+
+ ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE);
+
+ mlog = &(mtr->log);
+
+ dyn_array_close(mlog, ptr);
+}
+
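+/* Usage sketch (illustrative only). mlog_open() and mlog_close() bracket a
+hand-written log record, as in mlog_catenate_ulint_compressed() below:
+
+	log_ptr = mlog_open(mtr, 10);
+
+	if (log_ptr != NULL) {
+		log_ptr += mach_write_compressed(log_ptr, val);
+		mlog_close(mtr, log_ptr);
+	}
+
+A NULL return simply means that the mtr log mode is MTR_LOG_NONE. */
+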
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. The value is not compressed. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val, /*!< in: value to write */
+ ulint type) /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+{
+ dyn_array_t* mlog;
+ byte* ptr;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return;
+ }
+
+ mlog = &(mtr->log);
+
+#if MLOG_1BYTE != 1
+# error "MLOG_1BYTE != 1"
+#endif
+#if MLOG_2BYTES != 2
+# error "MLOG_2BYTES != 2"
+#endif
+#if MLOG_4BYTES != 4
+# error "MLOG_4BYTES != 4"
+#endif
+#if MLOG_8BYTES != 8
+# error "MLOG_8BYTES != 8"
+#endif
+ ptr = (byte*) dyn_array_push(mlog, type);
+
+ if (type == MLOG_4BYTES) {
+ mach_write_to_4(ptr, val);
+ } else if (type == MLOG_2BYTES) {
+ mach_write_to_2(ptr, val);
+ } else {
+ ut_ad(type == MLOG_1BYTE);
+ mach_write_to_1(ptr, val);
+ }
+}
+
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint val) /*!< in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 10);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ dulint val) /*!< in: value to write */
+{
+ byte* log_ptr;
+
+ log_ptr = mlog_open(mtr, 15);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ byte* log_ptr,/*!< in: pointer to mtr log which has
+ been opened */
+ mtr_t* mtr) /*!< in: mtr */
+{
+#ifdef UNIV_DEBUG
+ buf_block_t* block;
+#endif
+ const byte* page;
+ ulint space;
+ ulint offset;
+
+ ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(ptr && log_ptr);
+
+ page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+ space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ /* check whether the page is in the doublewrite buffer;
+ the doublewrite buffer is located in pages
+ FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
+ system tablespace */
+ if ((space == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
+ && offset >= (ulint)FSP_EXTENT_SIZE && offset < 3 * (ulint)FSP_EXTENT_SIZE) {
+ if (trx_doublewrite_buf_is_being_created) {
+ /* Do nothing: we only come to this branch in an
+ InnoDB database creation. We do not redo log
+ anything for the doublewrite buffer pages. */
+ return(log_ptr);
+ } else {
+ fprintf(stderr,
+ "Error: trying to redo log a record of type "
+ "%d on page %lu of space %lu in the "
+ "doublewrite buffer, continuing anyway.\n"
+ "Please post a bug report to "
+ "bugs.mysql.com.\n",
+ type, offset, space);
+ }
+ }
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, space);
+ log_ptr += mach_write_compressed(log_ptr, offset);
+
+ mtr->n_log_recs++;
+
+#ifdef UNIV_LOG_DEBUG
+ fprintf(stderr,
+ "Adding to mtr log record type %lu space %lu page no %lu\n",
+ (ulong) type, space, offset);
+#endif
+
+#ifdef UNIV_DEBUG
+ /* We now assume that all x-latched pages have been modified! */
+ block = (buf_block_t*) buf_block_align(ptr);
+
+ if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) {
+
+ mtr_memo_push(mtr, block, MTR_MEMO_MODIFY);
+ }
+#endif
+ return(log_ptr);
+}
+
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+ ulint type, /*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+ MLOG_FILE_RENAME */
+ ulint space_id,/*!< in: space id, if applicable */
+ ulint page_no,/*!< in: page number (not relevant currently) */
+ byte* log_ptr,/*!< in: pointer to mtr log which has been opened */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(log_ptr);
+
+ mach_write_to_1(log_ptr, type);
+ log_ptr++;
+
+ /* We write dummy space id and page number */
+ log_ptr += mach_write_compressed(log_ptr, space_id);
+ log_ptr += mach_write_compressed(log_ptr, page_no);
+
+ mtr->n_log_recs++;
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h
new file mode 100644
index 00000000000..bc3f1951be9
--- /dev/null
+++ b/storage/xtradb/include/mtr0mtr.h
@@ -0,0 +1,419 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0mtr_h
+#define mtr0mtr_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#include "dyn0dyn.h"
+#include "buf0types.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Logging modes for a mini-transaction */
+#define MTR_LOG_ALL 21 /* default mode: log all operations
+ modifying disk-based data */
+#define MTR_LOG_NONE 22 /* log no operations */
+/*#define MTR_LOG_SPACE 23 */ /* log only operations modifying
+ file space page allocation data
+ (operations in fsp0fsp.* ) */
+#define MTR_LOG_SHORT_INSERTS 24 /* inserts are logged in a shorter
+ form */
+
+/* Types for the mlock objects to store in the mtr memo; NOTE that the
+first 3 values must be RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+#define MTR_MEMO_PAGE_S_FIX RW_S_LATCH
+#define MTR_MEMO_PAGE_X_FIX RW_X_LATCH
+#define MTR_MEMO_BUF_FIX RW_NO_LATCH
+#define MTR_MEMO_MODIFY 54
+#define MTR_MEMO_S_LOCK 55
+#define MTR_MEMO_X_LOCK 56
+
+/** @name Log item types
+The log items are declared 'byte' so that the compiler can warn if val
+and type parameters are switched in a call to mlog_write_ulint. NOTE!
+For 1 - 8 bytes, the flag value must give the length also! @{ */
+#define MLOG_SINGLE_REC_FLAG 128 /*!< if the mtr contains only
+ one log record for one page,
+ i.e., write_initial_log_record
+ has been called only once,
+ this flag is ORed to the type
+ of that first log record */
+#define MLOG_1BYTE (1) /*!< one byte is written */
+#define MLOG_2BYTES (2) /*!< 2 bytes ... */
+#define MLOG_4BYTES (4) /*!< 4 bytes ... */
+#define MLOG_8BYTES (8) /*!< 8 bytes ... */
+#define MLOG_REC_INSERT ((byte)9) /*!< record insert */
+#define MLOG_REC_CLUST_DELETE_MARK ((byte)10) /*!< mark clustered index record
+ deleted */
+#define MLOG_REC_SEC_DELETE_MARK ((byte)11) /*!< mark secondary index record
+ deleted */
+#define MLOG_REC_UPDATE_IN_PLACE ((byte)13) /*!< update of a record,
+ preserves record field sizes */
+#define MLOG_REC_DELETE ((byte)14) /*!< delete a record from a
+ page */
+#define MLOG_LIST_END_DELETE ((byte)15) /*!< delete record list end on
+ index page */
+#define MLOG_LIST_START_DELETE ((byte)16) /*!< delete record list start on
+ index page */
+#define MLOG_LIST_END_COPY_CREATED ((byte)17) /*!< copy record list end to a
+ newly created index page */
+#define MLOG_PAGE_REORGANIZE ((byte)18) /*!< reorganize an
+ index page in
+ ROW_FORMAT=REDUNDANT */
+#define MLOG_PAGE_CREATE ((byte)19) /*!< create an index page */
+#define MLOG_UNDO_INSERT ((byte)20) /*!< insert entry in an undo
+ log */
+#define MLOG_UNDO_ERASE_END ((byte)21) /*!< erase an undo log
+ page end */
+#define MLOG_UNDO_INIT ((byte)22) /*!< initialize a page in an
+ undo log */
+#define MLOG_UNDO_HDR_DISCARD ((byte)23) /*!< discard an update undo log
+ header */
+#define MLOG_UNDO_HDR_REUSE ((byte)24) /*!< reuse an insert undo log
+ header */
+#define MLOG_UNDO_HDR_CREATE ((byte)25) /*!< create an undo
+ log header */
+#define MLOG_REC_MIN_MARK ((byte)26) /*!< mark an index
+ record as the
+ predefined minimum
+ record */
+#define MLOG_IBUF_BITMAP_INIT ((byte)27) /*!< initialize an
+ ibuf bitmap page */
+/*#define MLOG_FULL_PAGE ((byte)28) full contents of a page */
+#ifdef UNIV_LOG_LSN_DEBUG
+# define MLOG_LSN ((byte)28) /* current LSN */
+#endif
+#define MLOG_INIT_FILE_PAGE ((byte)29) /*!< this means that a
+ file page is taken
+ into use and the prior
+ contents of the page
+ should be ignored: in
+ recovery we must not
+ trust the lsn values
+ stored to the file
+ page */
+#define MLOG_WRITE_STRING ((byte)30) /*!< write a string to
+ a page */
+#define MLOG_MULTI_REC_END ((byte)31) /*!< if a single mtr writes
+ several log records,
+ this log record ends the
+ sequence of these records */
+#define MLOG_DUMMY_RECORD ((byte)32) /*!< dummy log record used to
+ pad a log block full */
+#define MLOG_FILE_CREATE ((byte)33) /*!< log record about an .ibd
+ file creation */
+#define MLOG_FILE_RENAME ((byte)34) /*!< log record about an .ibd
+ file rename */
+#define MLOG_FILE_DELETE ((byte)35) /*!< log record about an .ibd
+ file deletion */
+#define MLOG_COMP_REC_MIN_MARK ((byte)36) /*!< mark a compact
+ index record as the
+ predefined minimum
+ record */
+#define MLOG_COMP_PAGE_CREATE ((byte)37) /*!< create a compact
+ index page */
+#define MLOG_COMP_REC_INSERT ((byte)38) /*!< compact record insert */
+#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39)
+ /*!< mark compact
+ clustered index record
+ deleted */
+#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/*!< mark compact
+ secondary index record
+ deleted; this log
+ record type is
+ redundant, as
+ MLOG_REC_SEC_DELETE_MARK
+ is independent of the
+ record format. */
+#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/*!< update of a
+ compact record,
+ preserves record field
+ sizes */
+#define MLOG_COMP_REC_DELETE ((byte)42) /*!< delete a compact record
+ from a page */
+#define MLOG_COMP_LIST_END_DELETE ((byte)43) /*!< delete compact record list
+ end on index page */
+#define MLOG_COMP_LIST_START_DELETE ((byte)44) /*!< delete compact record list
+ start on index page */
+#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45)
+ /*!< copy compact
+ record list end to a
+ newly created index
+ page */
+#define MLOG_COMP_PAGE_REORGANIZE ((byte)46) /*!< reorganize an index page */
+#define MLOG_FILE_CREATE2 ((byte)47) /*!< log record about creating
+ an .ibd file, with format */
+#define MLOG_ZIP_WRITE_NODE_PTR ((byte)48) /*!< write the node pointer of
+ a record on a compressed
+ non-leaf B-tree page */
+#define MLOG_ZIP_WRITE_BLOB_PTR ((byte)49) /*!< write the BLOB pointer
+ of an externally stored column
+ on a compressed page */
+#define MLOG_ZIP_WRITE_HEADER ((byte)50) /*!< write to compressed page
+ header */
+#define MLOG_ZIP_PAGE_COMPRESS ((byte)51) /*!< compress an index page */
+#define MLOG_BIGGEST_TYPE ((byte)51) /*!< biggest value (used in
+ assertions) */
+/* @} */
+
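+/* Parsing sketch (illustrative only): because MLOG_SINGLE_REC_FLAG may be
+ORed into the stored type byte, a reader of the log first separates the
+flag from the type, roughly:
+
+	single_rec = (ulint) *ptr & MLOG_SINGLE_REC_FLAG;
+	type = (byte) ((ulint) *ptr & ~MLOG_SINGLE_REC_FLAG);
+*/
+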
+/** @name Flags for MLOG_FILE operations
+(stored in the page number parameter, called log_flags in the
+functions). The page number parameter was originally written as 0. @{ */
+#define MLOG_FILE_FLAG_TEMP 1 /*!< identifies TEMPORARY TABLE in
+ MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */
+/* @} */
+
+/***************************************************************//**
+Starts a mini-transaction and creates a mini-transaction handle
+and buffer in the memory buffer given by the caller.
+@return mtr buffer which also acts as the mtr handle */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+ mtr_t* mtr); /*!< in: memory buffer for the mtr buffer */
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+ mtr_t* mtr); /*!< in: mini-transaction */
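+/* Usage sketch (illustrative only; page and FIELD_OFFSET are made-up names).
+Every redo-logged page change happens between these two calls; mtr_commit()
+writes the accumulated log and releases the latches held in the memo:
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+
+	(latch the page, e.g. with buf_page_get(), and modify it)
+
+	mlog_write_ulint(page + FIELD_OFFSET, val, MLOG_4BYTES, &mtr);
+
+	mtr_commit(&mtr);
+*/
+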
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return savepoint */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************//**
+Releases the latches stored in an mtr memo down to a savepoint.
+NOTE! The mtr must not have made changes to buffer pages after the
+savepoint, as these can be handled only by mtr_commit. */
+UNIV_INTERN
+void
+mtr_rollback_to_savepoint(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint); /*!< in: savepoint */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ rw_lock_t* lock); /*!< in: latch to release */
+#else /* !UNIV_HOTBACKUP */
+# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Changes the logging mode of a mini-transaction.
+@return old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode); /*!< in: logging mode: MTR_LOG_NONE, ... */
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+/********************************************************//**
+Reads 8 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+dulint
+mtr_read_dulint(
+/*============*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+This macro locks an rw-lock in s-mode. */
+#define mtr_s_lock(B, MTR) mtr_s_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+/*********************************************************************//**
+This macro locks an rw-lock in x-mode. */
+#define mtr_x_lock(B, MTR) mtr_x_lock_func((B), __FILE__, __LINE__,\
+ (MTR))
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************//**
+Releases an object in the memo stack. */
+UNIV_INTERN
+void
+mtr_memo_release(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the given item.
+@return TRUE if contains */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+ mtr_t* mtr, /*!< in: mtr */
+ const void* object, /*!< in: object to search */
+ ulint type); /*!< in: type of object */
+
+/**********************************************************//**
+Checks if memo contains the given page.
+@return TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* ptr, /*!< in: pointer to buffer frame */
+ ulint type); /*!< in: type of object */
+/*********************************************************//**
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+ mtr_t* mtr); /*!< in: mtr */
+# else /* !UNIV_HOTBACKUP */
+# define mtr_memo_contains(mtr, object, type) TRUE
+# define mtr_memo_contains_page(mtr, ptr, type) TRUE
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+/*######################################################################*/
+
+#define MTR_BUF_MEMO_SIZE 200 /* number of slots in memo */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ mtr_t* mtr); /*!< in: mini-transaction */
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type); /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+
+
+/* Type definition of a mini-transaction memo stack slot. */
+typedef struct mtr_memo_slot_struct mtr_memo_slot_t;
+struct mtr_memo_slot_struct{
+ ulint type; /*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */
+ void* object; /*!< pointer to the object */
+};
+
+/* Mini-transaction handle and buffer */
+struct mtr_struct{
+#ifdef UNIV_DEBUG
+ ulint state; /*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED */
+#endif
+ dyn_array_t memo; /*!< memo stack for locks etc. */
+ dyn_array_t log; /*!< mini-transaction log */
+ ibool modifications;
+ /* TRUE if the mtr made modifications to
+ buffer pool pages */
+ ulint n_log_recs;
+ /* count of how many page initial log records
+ have been written to the mtr log */
+ ulint log_mode; /* specifies which operations should be
+ logged; default value MTR_LOG_ALL */
+ ib_uint64_t start_lsn;/* start lsn of the possible log entry for
+ this mtr */
+ ib_uint64_t end_lsn;/* end lsn of the possible log entry for
+ this mtr */
+#ifdef UNIV_DEBUG
+ ulint magic_n;
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_DEBUG
+# define MTR_MAGIC_N 54551
+#endif /* UNIV_DEBUG */
+
+#define MTR_ACTIVE 12231
+#define MTR_COMMITTING 56456
+#define MTR_COMMITTED 34676
+
+#ifndef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mtr0mtr.ic b/storage/xtradb/include/mtr0mtr.ic
new file mode 100644
index 00000000000..18f8e87b3cf
--- /dev/null
+++ b/storage/xtradb/include/mtr0mtr.ic
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.ic
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mach0data.h"
+
+/***************************************************************//**
+Starts a mini-transaction and creates a mini-transaction handle
+and a buffer in the memory buffer given by the caller.
+@return mtr buffer which also acts as the mtr handle */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+ mtr_t* mtr) /*!< in: memory buffer for the mtr buffer */
+{
+ dyn_array_create(&(mtr->memo));
+ dyn_array_create(&(mtr->log));
+
+ mtr->log_mode = MTR_LOG_ALL;
+ mtr->modifications = FALSE;
+ mtr->n_log_recs = 0;
+
+ ut_d(mtr->state = MTR_ACTIVE);
+ ut_d(mtr->magic_n = MTR_MAGIC_N);
+
+ return(mtr);
+}
+
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+ dyn_array_t* memo;
+ mtr_memo_slot_t* slot;
+
+ ut_ad(object);
+ ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+ ut_ad(type <= MTR_MEMO_X_LOCK);
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ slot = (mtr_memo_slot_t*) dyn_array_push(memo, sizeof *slot);
+
+ slot->object = object;
+ slot->type = type;
+}
+
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return savepoint */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ return(dyn_array_get_data_size(memo));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint, /*!< in: savepoint */
+ rw_lock_t* lock) /*!< in: latch to release */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ ut_ad(dyn_array_get_data_size(memo) > savepoint);
+
+ slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, savepoint);
+
+ ut_ad(slot->object == lock);
+ ut_ad(slot->type == MTR_MEMO_S_LOCK);
+
+ rw_lock_s_unlock(lock);
+
+ slot->object = NULL;
+}
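Illustrative sketch (not part of the patch) of how a savepoint pairs with the
release function above; the tree latch is a hypothetical rw_lock_t owned by
the caller, and mtr_commit() is declared in mtr0mtr.h.

/* Sketch only: take an s-latch, then drop just that latch early. */
static void
example_release_at_savepoint(rw_lock_t* tree_latch)
{
	mtr_t	mtr;
	ulint	savepoint;

	mtr_start(&mtr);

	savepoint = mtr_set_savepoint(&mtr);	/* memo size before the push */
	mtr_s_lock(tree_latch, &mtr);		/* its slot sits at 'savepoint' */

	/* ... the latch turns out not to be needed any longer ... */

	mtr_release_s_latch_at_savepoint(&mtr, savepoint, tree_latch);

	mtr_commit(&mtr);
}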
+
+# ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks if memo contains the given item.
+@return TRUE if contains */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+ mtr_t* mtr, /*!< in: mtr */
+ const void* object, /*!< in: object to search */
+ ulint type) /*!< in: type of object */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+
+ slot = dyn_array_get_element(memo, offset);
+
+ if ((object == slot->object) && (type == slot->type)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+
+ return(&(mtr->log));
+}
+
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(mtr->log_mode >= MTR_LOG_ALL);
+ ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(mtr->log_mode);
+}
+
+/***************************************************************//**
+Changes the logging mode of a mini-transaction.
+@return old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode) /*!< in: logging mode: MTR_LOG_NONE, ... */
+{
+ ulint old_mode;
+
+ ut_ad(mtr);
+ ut_ad(mode >= MTR_LOG_ALL);
+ ut_ad(mode <= MTR_LOG_SHORT_INSERTS);
+
+ old_mode = mtr->log_mode;
+
+	if ((mode == MTR_LOG_SHORT_INSERTS) && (old_mode == MTR_LOG_NONE)) {
+		/* Do nothing: logging is already disabled for this mtr,
+		and asking for short-insert logging must not re-enable it */
+	} else {
+ mtr->log_mode = mode;
+ }
+
+ ut_ad(old_mode >= MTR_LOG_ALL);
+ ut_ad(old_mode <= MTR_LOG_SHORT_INSERTS);
+
+ return(old_mode);
+}
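Illustrative sketch (not part of the patch): the usual save-and-restore
pattern around mtr_set_log_mode() when redo logging must be suppressed for a
few page operations.

/* Sketch only: temporarily disable redo logging for one mtr. */
static void
example_unlogged_changes(mtr_t* mtr)
{
	ulint	old_mode;

	old_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

	/* ... page changes that must not generate redo log records ... */

	mtr_set_log_mode(mtr, old_mode);	/* restore the previous mode */
}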
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Locks a lock in s-mode. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_s_lock_func(lock, 0, file, line);
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK);
+}
+
+/*********************************************************************//**
+Locks a lock in x-mode. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ const char* file, /*!< in: file name */
+ ulint line, /*!< in: line number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mtr);
+ ut_ad(lock);
+
+ rw_lock_x_lock_func(lock, 0, file, line);
+
+ mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/mtr0types.h b/storage/xtradb/include/mtr0types.h
new file mode 100644
index 00000000000..83a7aaf3839
--- /dev/null
+++ b/storage/xtradb/include/mtr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0types_h
+#define mtr0types_h
+
+typedef struct mtr_struct mtr_t;
+
+#endif
diff --git a/storage/xtradb/include/mysql_addons.h b/storage/xtradb/include/mysql_addons.h
new file mode 100644
index 00000000000..17660c18710
--- /dev/null
+++ b/storage/xtradb/include/mysql_addons.h
@@ -0,0 +1,33 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mysql_addons.h
+This file contains functions that need to be added to
+MySQL code but have not been added yet.
+
+Whenever you add a function here submit a MySQL bug
+report (feature request) with the implementation. Then
+write the bug number in the comment before the
+function in this file.
+
+When MySQL commits the function it can be deleted from
+here. In a perfect world this file exists but is empty.
+
+Created November 07, 2007 Vasil Dimov
+*******************************************************/
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
new file mode 100644
index 00000000000..eeab8a2b5d9
--- /dev/null
+++ b/storage/xtradb/include/os0file.h
@@ -0,0 +1,794 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "univ.i"
+#include "trx0types.h"
+
+#ifndef __WIN__
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif
+
+/** File node of a tablespace or the log data space */
+typedef struct fil_node_struct fil_node_t;
+
+#ifdef UNIV_DO_FLUSH
+extern ibool os_do_not_call_flush_at_each_write;
+#endif /* UNIV_DO_FLUSH */
+extern ibool os_has_said_disk_full;
+/** Flag: enable debug printout for asynchronous i/o */
+extern ibool os_aio_print_debug;
+
+/** Number of pending os_file_pread() operations */
+extern ulint os_file_n_pending_preads;
+/** Number of pending os_file_pwrite() operations */
+extern ulint os_file_n_pending_pwrites;
+
+/** Number of pending read operations */
+extern ulint os_n_pending_reads;
+/** Number of pending write operations */
+extern ulint os_n_pending_writes;
+
+#ifdef __WIN__
+
+/** We always define WIN_ASYNC_IO, and check at run-time whether
+ the OS actually supports it: Win 95 does not, NT does. */
+#define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+#define UNIV_NON_BUFFERED_IO
+
+#endif
+
+#ifdef __WIN__
+/** File handle */
+#define os_file_t HANDLE
+/** Convert a C file descriptor to a native file handle
+@param fd file descriptor
+@return native file handle */
+#define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
+#else
+/** File handle */
+typedef int os_file_t;
+/** Convert a C file descriptor to a native file handle
+@param fd file descriptor
+@return native file handle */
+#define OS_FILE_FROM_FD(fd) fd
+#endif
+
+/** Umask for creating files */
+extern ulint os_innodb_umask;
+
+/** If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in); otherwise we will
+use the simulated aio that we build below with threads */
+
+extern ibool os_aio_use_native_aio;
+
+/** The next value should be smaller than or equal to the smallest sector size used
+on any disk. A log block is required to be a portion of disk which is written
+so that if the start and the end of a block get written to disk, then the
+whole block gets written. This should be true even in most cases of a crash:
+if this fails for a log block, then it is equivalent to a media failure in the
+log. */
+
+#define OS_FILE_LOG_BLOCK_SIZE 512
+
+/** Options for file_create @{ */
+#define OS_FILE_OPEN 51
+#define OS_FILE_CREATE 52
+#define OS_FILE_OVERWRITE 53
+#define OS_FILE_OPEN_RAW 54
+#define OS_FILE_CREATE_PATH 55
+#define OS_FILE_OPEN_RETRY 56 /* for os_file_create() on
+ the first ibdata file */
+
+#define OS_FILE_READ_ONLY 333
+#define OS_FILE_READ_WRITE 444
+#define OS_FILE_READ_ALLOW_DELETE 555 /* for ibbackup */
+
+/* Options for file_create */
+#define OS_FILE_AIO 61
+#define OS_FILE_NORMAL 62
+/* @} */
+
+/** Types for file create @{ */
+#define OS_DATA_FILE 100
+#define OS_LOG_FILE 101
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+#define OS_FILE_NOT_FOUND 71
+#define OS_FILE_DISK_FULL 72
+#define OS_FILE_ALREADY_EXISTS 73
+#define OS_FILE_PATH_ERROR 74
+#define OS_FILE_AIO_RESOURCES_RESERVED 75 /* wait for OS aio resources
+ to become available again */
+#define OS_FILE_SHARING_VIOLATION 76
+#define OS_FILE_ERROR_NOT_SPECIFIED 77
+#define OS_FILE_INSUFFICIENT_RESOURCE 78
+#define OS_FILE_OPERATION_ABORTED 79
+/* @} */
+
+/** Types for aio operations @{ */
+#define OS_FILE_READ 10
+#define OS_FILE_WRITE 11
+
+#define OS_FILE_LOG 256 /* This can be ORed to type */
+/* @} */
+
+#define OS_AIO_N_PENDING_IOS_PER_THREAD 32 /*!< Win NT does not allow more
+ than 64 */
+
+/** Modes for aio operations @{ */
+#define OS_AIO_NORMAL 21 /*!< Normal asynchronous i/o not for ibuf
+ pages or ibuf bitmap pages */
+#define OS_AIO_IBUF 22 /*!< Asynchronous i/o for ibuf pages or ibuf
+ bitmap pages */
+#define OS_AIO_LOG 23 /*!< Asynchronous i/o for the log */
+#define OS_AIO_SYNC 24 /*!< Asynchronous i/o where the calling thread
+ will itself wait for the i/o to complete,
+ doing also the job of the i/o-handler thread;
+ can be used for any pages, ibuf or non-ibuf.
+ This is used to save CPU time, as we can do
+ with fewer thread switches. Plain synchronous
+ i/o is not as good, because it must serialize
+ the file seek and read or write, causing a
+ bottleneck for parallelism. */
+
+#define OS_AIO_SIMULATED_WAKE_LATER 512 /*!< This can be ORed to mode
+ in the call of os_aio(...),
+ if the caller wants to post several i/o
+ requests in a batch, and only after that
+ wake the i/o-handler thread; this has
+ effect only in simulated aio */
+/* @} */
+
+#define OS_WIN31 1 /*!< Microsoft Windows 3.x */
+#define OS_WIN95 2 /*!< Microsoft Windows 95 */
+#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */
+#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */
+
+extern ulint os_n_file_reads;
+extern ulint os_n_file_writes;
+extern ulint os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_enum{
+ OS_FILE_TYPE_UNKNOWN = 0,
+ OS_FILE_TYPE_FILE, /* regular file */
+ OS_FILE_TYPE_DIR, /* directory */
+ OS_FILE_TYPE_LINK /* symbolic link */
+};
+typedef enum os_file_type_enum os_file_type_t;
+
+/* Maximum path string length in bytes when referring to tables in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes */
+#define OS_FILE_MAX_PATH 4000
+
+/* Struct used in fetching information of a file in a directory */
+struct os_file_stat_struct{
+ char name[OS_FILE_MAX_PATH]; /*!< path to a file */
+ os_file_type_t type; /*!< file type */
+ ib_int64_t size; /*!< file size */
+ time_t ctime; /*!< creation time */
+ time_t mtime; /*!< modification time */
+ time_t atime; /*!< access time */
+};
+typedef struct os_file_stat_struct os_file_stat_t;
+
+#ifdef __WIN__
+typedef HANDLE os_file_dir_t; /*!< directory stream */
+#else
+typedef DIR* os_file_dir_t; /*!< directory stream */
+#endif
+
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
+UNIV_INTERN
+ulint
+os_get_os_version(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void);
+/*===================*/
+/***********************************************************************//**
+Creates a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+On Netware, this function is like tmpfile(3), because the C run-time
+library of Netware does not expose the delete-on-close flag.
+@return temporary file handle, or NULL on error */
+
+FILE*
+os_file_create_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ const char* dirname, /*!< in: directory name; it must not
+ contain a trailing '\' or '/' */
+ ibool error_is_fatal);/*!< in: TRUE if we should treat an
+ error as a fatal error; if we try to
+ open symlinks then we do not wish a
+ fatal error if it happens not to be
+ a directory */
+/***********************************************************************//**
+Closes a directory stream.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+ os_file_dir_t dir); /*!< in: directory stream */
+/***********************************************************************//**
+This function returns information about the next file in the directory. We jump
+over the '.' and '..' entries in the directory.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info); /*!< in/out: buffer where the info is returned */
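Illustrative sketch (not part of the patch) of how the three directory
functions above are combined; the directory name is whatever the caller
passes in.

/* Sketch only: iterate over the entries of a directory. */
static void
example_list_dir(const char* dirname)
{
	os_file_dir_t	dir;
	os_file_stat_t	info;
	int		ret;

	dir = os_file_opendir(dirname, FALSE);

	if (dir == NULL) {

		return;		/* could not open the directory */
	}

	while ((ret = os_file_readdir_next_file(dirname, dir, &info)) == 0) {
		/* info.name, info.type and info.size describe one entry */
	}

	/* ret == 1 means end of directory, ret == -1 means an error */

	os_file_closedir(dir);
}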
+/*****************************************************************//**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+ const char* pathname, /*!< in: directory name as
+ null-terminated string */
+ ibool fail_if_exists);/*!< in: if TRUE, pre-existing directory
+ is treated as an error. */
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+ opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error), or
+ OS_FILE_CREATE_PATH if new file
+ (if exists, error) and subdirectories along
+ its path are created (if needed)*/
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
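Illustrative sketch (not part of the patch): typical use of the call above
together with os_file_get_last_error() and os_file_close(), both declared
later in this header.

/* Sketch only: open an existing file read/write and close it again. */
static void
example_open_close(const char* path)
{
	ibool		success;
	os_file_t	file;

	file = os_file_create_simple(path, OS_FILE_OPEN,
				     OS_FILE_READ_WRITE, &success);
	if (!success) {
		/* TRUE also asks for a diagnostic printout */
		if (os_file_get_last_error(TRUE) == OS_FILE_NOT_FOUND) {
			/* ... the file simply does not exist ... */
		}

		return;
	}

	/* ... use the handle ... */

	os_file_close(file);
}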
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling(
+/*====================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error) */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+ int fd, /*!< in: file descriptor to alter */
+ const char* file_name, /*!< in: file name, used in the
+ diagnostic message */
+ const char* operation_name);/*!< in: "open" or "create"; used in the
+ diagnostic message */
+/****************************************************************//**
+Opens an existing file or creates a new one.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create(
+/*===========*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error),
+ OS_FILE_OVERWRITE if a new file is created
+ or an old overwritten;
+ OS_FILE_OPEN_RAW, if a raw device or disk
+ partition should be opened */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete(
+/*===========*/
+ const char* name); /*!< in: file path as a null-terminated string */
+
+/***********************************************************************//**
+Deletes a file if it exists. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+ const char* name); /*!< in: file path as a null-terminated string */
+/***********************************************************************//**
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename(
+/*===========*/
+ const char* oldpath, /*!< in: old file path as a
+ null-terminated string */
+ const char* newpath); /*!< in: new file path */
+/***********************************************************************//**
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close(
+/*==========*/
+ os_file_t file); /*!< in, own: handle to a file */
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+ os_file_t file); /*!< in, own: handle to a file */
+#endif /* UNIV_HOTBACKUP */
+/***********************************************************************//**
+Gets a file size.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_get_size(
+/*=============*/
+ os_file_t file, /*!< in: handle to a file */
+ ulint* size, /*!< out: least significant 32 bits of file
+ size */
+ ulint* size_high);/*!< out: most significant 32 bits of size */
+/***********************************************************************//**
+Gets file size as a 64-bit integer ib_int64_t.
+@return size in bytes, -1 if error */
+UNIV_INTERN
+ib_int64_t
+os_file_get_size_as_iblonglong(
+/*===========================*/
+ os_file_t file); /*!< in: handle to a file */
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ ulint size, /*!< in: least significant 32 bits of file
+ size */
+ ulint size_high);/*!< in: most significant 32 bits of size */
+/***********************************************************************//**
+Truncates a file at its current position.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+ FILE* file); /*!< in: file to be truncated */
+/***********************************************************************//**
+Flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush(
+/*==========*/
+ os_file_t file); /*!< in, own: handle to a file */
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+ ibool report_all_errors); /*!< in: TRUE if we want an error message
+ printed of all errors */
+/*******************************************************************//**
+Requests a synchronous read operation.
+@return TRUE if request was successful, FALSE if fail */
+#define os_file_read(file, buf, offset, offset_high, n) \
+ _os_file_read(file, buf, offset, offset_high, n, NULL)
+
+UNIV_INTERN
+ibool
+_os_file_read(
+/*=========*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read */
+ trx_t* trx);
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+ FILE* file, /*!< in: file to read from */
+ char* str, /*!< in: buffer where to read */
+ ulint size); /*!< in: size of buffer */
+/*******************************************************************//**
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n); /*!< in: number of bytes to read */
+
+/*******************************************************************//**
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write(
+/*==========*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high,/*!< in: most significant 32 bits of
+ offset */
+ ulint n); /*!< in: number of bytes to write */
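Illustrative sketch (not part of the patch): os_file_read() and
os_file_write() take a 64-bit byte offset split into two 32-bit halves; the
handle and buffer come from the caller.

/* Sketch only: write n bytes at a 64-bit byte offset. */
static ibool
example_write_at(const char* name, os_file_t file, const void* buf,
		 ib_int64_t byte_offset, ulint n)
{
	ulint	offset	    = (ulint) (byte_offset & 0xFFFFFFFFUL);
	ulint	offset_high = (ulint) (byte_offset >> 32);

	return(os_file_write(name, file, buf, offset, offset_high, n));
}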
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+ const char* path, /*!< in: pathname of the file */
+ ibool* exists, /*!< out: TRUE if file exists */
+ os_file_type_t* type); /*!< out: type of the file (if it exists) */
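Illustrative sketch (not part of the patch): combining the existence flag and
the file type returned by os_file_status().

/* Sketch only: check that a path exists and is a regular file. */
static ibool
example_is_regular_file(const char* path)
{
	ibool		exists;
	os_file_type_t	type;

	if (!os_file_status(path, &exists, &type)) {

		return(FALSE);	/* the status call itself failed */
	}

	return(exists && type == OS_FILE_TYPE_FILE);
}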
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from the heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+ path dirname basename
+ "/usr/lib" "/usr" "lib"
+ "/usr/" "/" "usr"
+ "usr" "." "usr"
+ "/" "/" "/"
+ "." "." "."
+ ".." "." ".."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+ const char* path); /*!< in: pathname */
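Illustrative sketch (not part of the patch): the returned directory component
is a heap copy that the caller must release; mem_free() from mem0mem.h is
assumed here to be the matching release call.

/* Sketch only: take the directory component of an .ibd path. */
static void
example_dirname(void)
{
	char*	dir = os_file_dirname("./databasename/tablename.ibd");

	/* dir now holds "./databasename" */

	mem_free(dir);	/* assumed release primitive for the heap copy */
}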
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return TRUE if call succeeded, FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+ const char* path); /*!< in: path name */
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that */
+UNIV_INTERN
+void
+os_aio_init(
+/*========*/
+	ulint	n_per_seg,	/*!< in: maximum number of pending aio
+				operations allowed per segment */
+	ulint	n_read_segs,	/*!< in: number of reader threads */
+	ulint	n_write_segs,	/*!< in: number of writer threads */
+	ulint	n_slots_sync);	/*!< in: number of slots in the sync aio
+				array */
+/***********************************************************************
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void);
+/*=============*/
+
+/*******************************************************************//**
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read or write */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ trx_t* trx);
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void);
+/*=====================================*/
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/
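Illustrative sketch (not part of the patch): posting a batch of simulated-aio
reads with the wake-later flag and waking the handler threads afterwards, as
the comments above describe; every file, buffer and message argument is
supplied by a hypothetical caller.

/* Sketch only: post a batch of reads, then wake the i/o-handler threads. */
static void
example_post_read_batch(
	const char*	name,		/* file name, for diagnostics */
	os_file_t	file,		/* open file handle */
	void**		bufs,		/* one read buffer per request */
	ulint*		offsets,	/* low 32 bits of each offset */
	ulint*		offsets_high,	/* high 32 bits of each offset */
	fil_node_t**	nodes,		/* aio messages, one per request */
	ulint		n_pages,	/* number of reads to post */
	ulint		page_size)	/* bytes per read */
{
	ulint	i;

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = 0; i < n_pages; i++) {
		os_aio(OS_FILE_READ,
		       OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER,
		       name, file, bufs[i],
		       offsets[i], offsets_high[i], page_size,
		       nodes[i], NULL, NULL);
	}

	/* without this the posted requests could be left sleeping */
	os_aio_simulated_wake_handler_threads();
}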
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads; if
+ this is ULINT_UNDEFINED, then it means that
+ sync aio is used, and this parameter is
+ ignored */
+ ulint pos, /*!< this parameter is used only in sync aio:
+ wait for the aio slot at this position */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type); /*!< out: OS_FILE_WRITE or ..._READ */
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void);
+/*=================*/
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+ FILE* file); /*!< in: file where to print */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void);
+/*======================*/
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void);
+/*=======================*/
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+This function returns information about the specified file.
+@return TRUE if stat information found */
+UNIV_INTERN
+ibool
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info); /*!< information of a file in a
+ directory */
+
+#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
+/*********************************************************************//**
+Creates a temporary file that will be deleted on close.
+This function is defined in ha_innodb.cc.
+@return temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
+
+#endif
diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h
new file mode 100644
index 00000000000..582cef6f803
--- /dev/null
+++ b/storage/xtradb/include/os0proc.h
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.h
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0proc_h
+#define os0proc_h
+
+#include "univ.i"
+
+#ifdef UNIV_LINUX
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#else
+# if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
+#include <sys/ipc.h>
+#include <sys/shm.h>
+# endif
+#endif
+
+typedef void* os_process_t;
+typedef unsigned long int os_process_id_t;
+
+extern ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+extern ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. On Linux this returns the 'process number' of the current
+thread, i.e. the number one sees in 'top'; note that on Linux the thread id
+is not the same as the number shown in 'top'.
+@return process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void);
+/*====================*/
+/****************************************************************//**
+Allocates large pages memory.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+ ulint* n); /*!< in/out: number of bytes */
+/****************************************************************//**
+Frees large pages memory. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+ void *ptr, /*!< in: pointer returned by
+ os_mem_alloc_large() */
+ ulint size); /*!< in: size returned by
+ os_mem_alloc_large() */
+
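Illustrative sketch (not part of the patch): the size handed back through 'n'
may have been rounded up to a large-page multiple and must be passed to the
matching free call.

/* Sketch only: allocate and release a large-page block. */
static void
example_large_alloc(void)
{
	ulint	n = 16 * 1024 * 1024;	/* requested size in bytes */
	void*	block;

	block = os_mem_alloc_large(&n);	/* n may come back rounded up */

	/* ... use block[0 .. n - 1] ... */

	os_mem_free_large(block, n);	/* pass back the adjusted size */
}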
+
+/****************************************************************//**
+Allocates a shared memory segment, or attaches to and reuses an existing one.
+The content is not cleared automatically.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_shm_alloc(
+/*=========*/
+	ulint*	n,	/*!< in/out: number of bytes */
+	uint	key,	/*!< in: key of the shared memory segment */
+	ibool*	is_new);/*!< out: whether a new segment was created */
+
+/****************************************************************//**
+Detaches a shared memory segment. */
+UNIV_INTERN
+void
+os_shm_free(
+/*========*/
+ void *ptr, /*!< in: pointer returned by
+ os_shm_alloc() */
+ ulint size); /*!< in: size returned by
+ os_shm_alloc() */
+#ifndef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0proc.ic b/storage/xtradb/include/os0proc.ic
new file mode 100644
index 00000000000..c9641644525
--- /dev/null
+++ b/storage/xtradb/include/os0proc.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.ic
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h
new file mode 100644
index 00000000000..c230a03b6db
--- /dev/null
+++ b/storage/xtradb/include/os0sync.h
@@ -0,0 +1,445 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.h
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0sync_h
+#define os0sync_h
+
+#include "univ.i"
+#include "ut0lst.h"
+
+#ifdef __WIN__
+
+/** Native mutex */
+#define os_fast_mutex_t CRITICAL_SECTION
+
+/** Native event */
+typedef HANDLE os_native_event_t;
+
+/** Operating system event */
+typedef struct os_event_struct os_event_struct_t;
+/** Operating system event handle */
+typedef os_event_struct_t* os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event_struct {
+ os_native_event_t handle;
+ /*!< Windows event */
+ UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+ /*!< list of all created events */
+};
+#else
+/** Native mutex */
+typedef pthread_mutex_t os_fast_mutex_t;
+
+/** Operating system event */
+typedef struct os_event_struct os_event_struct_t;
+/** Operating system event handle */
+typedef os_event_struct_t* os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event_struct {
+ os_fast_mutex_t os_mutex; /*!< this mutex protects the next
+ fields */
+ ibool is_set; /*!< this is TRUE when the event is
+ in the signaled state, i.e., a thread
+ does not stop if it tries to wait for
+ this event */
+ ib_int64_t signal_count; /*!< this is incremented each time
+ the event becomes signaled */
+ pthread_cond_t cond_var; /*!< condition variable is used in
+ waiting for the event */
+ UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+ /*!< list of all created events */
+};
+#endif
+
+/** Operating system mutex */
+typedef struct os_mutex_struct os_mutex_str_t;
+/** Operating system mutex handle */
+typedef os_mutex_str_t* os_mutex_t;
+
+/** Denotes an infinite delay for os_event_wait_time() */
+#define OS_SYNC_INFINITE_TIME ((ulint)(-1))
+
+/** Return value of os_event_wait_time() when the time is exceeded */
+#define OS_SYNC_TIME_EXCEEDED 1
+
+/** Mutex protecting counts and the event and OS 'slow' mutex lists */
+extern os_mutex_t os_sync_mutex;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+extern ulint os_thread_count;
+
+extern ulint os_event_count;
+extern ulint os_mutex_count;
+extern ulint os_fast_mutex_count;
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists. */
+UNIV_INTERN
+void
+os_sync_init(void);
+/*==============*/
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. */
+UNIV_INTERN
+void
+os_sync_free(void);
+/*==============*/
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which may just have two states:
+signaled and nonsignaled. The created event is manual reset: it must be reset
+explicitly by calling os_event_reset().
+@return the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(
+/*============*/
+ const char* name); /*!< in: the name of the event, if NULL
+ the event is created without a name */
+/**********************************************************//**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+ os_event_t event); /*!< in: event to set */
+/**********************************************************//**
+Resets an event semaphore to the nonsignaled state. Threads that subsequently
+wait for the event will block.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low(). */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event); /*!< in: event to reset */
+/**********************************************************//**
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+ os_event_t event); /*!< in: event to free */
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state. If
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the
+waiting thread when the event becomes signaled (or immediately if the
+event is already in the signaled state).
+
+Typically, if the event has been signaled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in: event to wait */
+ ib_int64_t reset_sig_count);/*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+
+#define os_event_wait(event) os_event_wait_low(event, 0)
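Illustrative sketch (not part of the patch) of the reset-count protocol
described above; the event comes from os_event_create().

/* Sketch only: sleep without losing a set() that raced with our reset(). */
static void
example_wait_for_work(os_event_t event)
{
	ib_int64_t	sig_count;

	sig_count = os_event_reset(event);

	/* ... re-check the condition we are about to sleep on ... */

	/* returns at once if os_event_set() ran after the reset above */
	os_event_wait_low(event, sig_count);
}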
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time(
+/*===============*/
+ os_event_t event, /*!< in: event to wait */
+ ulint time); /*!< in: timeout in microseconds, or
+ OS_SYNC_INFINITE_TIME */
+#ifdef __WIN__
+/**********************************************************//**
+Waits for any event in an OS native event array. Returns as soon as at
+least one of them is signaled or becomes signaled.
+@return index of the event which was signaled */
+UNIV_INTERN
+ulint
+os_event_wait_multiple(
+/*===================*/
+ ulint n, /*!< in: number of events in the
+ array */
+ os_native_event_t* native_event_array);
+ /*!< in: pointer to an array of event
+ handles */
+#endif
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+@return the mutex handle */
+UNIV_INTERN
+os_mutex_t
+os_mutex_create(
+/*============*/
+ const char* name); /*!< in: the name of the mutex, if NULL
+ the mutex is created without a name */
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+ os_mutex_t mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+ os_mutex_t mutex); /*!< in: mutex to release */
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+ os_mutex_t mutex); /*!< in: mutex to free */
+/**********************************************************//**
+Acquires ownership of a fast mutex. Currently in Windows this is the same
+as os_fast_mutex_lock!
+@return 0 if success, != 0 if it was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock(
+/*=================*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to release */
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /*!< in: fast mutex */
+/**********************************************************//**
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to acquire */
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free(
+/*===============*/
+ os_fast_mutex_t* fast_mutex); /*!< in: mutex to free */
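+
+/* Usage sketch (illustrative only): an os_fast_mutex_t is initialized in
+place, optionally acquired with the non-blocking trylock, and freed when it
+is no longer needed. 'fmutex' is a hypothetical variable.
+
+	os_fast_mutex_t	fmutex;
+
+	os_fast_mutex_init(&fmutex);
+
+	if (os_fast_mutex_trylock(&fmutex) == 0) {
+		... we own the mutex ...
+		os_fast_mutex_unlock(&fmutex);
+	}
+
+	os_fast_mutex_free(&fmutex);
+*/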
+
+/**********************************************************//**
+Atomic compare-and-swap and increment for InnoDB (a usage sketch follows
+this #if/#endif block). */
+
+#if defined(HAVE_IB_GCC_ATOMIC_BUILTINS)
+
+#define HAVE_ATOMIC_BUILTINS
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap(ptr, old_val, new_val) \
+ __sync_bool_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+
+# ifdef HAVE_IB_ATOMIC_PTHREAD_T_GCC
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ os_compare_and_swap(ptr, old_val, new_val)
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use GCC atomic builtins"
+# else /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes use GCC atomic builtins, rw_locks do not"
+# endif /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment(ptr, amount) \
+ __sync_add_and_fetch(ptr, amount)
+
+# define os_atomic_increment_lint(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+ os_atomic_increment(ptr, amount)
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ __sync_lock_test_and_set(ptr, new_val)
+
+#elif defined(HAVE_IB_SOLARIS_ATOMICS)
+
+#define HAVE_ATOMIC_BUILTINS
+
+/* If we are not compiling with GCC, or GCC does not support the atomic
+intrinsics, and we are running on Solaris >= 10, use the Solaris atomic
+functions */
+
+#include <atomic.h>
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ (atomic_cas_ulong(ptr, old_val, new_val) == old_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ ((lint)atomic_cas_ulong((ulong_t*) ptr, old_val, new_val) == old_val)
+
+# ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS
+# if SIZEOF_PTHREAD_T == 4
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ ((pthread_t)atomic_cas_32(ptr, old_val, new_val) == old_val)
+# elif SIZEOF_PTHREAD_T == 8
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ ((pthread_t)atomic_cas_64(ptr, old_val, new_val) == old_val)
+# else
+# error "SIZEOF_PTHREAD_T != 4 or 8"
+# endif /* SIZEOF_PTHREAD_T CHECK */
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use Solaris atomic functions"
+# else /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes use Solaris atomic functions, rw_locks do not"
+# endif /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment_lint(ptr, amount) \
+ atomic_add_long_nv((ulong_t*) ptr, amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+ atomic_add_long_nv(ptr, amount)
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ atomic_swap_uchar(ptr, new_val)
+
+#elif defined(HAVE_WINDOWS_ATOMICS)
+
+#define HAVE_ATOMIC_BUILTINS
+
+/* On Windows, use Windows atomics / interlocked */
+# ifdef _WIN64
+# define win_cmp_and_xchg InterlockedCompareExchange64
+# define win_xchg_and_add InterlockedExchangeAdd64
+# else /* _WIN64 */
+# define win_cmp_and_xchg InterlockedCompareExchange
+# define win_xchg_and_add InterlockedExchangeAdd
+# endif
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+ (win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+ (win_cmp_and_xchg(ptr, new_val, old_val) == old_val)
+
+/* Windows thread objects can always be passed to Windows atomic functions */
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+ (InterlockedCompareExchange(ptr, new_val, old_val) == old_val)
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use Windows interlocked functions"
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment_lint(ptr, amount) \
+ (win_xchg_and_add(ptr, amount) + amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+ ((ulint) (win_xchg_and_add(ptr, amount) + amount))
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val.
+InterlockedExchange() operates on LONG, and the LONG will be
+clobbered */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+ ((byte) InterlockedExchange(ptr, new_val))
+
+#else
+# define IB_ATOMICS_STARTUP_MSG \
+ "Mutexes and rw_locks use InnoDB's own implementation"
+#endif
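+
+/* Usage sketch (illustrative only, applicable when HAVE_ATOMIC_BUILTINS is
+defined above): an atomic increment and a compare-and-swap retry loop on a
+hypothetical ulint counter.
+
+	ulint	counter	= 0;
+	ulint	old_val;
+
+	os_atomic_increment_ulint(&counter, 1);
+
+	do {
+		old_val = counter;
+	} while (!os_compare_and_swap_ulint(&counter, old_val,
+					    old_val + 10));
+*/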
+
+#ifndef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic
new file mode 100644
index 00000000000..1f3ce38fa65
--- /dev/null
+++ b/storage/xtradb/include/os0sync.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.ic
+The interface to the operating system synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifdef __WIN__
+#include <winbase.h>
+#endif
+
+/**********************************************************//**
+Acquires ownership of a fast mutex if it is available. NOTE: on Windows
+this currently blocks, just like os_fast_mutex_lock()!
+@return 0 if success, != 0 if the mutex was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+ os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+ EnterCriticalSection(fast_mutex);
+
+ return(0);
+#else
+ /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
+ so that it returns 0 on success. In the operating system
+ libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and
+ returns 1 on success (but MySQL remaps that to 0), while Linux,
+ FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
+
+ return((ulint) pthread_mutex_trylock(fast_mutex));
+#endif
+}
diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h
new file mode 100644
index 00000000000..6583de0005f
--- /dev/null
+++ b/storage/xtradb/include/os0thread.h
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.h
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Maximum number of threads which can be created in the program;
+this is also the size of the wait slot array for MySQL threads which
+can wait inside InnoDB */
+
+#define OS_THREAD_MAX_N srv_max_n_threads
+
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE 100
+#define OS_THREAD_PRIORITY_BACKGROUND 1
+#define OS_THREAD_PRIORITY_NORMAL 2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL 3
+
+#ifdef __WIN__
+typedef void* os_thread_t;
+typedef unsigned long os_thread_id_t; /*!< In Windows the thread id
+ is an unsigned long int */
+#else
+typedef pthread_t os_thread_t;
+typedef os_thread_t os_thread_id_t; /*!< In Unix we use the thread
+ handle itself as the id of
+ the thread */
+#endif
+
+/* Define a function pointer type to use in a typecast */
+typedef void* (*os_posix_f_t) (void*);
+
+/***************************************************************//**
+Compares two thread ids for equality.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+ os_thread_id_t a, /*!< in: OS thread or thread id */
+ os_thread_id_t b); /*!< in: OS thread or thread id */
+/****************************************************************//**
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though!
+@return thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+ os_thread_id_t a); /*!< in: OS thread identifier */
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns a ulint.
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always exit through that function and never by returning
+from its start function (see the sketch below os_thread_exit()).
+@return handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create(
+/*=============*/
+#ifndef __WIN__
+ os_posix_f_t start_f,
+#else
+ ulint (*start_f)(void*), /*!< in: pointer to function
+ from which to start */
+#endif
+ void* arg, /*!< in: argument to start
+ function */
+ os_thread_id_t* thread_id); /*!< out: id of the created
+ thread, or NULL */
+
+/*****************************************************************//**
+Exits the current thread. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+ void* exit_value); /*!< in: exit value; in Windows this void*
+ is cast as a DWORD */
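+/* Usage sketch (illustrative only; the non-Windows start function signature
+is shown): a start function must terminate through os_thread_exit(), as
+noted above. 'my_worker' and 'my_arg' are hypothetical names.
+
+	static
+	void*
+	my_worker(void* arg)
+	{
+		... do the work using arg ...
+
+		os_thread_exit(NULL);
+
+		return(NULL);	.. never reached ..
+	}
+
+	os_thread_id_t	id;
+
+	os_thread_create(my_worker, my_arg, &id);
+*/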
+/*****************************************************************//**
+Returns the thread identifier of current thread.
+@return current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void);
+/*========================*/
+/*****************************************************************//**
+Returns handle to the current thread.
+@return current thread handle */
+UNIV_INTERN
+os_thread_t
+os_thread_get_curr(void);
+/*====================*/
+/*****************************************************************//**
+Advises the OS to give up the remainder of the thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void);
+/*=================*/
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+ ulint tm); /*!< in: time in microseconds */
+/******************************************************************//**
+Gets a thread priority.
+@return priority */
+UNIV_INTERN
+ulint
+os_thread_get_priority(
+/*===================*/
+ os_thread_t handle);/*!< in: OS handle to the thread */
+/******************************************************************//**
+Sets a thread priority. */
+UNIV_INTERN
+void
+os_thread_set_priority(
+/*===================*/
+ os_thread_t handle, /*!< in: OS handle to the thread */
+	ulint		pri);	/*!< in: priority: one of
+				OS_THREAD_PRIORITY_... */
+/******************************************************************//**
+Gets the last operating system error code for the calling thread.
+@return last error on Windows, 0 otherwise */
+UNIV_INTERN
+ulint
+os_thread_get_last_error(void);
+/*==========================*/
+
+#ifndef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0thread.ic b/storage/xtradb/include/os0thread.ic
new file mode 100644
index 00000000000..f89bc40b4fa
--- /dev/null
+++ b/storage/xtradb/include/os0thread.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.ic
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/page0cur.h b/storage/xtradb/include/page0cur.h
new file mode 100644
index 00000000000..6b444b3dd96
--- /dev/null
+++ b/storage/xtradb/include/page0cur.h
@@ -0,0 +1,362 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "univ.i"
+
+#include "buf0types.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+
+#define PAGE_CUR_ADAPT
+
+/* Page cursor search modes; the values must be in this order! */
+
+#define PAGE_CUR_UNSUPP 0
+#define PAGE_CUR_G 1
+#define PAGE_CUR_GE 2
+#define PAGE_CUR_L 3
+#define PAGE_CUR_LE 4
+/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in
+ "column LIKE 'abc%' ORDER BY column DESC";
+ we have to find strings which are <= 'abc' or
+ which extend it */
+#ifdef UNIV_SEARCH_DEBUG
+# define PAGE_CUR_DBG 6 /* As PAGE_CUR_LE, but skips search shortcut */
+#endif /* UNIV_SEARCH_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the compressed page descriptor of the block where the
+cursor is positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur); /*!< in: page cursor */
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur); /*!< in: page cursor */
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur) page_align((cur)->rec)
+# define page_cur_get_block(cur) (cur)->block
+# define page_cur_get_page_zip(cur) buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur) (cur)->rec
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur); /*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Invalidates a page cursor by setting the record pointer NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur); /*!< out: page cursor */
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; must not be after last */
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur); /*!< in/out: cursor; not before first */
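+/* Usage sketch (illustrative only): visiting every user record on an index
+page with a page cursor; 'block' is assumed to be an index page that the
+caller has already latched.
+
+	page_cur_t	cur;
+
+	page_cur_set_before_first(block, &cur);
+	page_cur_move_to_next(&cur);	.. step over the infimum ..
+
+	while (!page_cur_is_after_last(&cur)) {
+		rec_t*	rec = page_cur_get_rec(&cur);
+
+		.. process rec ..
+
+		page_cur_move_to_next(&cur);
+	}
+*/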
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available, NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available, NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to the page cursor on an uncompressed page.
+Returns a pointer to the inserted record if it succeeds, i.e., if enough
+space is available, NULL otherwise. The cursor stays at the same position.
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ rec_t* current_rec,/*!< in: pointer to current record after
+ which the new record is inserted */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to the page cursor on a compressed page, updating
+both the compressed and the uncompressed copy. Returns a pointer to the
+inserted record if it succeeds, i.e., if enough space is available, NULL
+otherwise. The cursor stays at the same position.
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ rec_t** current_rec,/*!< in/out: pointer to current record after
+ which the new record is inserted */
+ buf_block_t* block, /*!< in: buffer block of *current_rec */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle, or NULL */
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+ page_t* new_page, /*!< in/out: index page to copy to */
+ rec_t* rec, /*!< in: first record to copy */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor);/*!< out: page cursor */
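+/* Usage sketch (illustrative only): positioning a cursor with PAGE_CUR_LE,
+i.e. on the largest record not exceeding 'tuple'; 'block', 'index' and
+'tuple' are assumed to be supplied by the caller.
+
+	page_cur_t	cur;
+
+	page_cur_search(block, index, tuple, PAGE_CUR_LE, &cur);
+*/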
+/****************************************************************//**
+Searches the right position for a page cursor. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor);/*!< out: page cursor */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor);/*!< out: page cursor */
+
+/***********************************************************//**
+Positions a page cursor on the nth user record on a page. */
+UNIV_INTERN
+void
+page_cur_open_on_nth_user_rec(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor,	/*!< out: page cursor */
+	ulint		nth);	/*!< in: ordinal position of the
+				user record */
+
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record located after the
+nth user record on a page. */
+UNIV_INTERN
+ibool
+page_cur_open_on_rnd_user_rec_after_nth(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor, /*!< out: page cursor */
+	ulint		nth);	/*!< in: ordinal position of the
+				user record */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a log record of a record insert on a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+ ibool is_short,/*!< in: TRUE if short inserts */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/**********************************************************//**
+Parses a log record of copying a record list end to a newly created page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a log record of a record delete on a page.
+@return pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+
+/** Index page cursor */
+
+struct page_cur_struct{
+ byte* rec; /*!< pointer to a record on page */
+ buf_block_t* block; /*!< pointer to the block containing rec */
+};
+
+#ifndef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic
new file mode 100644
index 00000000000..3520677dfb3
--- /dev/null
+++ b/storage/xtradb/include/page0cur.ic
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0page.h"
+#include "buf0types.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+
+ return(page_align(cur->rec));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(cur->block);
+}
+
+/*********************************************************//**
+Gets pointer to the compressed page descriptor of the block where the
+cursor is positioned.
+@return compressed page descriptor, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+ page_cur_t* cur) /*!< in: page cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+
+ return(cur->rec);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record
+on the page. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on
+the page. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+ const buf_block_t* block, /*!< in: index page */
+ page_cur_t* cur) /*!< in: cursor */
+{
+ cur->block = (buf_block_t*) block;
+ cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+ const page_cur_t* cur) /*!< in: cursor */
+{
+ ut_ad(cur);
+ ut_ad(page_align(cur->rec) == cur->block->frame);
+ return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+ const rec_t* rec, /*!< in: record on a page */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(rec && block && cur);
+ ut_ad(page_align(rec) == block->frame);
+
+ cur->rec = (rec_t*) rec;
+ cur->block = (buf_block_t*) block;
+}
+
+/**********************************************************//**
+Invalidates a page cursor by setting the record pointer NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+ page_cur_t* cur) /*!< out: page cursor */
+{
+ ut_ad(cur);
+
+ cur->rec = NULL;
+ cur->block = NULL;
+}
+
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: cursor; must not be after last */
+{
+ ut_ad(!page_cur_is_after_last(cur));
+
+ cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+ page_cur_t* cur) /*!< in/out: page cursor, not before first */
+{
+ ut_ad(!page_cur_is_before_first(cur));
+
+ cur->rec = page_rec_get_prev(cur->rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint low_matched_fields = 0;
+ ulint low_matched_bytes = 0;
+ ulint up_matched_fields = 0;
+ ulint up_matched_bytes = 0;
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ page_cur_search_with_match(block, index, tuple, mode,
+ &up_matched_fields,
+ &up_matched_bytes,
+ &low_matched_fields,
+ &low_matched_bytes,
+ cursor);
+ return(low_matched_fields);
+}
+
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available, NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const dtuple_t* tuple, /*!< in: pointer to a data tuple */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ mem_heap_t* heap;
+ ulint* offsets;
+ ulint size
+ = rec_get_converted_size(index, tuple, n_ext);
+ rec_t* rec;
+
+ heap = mem_heap_create(size
+ + (4 + REC_OFFS_HEADER_SIZE
+ + dtuple_get_n_fields(tuple))
+ * sizeof *offsets);
+ rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(heap, size),
+ index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ if (buf_block_get_page_zip(cursor->block)) {
+ rec = page_cur_insert_rec_zip(&cursor->rec, cursor->block,
+ index, rec, offsets, mtr);
+ } else {
+ rec = page_cur_insert_rec_low(cursor->rec,
+ index, rec, offsets, mtr);
+ }
+
+ mem_heap_free(heap);
+ return(rec);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inserts a record next to the page cursor. Returns a pointer to the inserted
+record if it succeeds, i.e., if enough space is available, NULL otherwise.
+The cursor stays at the same logical position, but the physical position
+may change if it is pointing to a compressed page that was reorganized.
+@return pointer to record if it succeeds, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ const rec_t* rec, /*!< in: record to insert */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ if (buf_block_get_page_zip(cursor->block)) {
+ return(page_cur_insert_rec_zip(&cursor->rec, cursor->block,
+ index, rec, offsets, mtr));
+ } else {
+ return(page_cur_insert_rec_low(cursor->rec,
+ index, rec, offsets, mtr));
+ }
+}
diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h
new file mode 100644
index 00000000000..5b2bcf7c054
--- /dev/null
+++ b/storage/xtradb/include/page0page.h
@@ -0,0 +1,1015 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "data0data.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/* PAGE HEADER
+ ===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef byte page_header_t;
+
+#define PAGE_HEADER FSEG_PAGE_DATA /* index page header starts at this
+ offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0 /* number of slots in page directory */
+#define PAGE_HEAP_TOP 2 /* pointer to record heap top */
+#define PAGE_N_HEAP 4 /* number of records in the heap,
+ bit 15=flag: new-style compact page format */
+#define PAGE_FREE 6 /* pointer to start of page free record list */
+#define PAGE_GARBAGE 8 /* number of bytes in deleted records */
+#define PAGE_LAST_INSERT 10 /* pointer to the last inserted record, or
+ NULL if this info has been reset by a delete,
+ for example */
+#define PAGE_DIRECTION 12 /* last insert direction: PAGE_LEFT, ... */
+#define PAGE_N_DIRECTION 14 /* number of consecutive inserts to the same
+ direction */
+#define PAGE_N_RECS 16 /* number of user records on the page */
+#define PAGE_MAX_TRX_ID 18 /* highest id of a trx which may have modified
+ a record on the page; a dulint; defined only
+ in secondary indexes and in the insert buffer
+ tree; NOTE: this may be modified only
+ when the thread has an x-latch to the page,
+ and ALSO an x-latch to btr_search_latch
+ if there is a hash index to the page! */
+#define PAGE_HEADER_PRIV_END 26	/* end of the private data structures of the
+					page header which are set when a page is
+					created */
+/*----*/
+#define PAGE_LEVEL 26 /* level of the node in an index tree; the
+ leaf level is the level 0. This field should
+ not be written to after page creation. */
+#define PAGE_INDEX_ID 28 /* index id where the page belongs.
+ This field should not be written to after
+ page creation. */
+#define PAGE_BTR_SEG_LEAF 36 /* file segment header for the leaf pages in
+ a B-tree: defined only on the root page of a
+ B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+ /* in the place of PAGE_BTR_SEG_LEAF and _TOP
+ there is a free list base node if the page is
+ the root page of an ibuf tree, and at the same
+ place is the free list node if the page is in
+ a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+ /* file segment header for the non-leaf pages
+ in a B-tree: defined only on the root page of
+ a B-tree, but not in the root of an ibuf
+ tree */
+/*----*/
+#define PAGE_DATA (PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+ /* start of data on the page */
+
+#define PAGE_OLD_INFIMUM (PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+ /* offset of the page infimum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM (PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on an
+ old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+ /* offset of the page supremum record end on
+ an old-style page */
+#define PAGE_NEW_INFIMUM (PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+ /* offset of the page infimum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM (PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+ /* offset of the page supremum record on a
+ new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+ /* offset of the page supremum record end on
+ a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM 0 /* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM 1 /* page supremum */
+#define PAGE_HEAP_NO_USER_LOW 2 /* first user record in
+ creation (insertion) order,
+ not necessarily collation order;
+ this record may have been deleted */
+
+/* Directions of cursor movement */
+#define PAGE_LEFT 1
+#define PAGE_RIGHT 2
+#define PAGE_SAME_REC 3
+#define PAGE_SAME_PAGE 4
+#define PAGE_NO_DIRECTION 5
+
+/* PAGE DIRECTORY
+ ==============
+*/
+
+typedef byte page_dir_slot_t;
+typedef page_dir_slot_t page_dir_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address the directory start, as it points to
+the first record in the list of records. */
+#define PAGE_DIR FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+#define PAGE_DIR_SLOT_SIZE 2
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START (PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED 8
+#define PAGE_DIR_SLOT_MIN_N_OWNED 4
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+ __attribute__((const));
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+ __attribute__((const));
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Sets the max trx id field value. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in/out: mini-transaction */
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field); /*!< in: PAGE_N_DIR_SLOTS, ... */
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */
+ ulint val); /*!< in: value */
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+ __attribute__((nonnull, pure));
+
+/*************************************************************//**
+Returns the pointer stored in the given header field, or NULL. */
+#define page_header_get_ptr(page, field) \
+ (page_header_get_offs(page, field) \
+ ? page + page_header_get_offs(page, field) : NULL)
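+
+/* Usage sketch (illustrative only): reading a few header fields through the
+accessors above; 'page' is assumed to point to an index page frame.
+
+	ulint	n_recs	 = page_header_get_field(page, PAGE_N_RECS);
+	ulint	n_slots	 = page_header_get_field(page, PAGE_N_DIR_SLOTS);
+	byte*	free_rec = page_header_get_ptr(page, PAGE_FREE);
+					.. NULL if the free list is empty ..
+*/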
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in/out: PAGE_FREE, ... */
+ const byte* ptr); /*!< in: pointer or NULL*/
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /*!< in: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ mtr_t* mtr); /*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+ const page_t* page); /*!< in: page which must have record(s) */
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+ const page_t* page); /*!< in: page which must have record(s) */
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+/************************************************************//**
+Returns the middle record of the record list. If there is an even number
+of records in the list, returns the first record of the upper half.
+@return middle record */
+UNIV_INTERN
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page); /*!< in: page */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes); /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+#endif /* !UNIV_HOTBACKUP */
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page); /*!< in: page */
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+Returns the number of records before the given record in the chain.
+The number includes the infimum and supremum records.
+@return number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec); /*!< in: the physical record */
+/*************************************************************//**
+Gets the number of records in the record heap (including the infimum and
+supremum records).
+@return number of records in the heap */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap);/*!< in: number of records */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page); /*!< in: index page */
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots);/*!< in: number of slots */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n); /*!< in: position */
+#else /* UNIV_DEBUG */
+# define page_dir_get_nth_slot(page, n) \
+ ((page) + UNIV_PAGE_SIZE - PAGE_DIR \
+ - (n + 1) * PAGE_DIR_SLOT_SIZE)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot); /*!< in: directory slot */
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec); /*!< in: record on the page */
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot); /*!< in: page directory slot */
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n); /*!< in: number of records owned by the slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is the fraction
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED,
+rounded up to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs); /*!< in: number of records */
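+/* Example: with PAGE_DIR_SLOT_SIZE == 2 and PAGE_DIR_SLOT_MIN_N_OWNED == 4,
+100 records reserve (100 * 2) / 4 == 50 bytes for directory slots, and
+101 records reserve 51 bytes (202 / 4 == 50.5, rounded up). */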
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec); /*!< in: record */
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec); /*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return TRUE if the page is a B-tree leaf */
+UNIV_INLINE
+ibool
+page_is_leaf(
+/*=========*/
+ const page_t* page) /*!< in: page */
+ __attribute__((pure));
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp); /*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record */
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /*!< in: pointer to record,
+ must not be page supremum */
+ rec_t* next); /*!< in: pointer to next record,
+ must not be page infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec); /*!< in: pointer to record, must not be page
+ infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec); /*!< in: pointer to record,
+ must not be page infimum */
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+ ulint offset) /*!< in: record offset on page */
+ __attribute__((const));
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ const rec_t* rec) /*!< in: record */
+ __attribute__((const));
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ rec_t* rec); /*!< in: the physical record */
+/***********************************************************************//**
+This is a low-level operation which is used in a database index creation
+to update the page number of a created B-tree to a data dictionary
+record. */
+UNIV_INTERN
+void
+page_rec_write_index_page_no(
+/*=========================*/
+ rec_t* rec, /*!< in: record to update */
+ ulint i, /*!< in: index of the field to update */
+ ulint page_no,/*!< in: value to write */
+ mtr_t* mtr); /*!< in: mtr */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if the page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs);/*!< in: number of records */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page format */
+ __attribute__((const));
+/**********************************************************//**
+Returns the base extra size of a physical record. This is the
+size of the fixed header, independent of the record size.
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+ const rec_t* rec); /*!< in: physical record */
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ const page_t* page); /*!< in: index page */
+/************************************************************//**
+Allocates a block of memory from the head of the free list
+of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need); /*!< in: number of bytes allocated */
+/************************************************************//**
+Allocates a block of memory from the heap of an index page.
+@return pointer to start of allocated buffer, or NULL if allocation fails */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ ulint need, /*!< in: total number of bytes needed */
+ ulint* heap_no);/*!< out: this contains the heap number
+ of the allocated record
+ if allocation succeeds */
+/************************************************************//**
+Puts a record to free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ rec_t* rec, /*!< in: pointer to the (origin of) record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Create an uncompressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ulint comp); /*!< in: nonzero=compact page format */
+/**********************************************************//**
+Create a compressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame where the
+ page is created */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr); /*!< in: mini-transaction handle */
+
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull));
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull(1, 2, 4, 5)));
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+@return TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+ __attribute__((nonnull(1, 2, 4, 5)));
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+ page_t* page, /*!< in: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be written, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+ __attribute__((nonnull(1)));
+/*************************************************************//**
+Tries to balance the given directory slot with too few records
+with the upper neighbor, so that there are at least the minimum number
+of records owned by the slot; this may result in the merging of
+two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+ __attribute__((nonnull(1)));
+/**********************************************************//**
+Parses a log record of a record list end or start deletion.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+ byte type, /*!< in: MLOG_LIST_END_DELETE,
+ MLOG_LIST_START_DELETE,
+ MLOG_COMP_LIST_END_DELETE or
+ MLOG_COMP_LIST_START_DELETE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of creating a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: record descriptor */
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n); /*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn); /*!< in: print rn first and last records
+				in the record list */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+ rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+ const page_t* page); /*!< in: index page */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+ page_t* page); /*!< in: old-style index page */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+ page_t* block); /*!< in: new-style index page */
+/***************************************************************//**
+This function checks the consistency of an index page.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+ page_t* page, /*!< in: index page */
+ dict_index_t* index); /*!< in: data dictionary index containing
+ the page record type definition */
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no);/*!< in: heap number */
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+#include "page0page.ic"
+#endif
+
+#endif
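The navigation declarations above treat an index page as a singly linked list
of records delimited by the infimum and supremum pseudo-records. A minimal
illustrative sketch of walking that list with this interface (the helper name
is made up here, and the caller is assumed to hold the usual page latch):

/* Sketch: count the user records by walking the record list from the
infimum; on a consistent page the result equals the PAGE_N_RECS field. */
static ulint
page_count_user_recs_sketch(const page_t* page)
{
	const rec_t*	rec = page_rec_get_next_const(
		page + page_get_infimum_offset(page));
	ulint		n = 0;

	while (!page_rec_is_supremum(rec)) {
		n++;
		rec = page_rec_get_next_const(rec);
	}

	return(n);
}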
diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic
new file mode 100644
index 00000000000..dab9dc742e4
--- /dev/null
+++ b/storage/xtradb/include/page0page.ic
@@ -0,0 +1,1076 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifdef UNIV_DEBUG
+# include "log0recv.h"
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+# include "rem0cmp.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/************************************************************//**
+Gets the start of a page.
+@return start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return((page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE));
+}
+/************************************************************//**
+Gets the offset within a page.
+@return offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+ const void* ptr) /*!< in: pointer to page frame */
+{
+ return(ut_align_offset(ptr, UNIV_PAGE_SIZE));
+}
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page);
+
+ return(mach_read_from_8(page + PAGE_HEADER + PAGE_MAX_TRX_ID));
+}
+
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ut_ad(block);
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ /* During crash recovery, this function may be called on
+ something else than a leaf page of a secondary index or the
+ insert buffer index tree (dict_index_is_sec_or_ibuf() returns
+ TRUE for the dummy indexes constructed during redo log
+ application). In that case, PAGE_MAX_TRX_ID is unused,
+ and trx_id is usually zero. */
+ ut_ad(!ut_dulint_is_zero(trx_id) || recv_recovery_is_on());
+ ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+ if (ut_dulint_cmp(page_get_max_trx_id(buf_block_get_frame(block)),
+ trx_id) < 0) {
+
+ page_set_max_trx_id(block, page_zip, trx_id, mtr);
+ }
+}
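/* Worked example of the guard above: if PAGE_MAX_TRX_ID currently holds 40,
a call with trx_id 42 rewrites the field to 42, while a later call with
trx_id 41 leaves it at 42; the helper only ever raises the stored value. */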
+
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_LEVEL, ... */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_INDEX_ID);
+
+ return(mach_read_from_2(page + PAGE_HEADER + field));
+}
+
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_N_DIR_SLOTS, ... */
+ ulint val) /*!< in: value */
+{
+ ut_ad(page);
+ ut_ad(field <= PAGE_N_RECS);
+ ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE);
+ ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE);
+
+ mach_write_to_2(page + PAGE_HEADER + field, val);
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_header(page_zip,
+ page + PAGE_HEADER + field, 2, NULL);
+ }
+}
+
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+ const page_t* page, /*!< in: page */
+ ulint field) /*!< in: PAGE_FREE, ... */
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ offs = page_header_get_field(page, field);
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ return(offs);
+}
+
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+ page_t* page, /*!< in: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint field, /*!< in: PAGE_FREE, ... */
+ const byte* ptr) /*!< in: pointer or NULL*/
+{
+ ulint offs;
+
+ ut_ad(page);
+ ut_ad((field == PAGE_FREE)
+ || (field == PAGE_LAST_INSERT)
+ || (field == PAGE_HEAP_TOP));
+
+ if (ptr == NULL) {
+ offs = 0;
+ } else {
+ offs = ptr - page;
+ }
+
+ ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+ page_header_set_field(page, page_zip, field, offs);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(page && mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_2(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_LAST_INSERT),
+ 2, mtr);
+ } else {
+ mlog_write_ulint(page + (PAGE_HEADER + PAGE_LAST_INSERT), 0,
+ MLOG_2BYTES, mtr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+ const page_t* page) /*!< in: index page */
+{
+ return(UNIV_EXPECT(page_header_get_field(page, PAGE_N_HEAP) & 0x8000,
+ 0x8000));
+}
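/* Worked example of the PAGE_N_HEAP encoding tested here: bit 0x8000 flags
the compact record format, and the low 15 bits count heap records (see
page_dir_get_n_heap() below, which masks with 0x7fff).  A stored value of
0x8003 therefore means a compact-format page whose heap holds three records:
the infimum, the supremum and one user record. */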
+
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_is_comp(page_align(rec)));
+}
+
+/***************************************************************//**
+Returns the heap number of a record.
+@return heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ if (page_rec_is_comp(rec)) {
+ return(rec_get_heap_no_new(rec));
+ } else {
+ return(rec_get_heap_no_old(rec));
+ }
+}
+
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return TRUE if the page is a B-tree leaf */
+UNIV_INLINE
+ibool
+page_is_leaf(
+/*=========*/
+ const page_t* page) /*!< in: page */
+{
+ if (!page) {
+ return(FALSE);
+ }
+ return(!*(const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL)));
+}
+
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+ const page_t* page) /*!< in: page which must have record(s) */
+{
+ ut_ad(page);
+ ut_ad(!page_offset(page));
+
+ if (page_is_comp(page)) {
+ return(PAGE_NEW_INFIMUM);
+ } else {
+ return(PAGE_OLD_INFIMUM);
+ }
+}
+
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+ const page_t* page) /*!< in: page which must have record(s) */
+{
+ ut_ad(page);
+ ut_ad(!page_offset(page));
+
+ if (page_is_comp(page)) {
+ return(PAGE_NEW_SUPREMUM);
+ } else {
+ return(PAGE_OLD_SUPREMUM);
+ }
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM
+# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM"
+#endif
+#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM
+# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM"
+#endif
+#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END
+# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END"
+#endif
+#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END
+# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END"
+#endif
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(UNIV_LIKELY(offset != PAGE_NEW_SUPREMUM)
+ && UNIV_LIKELY(offset != PAGE_NEW_INFIMUM)
+ && UNIV_LIKELY(offset != PAGE_OLD_INFIMUM)
+ && UNIV_LIKELY(offset != PAGE_OLD_SUPREMUM));
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM)
+ || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM));
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+ ulint offset) /*!< in: record offset on page */
+{
+ ut_ad(offset >= PAGE_NEW_INFIMUM);
+ ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+ return(UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM)
+ || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM));
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_rec_is_user_rec_low(page_offset(rec)));
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_rec_is_supremum_low(page_offset(rec)));
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+ const rec_t* rec) /*!< in: record */
+{
+ return(page_rec_is_infimum_low(page_offset(rec)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record on a page; may also
+ be page infimum or supremum, in which case
+ matched-parameter values below are not
+ affected */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns
+ contains the value for current comparison */
+ ulint* matched_bytes) /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns contains the
+ value for current comparison */
+{
+ ulint rec_offset;
+
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec));
+
+ rec_offset = page_offset(rec);
+
+ if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM)
+ || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) {
+ return(1);
+ }
+ if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM)
+ || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) {
+ return(-1);
+ }
+
+ return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ matched_fields,
+ matched_bytes));
+}
+#endif /* !UNIV_HOTBACKUP */
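/* Illustration of the sentinel behaviour above: for any dtuple the function
returns 1 when rec is the page infimum and -1 when it is the supremum,
without looking at the data.  A forward scan for the first record that
compares >= the tuple therefore stops at the supremum at the latest and
needs no separate end-of-list check. */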
+
+/*************************************************************//**
+Gets the page number.
+@return page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return(mach_read_from_4(page + FIL_PAGE_OFFSET));
+}
+
+/*************************************************************//**
+Gets the tablespace identifier.
+@return space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+ const page_t* page) /*!< in: page */
+{
+ ut_ad(page == page_align((page_t*) page));
+ return(mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
+
+/*************************************************************//**
+Gets the number of user records on page (infimum and supremum records
+are not user records).
+@return number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_RECS));
+}
+
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_DIR_SLOTS));
+}
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+ page_t* page, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ ulint n_slots)/*!< in: number of slots */
+{
+ page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots);
+}
+
+/*************************************************************//**
+Gets the number of records in the heap.
+@return number of user records */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+ const page_t* page) /*!< in: index page */
+{
+ return(page_header_get_field(page, PAGE_N_HEAP) & 0x7fff);
+}
+
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL.
+ Note that the size of the dense page directory
+ in the compressed page trailer is
+ n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+ ulint n_heap) /*!< in: number of records */
+{
+ ut_ad(n_heap < 0x8000);
+ ut_ad(!page_zip || n_heap
+ == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1);
+
+ page_header_set_field(page, page_zip, PAGE_N_HEAP, n_heap
+ | (0x8000
+ & page_header_get_field(page, PAGE_N_HEAP)));
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot.
+@return pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+ const page_t* page, /*!< in: index page */
+ ulint n) /*!< in: position */
+{
+ ut_ad(page_dir_get_n_slots(page) > n);
+
+ return((page_dir_slot_t*)
+ page + UNIV_PAGE_SIZE - PAGE_DIR
+ - (n + 1) * PAGE_DIR_SLOT_SIZE);
+}
+#endif /* UNIV_DEBUG */
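/* Worked example of the slot addressing above, assuming the usual constants
(UNIV_PAGE_SIZE = 16384, an 8-byte PAGE_DIR reserve at the page end and
PAGE_DIR_SLOT_SIZE = 2): slot 0 starts at byte offset 16384 - 8 - 1 * 2
= 16374 and slot 1 at 16384 - 8 - 2 * 2 = 16372, i.e. the sparse directory
grows downwards from the end of the page. */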
+
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return TRUE if the check succeeds */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+ const rec_t* rec) /*!< in: record */
+{
+ const page_t* page = page_align(rec);
+
+ ut_a(rec);
+
+ ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+ ut_a(page_offset(rec) >= PAGE_DATA);
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+ const page_dir_slot_t* slot) /*!< in: directory slot */
+{
+ return(page_align(slot) + mach_read_from_2(slot));
+}
+
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+ page_dir_slot_t* slot, /*!< in: directory slot */
+ rec_t* rec) /*!< in: record on the page */
+{
+ ut_ad(page_rec_check(rec));
+
+ mach_write_to_2(slot, page_offset(rec));
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+ const page_dir_slot_t* slot) /*!< in: page directory slot */
+{
+ const rec_t* rec = page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ return(rec_get_n_owned_new(rec));
+ } else {
+ return(rec_get_n_owned_old(rec));
+ }
+}
+
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+ page_dir_slot_t*slot, /*!< in/out: directory slot */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n) /*!< in: number of records owned by the slot */
+{
+ rec_t* rec = (rec_t*) page_dir_slot_get_rec(slot);
+ if (page_rec_is_comp(slot)) {
+ rec_set_n_owned_new(rec, page_zip, n);
+ } else {
+ ut_ad(!page_zip);
+ rec_set_n_owned_old(rec, n);
+ }
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+ ulint n_recs) /*!< in: number of records */
+{
+ return((PAGE_DIR_SLOT_SIZE * n_recs + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+ / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
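/* Worked example of the rounding above, assuming the customary constants
PAGE_DIR_SLOT_SIZE = 2 and PAGE_DIR_SLOT_MIN_N_OWNED = 4: for n_recs = 10
this reserves (2 * 10 + 4 - 1) / 4 = 5 bytes, i.e. half a byte of directory
space per record, rounded up to a whole number of bytes. */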
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+ const rec_t* rec, /*!< in: pointer to record */
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ ulint offs;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ offs = rec_get_next_offs(rec, comp);
+
+ if (UNIV_UNLIKELY(offs >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Next record offset is nonsensical %lu"
+ " in record at offset %lu\n"
+ "InnoDB: rec address %p, space id %lu, page %lu\n",
+ (ulong)offs, (ulong) page_offset(rec),
+ (void*) rec,
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page));
+ buf_page_print(page, 0);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(offs == 0)) {
+
+ return(NULL);
+ }
+
+ return(page + offs);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ return((rec_t*) page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record */
+{
+ return(page_rec_get_next_low(rec, page_rec_is_comp(rec)));
+}
+
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+ rec_t* rec, /*!< in: pointer to record,
+ must not be page supremum */
+ rec_t* next) /*!< in: pointer to next record,
+ must not be page infimum */
+{
+ ulint offs;
+
+ ut_ad(page_rec_check(rec));
+ ut_ad(!page_rec_is_supremum(rec));
+ ut_ad(rec != next);
+
+ ut_ad(!next || !page_rec_is_infimum(next));
+ ut_ad(!next || page_align(rec) == page_align(next));
+
+ if (UNIV_LIKELY(next != NULL)) {
+ offs = page_offset(next);
+ } else {
+ offs = 0;
+ }
+
+ if (page_rec_is_comp(rec)) {
+ rec_set_next_offs_new(rec, offs);
+ } else {
+ rec_set_next_offs_old(rec, offs);
+ }
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+ const rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ const page_dir_slot_t* slot;
+ ulint slot_no;
+ const rec_t* rec2;
+ const rec_t* prev_rec = NULL;
+ const page_t* page;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+
+ ut_ad(!page_rec_is_infimum(rec));
+
+ slot_no = page_dir_find_owner_slot(rec);
+
+ ut_a(slot_no != 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no - 1);
+
+ rec2 = page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, TRUE);
+ }
+ } else {
+ while (rec != rec2) {
+ prev_rec = rec2;
+ rec2 = page_rec_get_next_low(rec2, FALSE);
+ }
+ }
+
+ ut_a(prev_rec);
+
+ return(prev_rec);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record.
+@return pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+ rec_t* rec) /*!< in: pointer to record, must not be page
+ infimum */
+{
+ return((rec_t*) page_rec_get_prev_const(rec));
+}
+
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+ rec_t* rec) /*!< in: the physical record */
+{
+ ut_ad(page_rec_check(rec));
+
+ if (page_rec_is_comp(rec)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+ rec = page_rec_get_next(rec);
+ }
+ }
+
+ return(rec);
+}
+
+/**********************************************************//**
+Returns the base extra size of a physical record. This is the
+size of the fixed header, independent of the record size.
+@return REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES
+# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES"
+#endif
+ return(REC_N_NEW_EXTRA_BYTES + (ulint) !page_rec_is_comp(rec));
+}
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records.
+@return data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint ret;
+
+ ret = (ulint)(page_header_get_field(page, PAGE_HEAP_TOP)
+ - (page_is_comp(page)
+ ? PAGE_NEW_SUPREMUM_END
+ : PAGE_OLD_SUPREMUM_END)
+ - page_header_get_field(page, PAGE_GARBAGE));
+
+ ut_ad(ret < UNIV_PAGE_SIZE);
+
+ return(ret);
+}
+
+
+/************************************************************//**
+Allocates a block of memory from the free list of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ rec_t* next_rec,/*!< in: pointer to the new head of the
+ free record list */
+ ulint need) /*!< in: number of bytes allocated */
+{
+ ulint garbage;
+
+#ifdef UNIV_DEBUG
+ const rec_t* old_rec = page_header_get_ptr(page, PAGE_FREE);
+ ulint next_offs;
+
+ ut_ad(old_rec);
+ next_offs = rec_get_next_offs(old_rec, page_is_comp(page));
+ ut_ad(next_rec == (next_offs ? page + next_offs : NULL));
+#endif
+
+ page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+ ut_ad(garbage >= need);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE, garbage - need);
+}
+
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+ ulint comp) /*!< in: nonzero=compact page layout */
+{
+ if (UNIV_LIKELY(comp)) {
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_NEW_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+ }
+
+ return((ulint)(UNIV_PAGE_SIZE
+ - PAGE_OLD_SUPREMUM_END
+ - PAGE_DIR
+ - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/************************************************************//**
+Each user record on the page, and also each deleted user record still in the
+heap, costs its own size plus PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED
+bytes of reserved directory space. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ if (page_is_comp(page)) {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_NEW_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(TRUE);
+ } else {
+ occupied = page_header_get_field(page, PAGE_HEAP_TOP)
+ - PAGE_OLD_SUPREMUM_END
+ + page_dir_calc_reserved_space(
+ n_recs + page_dir_get_n_heap(page) - 2);
+
+ free_space = page_get_free_space_of_empty(FALSE);
+ }
+
+ /* Above the 'n_recs +' part reserves directory space for the new
+ inserted records; the '- 2' excludes page infimum and supremum
+ records */
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized.
+@return maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+ const page_t* page, /*!< in: index page */
+ ulint n_recs) /*!< in: number of records */
+{
+ ulint occupied;
+ ulint free_space;
+
+ occupied = page_get_data_size(page)
+ + page_dir_calc_reserved_space(n_recs + page_get_n_recs(page));
+
+ free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+ if (occupied > free_space) {
+
+ return(0);
+ }
+
+ return(free_space - occupied);
+}
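/* Illustrative sketch of how the two bounds above are typically combined
when deciding whether compacting the page could avoid a split; the helper
name is made up for this example. */
static ibool
page_insert_would_fit_sketch(
/*=========================*/
	const page_t*	page,		/*!< in: index page */
	ulint		rec_size)	/*!< in: size of the record to add */
{
	if (page_get_max_insert_size(page, 1) >= rec_size) {

		return(TRUE);	/* fits as the page stands */
	}

	/* Reorganizing the page reclaims the PAGE_GARBAGE space, so this
	second bound can be larger than the first one. */
	return(page_get_max_insert_size_after_reorganize(page, 1)
	       >= rec_size);
}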
+
+/************************************************************//**
+Puts a record to free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ rec_t* rec, /*!< in: pointer to the (origin of) record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ rec_t* free;
+ ulint garbage;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ free = page_header_get_ptr(page, PAGE_FREE);
+
+ page_rec_set_next(rec, free);
+ page_header_set_ptr(page, page_zip, PAGE_FREE, rec);
+
+ garbage = page_header_get_field(page, PAGE_GARBAGE);
+
+ page_header_set_field(page, page_zip, PAGE_GARBAGE,
+ garbage + rec_offs_size(offsets));
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_dir_delete(page_zip, rec, index, offsets, free);
+ } else {
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ page_get_n_recs(page) - 1);
+ }
+}
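/* Worked example of the free-list bookkeeping above, for an uncompressed
page: freeing a record whose rec_offs_size() is 20 bytes links it in at the
head of PAGE_FREE and raises PAGE_GARBAGE by 20; a later
page_mem_alloc_free() that reuses those 20 bytes pops the record off
PAGE_FREE again and lowers PAGE_GARBAGE by the same amount. */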
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/xtradb/include/page0types.h b/storage/xtradb/include/page0types.h
new file mode 100644
index 00000000000..49fe9d6abbe
--- /dev/null
+++ b/storage/xtradb/include/page0types.h
@@ -0,0 +1,151 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "mtr0types.h"
+
+/** Eliminates a name collision on HP-UX */
+#define page_t ib_page_t
+/** Type of the index page */
+typedef byte page_t;
+/** Index page cursor */
+typedef struct page_cur_struct page_cur_t;
+
+/** Compressed index page */
+typedef byte page_zip_t;
+/** Compressed page descriptor */
+typedef struct page_zip_des_struct page_zip_des_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** log2 of smallest compressed page size */
+#define PAGE_ZIP_MIN_SIZE_SHIFT 10
+/** Smallest compressed page size */
+#define PAGE_ZIP_MIN_SIZE (1 << PAGE_ZIP_MIN_SIZE_SHIFT)
+
+/** Number of supported compressed page sizes */
+#define PAGE_ZIP_NUM_SSIZE (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
+#define PAGE_ZIP_NUM_SSIZE_MAX (UNIV_PAGE_SIZE_SHIFT_MAX - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
+#if PAGE_ZIP_NUM_SSIZE_MAX > (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_NUM_SSIZE_MAX > (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/** Compressed page descriptor */
+struct page_zip_des_struct
+{
+ page_zip_t* data; /*!< compressed page data */
+
+#ifdef UNIV_DEBUG
+ unsigned m_start:16; /*!< start offset of modification log */
+#endif /* UNIV_DEBUG */
+ unsigned m_end:16; /*!< end offset of modification log */
+ unsigned m_nonempty:1; /*!< TRUE if the modification log
+ is not empty */
+ unsigned n_blobs:12; /*!< number of externally stored
+ columns on the page; the maximum
+ is 744 on a 16 KiB page */
+ unsigned ssize:PAGE_ZIP_SSIZE_BITS;
+ /*!< 0 or compressed page size;
+ the size in bytes is
+ PAGE_ZIP_MIN_SIZE << (ssize - 1). */
+};
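/* Illustrative sketch of the ssize encoding documented above: 0 means the
page is not compressed, otherwise the size in bytes is
PAGE_ZIP_MIN_SIZE << (ssize - 1), so ssize values 1..5 stand for 1, 2, 4, 8
and 16 KiB.  page_zip_get_size() in page0zip.h is the real accessor; the
helper name below is made up. */
static ulint
page_zip_ssize_to_bytes_sketch(ulint ssize)
{
	return(ssize ? (ulint) PAGE_ZIP_MIN_SIZE << (ssize - 1) : 0);
}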
+
+/** Compression statistics for a given page size */
+struct page_zip_stat_struct {
+ /** Number of page compressions */
+ ulint compressed;
+ /** Number of successful page compressions */
+ ulint compressed_ok;
+ /** Number of page decompressions */
+ ulint decompressed;
+ /** Duration of page compressions in microseconds */
+ ib_uint64_t compressed_usec;
+ /** Duration of page decompressions in microseconds */
+ ib_uint64_t decompressed_usec;
+};
+
+/** Compression statistics */
+typedef struct page_zip_stat_struct page_zip_stat_t;
+
+/** Statistics on compression, indexed by page_zip_des_struct::ssize - 1 */
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Shift the dense page directory when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of the free list */
+ __attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+ __attribute__((nonnull));
+#endif
diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h
new file mode 100644
index 00000000000..4d37302ed20
--- /dev/null
+++ b/storage/xtradb/include/page0zip.h
@@ -0,0 +1,475 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "mtr0types.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+ __attribute__((nonnull, pure));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size); /*!< in: size in bytes */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+ __attribute__((const));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+ __attribute__((const));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip); /*!< in/out: compressed page
+ descriptor */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+ page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,3)));
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+ __attribute__((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip); /*!< in: compressed page
+ descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+ __attribute__((nonnull));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page) /*!< in: uncompressed page */
+ __attribute__((nonnull));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+ __attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page. */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write an entire record on the compressed page. The data must already
+have been written to the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint create) /*!< in: nonzero=insert, zero=update */
+ __attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in: mini-transaction handle,
+ or NULL if no logging is needed */
+ __attribute__((nonnull(1,2,3,4)));
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint trx_id_col,/*!< in: column number of TRX_ID in rec */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ roll_ptr_t roll_ptr)/*!< in: roll_ptr */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+ __attribute__((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* prev_rec,/*!< in: record after which to insert */
+ const byte* free_rec,/*!< in: record from which rec was
+ allocated, or NULL */
+ byte* rec); /*!< in: record to insert */
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: deleted record */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of the free list */
+ __attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+ __attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing to the header of a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_low(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure, but page will be overwritten. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+ page_zip_des_t* page_zip, /*!< out: copy of src_zip
+ (n_blobs, m_start, m_end,
+ m_nonempty, data[0..size-1]) */
+ page_t* page, /*!< out: copy of src */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+ __attribute__((nonnull(1,2,3,4)));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parses a log record of compressing an index page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< out: compressed page */
+ __attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+@return page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+ const void* data, /*!< in: compressed page */
+ ulint size) /*!< in: size of compressed page */
+ __attribute__((nonnull));
+
+#ifndef UNIV_HOTBACKUP
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (buf_frame_get_page_zip(ptr) == (page_zip))
+#else /* !UNIV_HOTBACKUP */
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr pointer to an uncompressed page frame
+@param page_zip compressed page descriptor
+@return TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip) \
+ (page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+
+#endif /* page0zip_h */
diff --git a/storage/xtradb/include/page0zip.ic b/storage/xtradb/include/page0zip.ic
new file mode 100644
index 00000000000..75cc7a9fcc4
--- /dev/null
+++ b/storage/xtradb/include/page0zip.ic
@@ -0,0 +1,397 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.ic
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "page0zip.h"
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list. The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page. The infimum and supremum records are
+excluded. The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored. We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header. These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+ - (heap_no - 1) << 1 (1..2 bytes)
+ - extra bytes backwards
+ - data bytes
+- clear record:
+ - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+ - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+ - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+ - indexed by heap_no
+ - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+ - REC_NODE_PTR_SIZE for non-leaf pages
+ - 0 otherwise
+(8) dense page directory, stored backwards
+ - n_dense = n_heap - 2
+ - existing records in ascending collation order
+ - deleted records (free list) in link order
+*/
+
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE 2
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK 0x3fff
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED 0x4000
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL 0x8000
+
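+/* A minimal illustration (not part of the original source) of the two
+on-page encodings described above, using only the constants defined in
+this file; the helper names are hypothetical, and the two-byte form is
+assumed to store its high-order bits in the first byte. */
+#if 0
+/* Decode a 2-byte dense directory entry: the low 14 bits are the record
+offset, the two most significant bits are the 'owned' and 'deleted' flags. */
+static void
+page_zip_dir_slot_decode_example(
+ ulint slot, /* in: 2-byte directory entry */
+ ulint* offset, /* out: record offset on the page */
+ ibool* owned, /* out: nonzero if last record of a chain */
+ ibool* deleted) /* out: nonzero if delete-marked */
+{
+ *offset = slot & PAGE_ZIP_DIR_SLOT_MASK;
+ *owned = (slot & PAGE_ZIP_DIR_SLOT_OWNED) != 0;
+ *deleted = (slot & PAGE_ZIP_DIR_SLOT_DEL) != 0;
+}
+
+/* Encode a modification log key, (heap_no - 1) << 1 | clear_flag, in the
+variable-length format described above (0xxxxxxx for 0..127,
+1xxxxxxx xxxxxxxx for 0..32767); returns the number of bytes written. */
+static ulint
+page_zip_mlog_key_encode_example(
+ byte* b, /* out: 1..2 bytes */
+ ulint val) /* in: value to encode, at most 32767 */
+{
+ if (val < 128) {
+ b[0] = (byte) val;
+ return(1);
+ }
+
+ b[0] = (byte) (0x80 | (val >> 8));
+ b[1] = (byte) (val & 0xff);
+ return(2);
+}
+#endif
+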
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size;
+
+ if (UNIV_UNLIKELY(!page_zip->ssize)) {
+ return(0);
+ }
+
+ size = (PAGE_ZIP_MIN_SIZE >> 1) << page_zip->ssize;
+
+ ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+ ut_ad(size <= UNIV_PAGE_SIZE);
+
+ return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint size) /*!< in: size in bytes */
+{
+ if (size) {
+ int ssize;
+
+ ut_ad(ut_is_2pow(size));
+
+ for (ssize = 1; size > (ulint) (512 << ssize); ssize++) {
+ }
+
+ page_zip->ssize = ssize;
+ } else {
+ page_zip->ssize = 0;
+ }
+
+ ut_ad(page_zip_get_size(page_zip) == size);
+}
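+
+/* Usage sketch (illustrative only): with the 512 constant above, ssize
+encodes the page size as 512 << ssize, so power-of-two sizes 1024..16384
+map to ssize 1..5; assumes the default 16 KiB UNIV_PAGE_SIZE build and a
+hypothetical wrapper function. */
+#if 0
+static void
+page_zip_size_usage_example(void)
+{
+ page_zip_des_t pz;
+
+ page_zip_des_init(&pz);
+ page_zip_set_size(&pz, 8192);
+
+ ut_a(pz.ssize == 4); /* 512 << 4 == 8192 */
+ ut_a(page_zip_get_size(&pz) == 8192);
+}
+#endif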
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+ ulint rec_size, /*!< in: length of the record in bytes */
+ ulint comp, /*!< in: nonzero=compact format */
+ ulint n_fields, /*!< in: number of fields in the record;
+ ignored if zip_size == 0 */
+ ulint zip_size) /*!< in: compressed page size in bytes, or 0 */
+{
+ ut_ad(rec_size > (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE > REC_MAX_DATA_SIZE
+ if (UNIV_UNLIKELY(rec_size >= REC_MAX_DATA_SIZE)) {
+ return(TRUE);
+ }
+#endif
+
+ if (UNIV_UNLIKELY(zip_size)) {
+ ut_ad(comp);
+ /* On a compressed page, there is a two-byte entry in
+ the dense page directory for every record. But there
+ is no record header. There should be enough room for
+ one record on an empty leaf page. Subtract 1 byte for
+ the encoded heap number. Check also the available space
+ on the uncompressed page. */
+ return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2)
+ >= (page_zip_empty_size(n_fields, zip_size) - 1)
+ || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+ }
+
+ return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+ const page_zip_des_t* page_zip)/*!< in: compressed page descriptor */
+{
+ ut_ad(page_zip);
+ ut_ad(page_zip->data);
+ ut_ad(page_zip->ssize < PAGE_ZIP_NUM_SSIZE);
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+ ut_ad(page_zip->m_start <= page_zip->m_end);
+ ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+ ut_ad(page_zip->n_blobs
+ < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ulint
+page_zip_get_trailer_len(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint* entry_size)/*!< out: size of the uncompressed
+ portion of a user record */
+{
+ ulint uncompressed_size;
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ if (UNIV_UNLIKELY(!page_is_leaf(page_zip->data))) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + REC_NODE_PTR_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ } else if (UNIV_UNLIKELY(is_clust)) {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ ut_ad(!page_zip->n_blobs);
+ }
+
+ if (entry_size) {
+ *entry_size = uncompressed_size;
+ }
+
+ return((page_dir_get_n_heap(page_zip->data) - 2)
+ * uncompressed_size
+ + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust)/*!< in: TRUE if clustered index */
+{
+ ulint uncompressed_size;
+ ulint trailer_len;
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust,
+ &uncompressed_size);
+
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += uncompressed_size;
+
+ return((lint) page_zip_get_size(page_zip)
+ - trailer_len - page_zip->m_end
+ - (REC_N_NEW_EXTRA_BYTES - 2));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ ibool is_clust,/*!< in: TRUE if clustered index */
+ ulint length, /*!< in: combined size of the record */
+ ulint create) /*!< in: nonzero=add the record to
+ the heap */
+{
+ ulint uncompressed_size;
+ ulint trailer_len;
+
+ ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+ trailer_len = page_zip_get_trailer_len(page_zip, is_clust,
+ &uncompressed_size);
+
+ /* Subtract the fixed extra bytes and add the maximum
+ space needed for identifying the record (encoded heap_no). */
+ length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+ if (UNIV_UNLIKELY(create)) {
+ /* When a record is created, a pointer may be added to
+ the dense directory.
+ Likewise, space for the columns that will not be
+ compressed will be allocated from the page trailer.
+ Also the BLOB pointers will be allocated from there, but
+ we may as well count them in the length of the record. */
+
+ trailer_len += uncompressed_size;
+ }
+
+ return(UNIV_LIKELY(length
+ + trailer_len
+ + page_zip->m_end
+ < page_zip_get_size(page_zip)));
+}
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+ page_zip_des_t* page_zip) /*!< in/out: compressed page
+ descriptor */
+{
+ memset(page_zip, 0, sizeof *page_zip);
+}
+
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+ const byte* data,/*!< in: data on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr); /*!< in: mini-transaction */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page. The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_zip(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* str, /*!< in: address on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ ulint pos;
+
+ ut_ad(PAGE_ZIP_MATCH(str, page_zip));
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ pos = page_offset(str);
+
+ ut_ad(pos < PAGE_DATA);
+
+ memcpy(page_zip->data + pos, str, length);
+
+ /* The following would fail in page_cur_insert_rec_zip(). */
+ /* ut_ad(page_zip_validate(page_zip, str - pos)); */
+
+ if (UNIV_LIKELY_NULL(mtr)) {
+#ifndef UNIV_HOTBACKUP
+ page_zip_write_header_log(str, length, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/xtradb/include/pars0grm.h b/storage/xtradb/include/pars0grm.h
new file mode 100644
index 00000000000..3de233eed3a
--- /dev/null
+++ b/storage/xtradb/include/pars0grm.h
@@ -0,0 +1,236 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
+
+As a special exception, when this file is copied by Bison into a
+Bison output file, you may use that output file without restriction.
+This special exception was added by the Free Software Foundation
+in version 1.24 of Bison.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* A Bison parser, made by GNU Bison 1.875d. */
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_FIXBINARY_LIT = 261,
+ PARS_BLOB_LIT = 262,
+ PARS_NULL_LIT = 263,
+ PARS_ID_TOKEN = 264,
+ PARS_AND_TOKEN = 265,
+ PARS_OR_TOKEN = 266,
+ PARS_NOT_TOKEN = 267,
+ PARS_GE_TOKEN = 268,
+ PARS_LE_TOKEN = 269,
+ PARS_NE_TOKEN = 270,
+ PARS_PROCEDURE_TOKEN = 271,
+ PARS_IN_TOKEN = 272,
+ PARS_OUT_TOKEN = 273,
+ PARS_BINARY_TOKEN = 274,
+ PARS_BLOB_TOKEN = 275,
+ PARS_INT_TOKEN = 276,
+ PARS_INTEGER_TOKEN = 277,
+ PARS_FLOAT_TOKEN = 278,
+ PARS_CHAR_TOKEN = 279,
+ PARS_IS_TOKEN = 280,
+ PARS_BEGIN_TOKEN = 281,
+ PARS_END_TOKEN = 282,
+ PARS_IF_TOKEN = 283,
+ PARS_THEN_TOKEN = 284,
+ PARS_ELSE_TOKEN = 285,
+ PARS_ELSIF_TOKEN = 286,
+ PARS_LOOP_TOKEN = 287,
+ PARS_WHILE_TOKEN = 288,
+ PARS_RETURN_TOKEN = 289,
+ PARS_SELECT_TOKEN = 290,
+ PARS_SUM_TOKEN = 291,
+ PARS_COUNT_TOKEN = 292,
+ PARS_DISTINCT_TOKEN = 293,
+ PARS_FROM_TOKEN = 294,
+ PARS_WHERE_TOKEN = 295,
+ PARS_FOR_TOKEN = 296,
+ PARS_DDOT_TOKEN = 297,
+ PARS_READ_TOKEN = 298,
+ PARS_ORDER_TOKEN = 299,
+ PARS_BY_TOKEN = 300,
+ PARS_ASC_TOKEN = 301,
+ PARS_DESC_TOKEN = 302,
+ PARS_INSERT_TOKEN = 303,
+ PARS_INTO_TOKEN = 304,
+ PARS_VALUES_TOKEN = 305,
+ PARS_UPDATE_TOKEN = 306,
+ PARS_SET_TOKEN = 307,
+ PARS_DELETE_TOKEN = 308,
+ PARS_CURRENT_TOKEN = 309,
+ PARS_OF_TOKEN = 310,
+ PARS_CREATE_TOKEN = 311,
+ PARS_TABLE_TOKEN = 312,
+ PARS_INDEX_TOKEN = 313,
+ PARS_UNIQUE_TOKEN = 314,
+ PARS_CLUSTERED_TOKEN = 315,
+ PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+ PARS_ON_TOKEN = 317,
+ PARS_ASSIGN_TOKEN = 318,
+ PARS_DECLARE_TOKEN = 319,
+ PARS_CURSOR_TOKEN = 320,
+ PARS_SQL_TOKEN = 321,
+ PARS_OPEN_TOKEN = 322,
+ PARS_FETCH_TOKEN = 323,
+ PARS_CLOSE_TOKEN = 324,
+ PARS_NOTFOUND_TOKEN = 325,
+ PARS_TO_CHAR_TOKEN = 326,
+ PARS_TO_NUMBER_TOKEN = 327,
+ PARS_TO_BINARY_TOKEN = 328,
+ PARS_BINARY_TO_NUMBER_TOKEN = 329,
+ PARS_SUBSTR_TOKEN = 330,
+ PARS_REPLSTR_TOKEN = 331,
+ PARS_CONCAT_TOKEN = 332,
+ PARS_INSTR_TOKEN = 333,
+ PARS_LENGTH_TOKEN = 334,
+ PARS_SYSDATE_TOKEN = 335,
+ PARS_PRINTF_TOKEN = 336,
+ PARS_ASSERT_TOKEN = 337,
+ PARS_RND_TOKEN = 338,
+ PARS_RND_STR_TOKEN = 339,
+ PARS_ROW_PRINTF_TOKEN = 340,
+ PARS_COMMIT_TOKEN = 341,
+ PARS_ROLLBACK_TOKEN = 342,
+ PARS_WORK_TOKEN = 343,
+ PARS_UNSIGNED_TOKEN = 344,
+ PARS_EXIT_TOKEN = 345,
+ PARS_FUNCTION_TOKEN = 346,
+ PARS_LOCK_TOKEN = 347,
+ PARS_SHARE_TOKEN = 348,
+ PARS_MODE_TOKEN = 349,
+ NEG = 350
+ };
+#endif
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define NEG 350
+
+
+
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+extern YYSTYPE yylval;
+
+
+
diff --git a/storage/xtradb/include/pars0opt.h b/storage/xtradb/include/pars0opt.h
new file mode 100644
index 00000000000..42d956068f8
--- /dev/null
+++ b/storage/xtradb/include/pars0opt.h
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0sym.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes of the tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node); /*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp); /*!< in: expression or condition */
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node); /*!< in: select node */
+
+#ifndef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0opt.ic b/storage/xtradb/include/pars0opt.ic
new file mode 100644
index 00000000000..e0bb6bf1af2
--- /dev/null
+++ b/storage/xtradb/include/pars0opt.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.ic
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/pars0pars.h b/storage/xtradb/include/pars0pars.h
new file mode 100644
index 00000000000..fe5d76ebbb0
--- /dev/null
+++ b/storage/xtradb/include/pars0pars.h
@@ -0,0 +1,748 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef void* (*pars_user_func_cb_t)(void* arg, void* user_arg);
+
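+/* Minimal sketch of a callback matching pars_user_func_cb_t; the type of
+the first argument and the meaning of the return value depend on the
+statement that invokes it, as noted above. Illustrative only. */
+#if 0
+static void*
+pars_example_user_func(void* arg, void* user_arg)
+{
+ (void) arg; /* InnoDB-supplied argument; its type varies */
+ (void) user_arg; /* caller-supplied context */
+
+ return(NULL); /* return value is interpreted per statement */
+}
+#endif
+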
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int yydebug;
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+extern ibool pars_print_lexed;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant */
+extern sym_tab_t* pars_sym_tab_global;
+
+extern pars_res_word_t pars_to_char_token;
+extern pars_res_word_t pars_to_number_token;
+extern pars_res_word_t pars_to_binary_token;
+extern pars_res_word_t pars_binary_to_number_token;
+extern pars_res_word_t pars_substr_token;
+extern pars_res_word_t pars_replstr_token;
+extern pars_res_word_t pars_concat_token;
+extern pars_res_word_t pars_length_token;
+extern pars_res_word_t pars_instr_token;
+extern pars_res_word_t pars_sysdate_token;
+extern pars_res_word_t pars_printf_token;
+extern pars_res_word_t pars_assert_token;
+extern pars_res_word_t pars_rnd_token;
+extern pars_res_word_t pars_rnd_str_token;
+extern pars_res_word_t pars_count_token;
+extern pars_res_word_t pars_sum_token;
+extern pars_res_word_t pars_distinct_token;
+extern pars_res_word_t pars_binary_token;
+extern pars_res_word_t pars_blob_token;
+extern pars_res_word_t pars_int_token;
+extern pars_res_word_t pars_char_token;
+extern pars_res_word_t pars_float_token;
+extern pars_res_word_t pars_update_token;
+extern pars_res_word_t pars_asc_token;
+extern pars_res_word_t pars_desc_token;
+extern pars_res_word_t pars_open_token;
+extern pars_res_word_t pars_close_token;
+extern pars_res_word_t pars_share_token;
+extern pars_res_word_t pars_unique_token;
+extern pars_res_word_t pars_clustered_token;
+
+extern ulint pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT 0
+#define PARS_OUTPUT 1
+#define PARS_NOT_PARAM 2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str); /*!< in: SQL string */
+/*************************************************************//**
+Retrieves characters for the lexical analyzer. */
+UNIV_INTERN
+void
+pars_get_lex_chars(
+/*===============*/
+ char* buf, /*!< in/out: buffer where to copy */
+ int* result, /*!< out: number of characters copied or EOF */
+ int max_size); /*!< in: maximum number of characters which fit
+ in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+ const char* s); /*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg); /*!< in: first argument in the argument list */
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2); /*!< in: second argument or NULL for a unary
+ operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc); /*!< in: &pars_asc_token or &pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list); /*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node); /*!< in: function id node in the symbol
+ table */
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* consistent_read,/*!< in: NULL or
+ &pars_consistent_token */
+ order_node_t* order_by); /*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+ if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond); /*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select); /*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses a procedure parameter declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the parameter */
+ ulint param_type,
+ /*!< in: PARS_INPUT or PARS_OUTPUT */
+ pars_res_word_t* type); /*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part); /*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list); /*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args); /*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val); /*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func); /*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return open or close statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor); /*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node); /*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_unsigned, /*!< in: if not NULL, column
+ is of type UNSIGNED. */
+ void* is_not_null); /*!< in: if not NULL, column
+ is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs, /*!< in: list of column names */
+ void* not_fit_in_memory);/*!< in: a non-NULL pointer means that
+ this is a table which in simulations
+ should be simulated as not fitting
+ in memory; thread is put to sleep
+ to simulate disk accesses; NOTE that
+ this flag is not stored to the data
+ dictionary on disk, and the database
+ will forget about non-NULL value if
+ it has to reload the table definition
+ from disk */
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list); /*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ sym_node_t* param_list, /*!< in: parameter declaration list */
+ que_node_t* stat_list); /*!< in: statement list */
+
+/*************************************************************//**
+Parses a stored procedure call, when this is not within another stored
+procedure, that is, the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node); /*!< in: stored procedure name */
+/******************************************************************//**
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@return query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+ que_node_t* node, /*!< in: root node for an incomplete
+ query graph */
+ trx_t* trx, /*!< in: transaction handle */
+ mem_heap_t* heap); /*!< in: memory heap from which allocated */
+
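+/* Usage sketch: assuming the caller has built a statement node 'node' on
+the memory heap 'heap' within a started transaction 'trx', the node is
+typically completed into a runnable graph and then executed with the
+helpers declared in que0que.h; 'node', 'trx' and 'heap' are placeholders
+supplied by the caller:
+
+ que_thr_t* thr;
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+
+ que_run_threads(thr);
+*/
+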
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info); /*!< in, own: info struct */
+
+/****************************************************************//**
+Add bound literal. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype); /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str); /*!< in: string */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ lint val); /*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_BINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_dulint_literal(
+/*=========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ dulint val); /*!< in: value */
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_add_function(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg); /*!< in: user-supplied argument */
+
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* id); /*!< in: id */
+
+/****************************************************************//**
+Get user function with the given name.
+@return user func, or NULL if not found */
+UNIV_INTERN
+pars_user_func_t*
+pars_info_get_user_func(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: function name to find */
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name); /*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void);
+/*==================*/
+
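+/* Usage sketch for the binding functions above: a pars_info_t carries bound
+literals and ids into the parser, usually through que_eval_sql() declared in
+que0que.h. The table, column names and values below are illustrative only,
+and 'trx' stands for the caller's transaction handle:
+
+ pars_info_t* info = pars_info_create();
+ ulint err;
+
+ pars_info_add_str_literal(info, "name", "test/t1");
+ pars_info_add_int4_literal(info, "n", 42);
+
+ err = que_eval_sql(info,
+ "PROCEDURE P () IS\n"
+ "BEGIN\n"
+ "UPDATE t SET name = :name WHERE n = :n;\n"
+ "END;\n",
+ FALSE, trx);
+
+Because graph_owns_us is TRUE by default (see pars_info_struct below), the
+info struct is normally freed together with the query graph, so the caller
+does not call pars_info_free() itself in this pattern. */
+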
+/** Extra information supplied for pars_sql(). */
+struct pars_info_struct {
+ mem_heap_t* heap; /*!< our own memory heap */
+
+ ib_vector_t* funcs; /*!< user functions, or NULL
+ (pars_user_func_t*) */
+ ib_vector_t* bound_lits; /*!< bound literals, or NULL
+ (pars_bound_lit_t*) */
+ ib_vector_t* bound_ids; /*!< bound ids, or NULL
+ (pars_bound_id_t*) */
+
+ ibool graph_owns_us; /*!< if TRUE (which is the default),
+ que_graph_free() will free us */
+};
+
+/** User-supplied function and argument. */
+struct pars_user_func_struct {
+ const char* name; /*!< function name */
+ pars_user_func_cb_t func; /*!< function address */
+ void* arg; /*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_struct {
+ const char* name; /*!< name */
+ const void* address; /*!< address */
+ ulint length; /*!< length of data */
+ ulint type; /*!< type, e.g. DATA_FIXBINARY */
+ ulint prtype; /*!< precise type, e.g. DATA_UNSIGNED */
+};
+
+/** Bound identifier. */
+struct pars_bound_id_struct {
+ const char* name; /*!< name */
+ const char* id; /*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_struct{
+ int code; /*!< the token code for the reserved word from
+ pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_FUNC */
+ int func; /*!< token code of the function name */
+ ulint class; /*!< class of the function */
+ que_node_t* args; /*!< argument(s) of the function */
+ UT_LIST_NODE_T(func_node_t) cond_list;
+ /*!< list of comparison conditions; defined
+ only for comparison operator nodes except,
+ presently, for OPT_SCROLL_TYPE ones */
+ UT_LIST_NODE_T(func_node_t) func_node_list;
+ /*!< list of function nodes in a parsed
+ query graph */
+};
+
+/** An order-by node in a select */
+struct order_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ORDER */
+ sym_node_t* column; /*!< order-by column */
+ ibool asc; /*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node */
+struct proc_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_PROC */
+ sym_node_t* proc_id; /*!< procedure name symbol in the symbol
+ table of this same procedure */
+ sym_node_t* param_list; /*!< input and output parameters */
+ que_node_t* stat_list; /*!< statement list */
+ sym_tab_t* sym_tab; /*!< symbol table of this procedure */
+};
+
+/** elsif-element node */
+struct elsif_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ELSIF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** if-statement node */
+struct if_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_IF */
+ que_node_t* cond; /*!< if condition */
+ que_node_t* stat_list; /*!< statement list */
+ que_node_t* else_part; /*!< else-part statement list */
+ elsif_node_t* elsif_list; /*!< elsif element list */
+};
+
+/** while-statement node */
+struct while_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_WHILE */
+ que_node_t* cond; /*!< while condition */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** for-loop-statement node */
+struct for_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_FOR */
+ sym_node_t* loop_var; /*!< loop variable: this is the
+ dereferenced symbol from the
+ variable declarations, not the
+ symbol occurrence in the for loop
+ definition */
+ que_node_t* loop_start_limit;/*!< initial value of loop variable */
+ que_node_t* loop_end_limit; /*!< end value of loop variable */
+ lint loop_end_value; /*!< evaluated value for the end value:
+ it is calculated only when the loop
+ is entered, and will not change within
+ the loop */
+ que_node_t* stat_list; /*!< statement list */
+};
+
+/** exit statement node */
+struct exit_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node */
+struct return_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node */
+struct assign_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ASSIGNMENT */
+ sym_node_t* var; /*!< variable to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Column assignment node */
+struct col_assign_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_COL_ASSIGN */
+ sym_node_t* col; /*!< column to set */
+ que_node_t* val; /*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH 1 /*!< +, -, *, / */
+#define PARS_FUNC_LOGICAL 2 /*!< AND, OR, NOT */
+#define PARS_FUNC_CMP 3 /*!< comparison operators */
+#define PARS_FUNC_PREDEFINED 4 /*!< TO_NUMBER, SUBSTR, ... */
+#define PARS_FUNC_AGGREGATE 5 /*!< COUNT, DISTINCT, SUM */
+#define PARS_FUNC_OTHER 6 /*!< these are not real functions,
+ e.g., := */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0pars.ic b/storage/xtradb/include/pars0pars.ic
new file mode 100644
index 00000000000..ae6c13cd671
--- /dev/null
+++ b/storage/xtradb/include/pars0pars.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.ic
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/pars0sym.h b/storage/xtradb/include/pars0sym.h
new file mode 100644
index 00000000000..6d1a4b82414
--- /dev/null
+++ b/storage/xtradb/include/pars0sym.h
@@ -0,0 +1,244 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "dict0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap); /*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER the parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Also frees the SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val); /*!< in: integer value */
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len); /*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type); /*!< out: type of literal (PARS_*_LIT) */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab); /*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len); /*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name); /*!< in: name of bound id */
+
+/** Index of sym_node_struct::field_nos corresponding to the clustered index */
+#define SYM_CLUST_FIELD_NO 0
+/** Index of sym_node_struct::field_nos corresponding to a secondary index */
+#define SYM_SEC_FIELD_NO 1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+ SYM_VAR = 91, /*!< declared parameter or local
+ variable of a procedure */
+ SYM_IMPLICIT_VAR, /*!< storage for an intermediate result
+ of a calculation */
+ SYM_LIT, /*!< literal */
+ SYM_TABLE, /*!< database table name */
+ SYM_COLUMN, /*!< database table column name */
+ SYM_CURSOR, /*!< named cursor */
+ SYM_PROCEDURE_NAME, /*!< stored procedure name */
+ SYM_INDEX, /*!< database index name */
+ SYM_FUNCTION /*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_struct{
+ que_common_t common; /*!< node type:
+ QUE_NODE_SYMBOL */
+ /* NOTE: if the data field in 'common.val' is not NULL and the symbol
+ table node is not for a temporary column, the memory for the value has
+ been allocated from dynamic memory and it should be freed when the
+ symbol table is discarded */
+
+ /* 'alias' and 'indirection' are almost the same, but not quite.
+ 'alias' always points to the primary instance of the variable, while
+ 'indirection' does the same only if we should use the primary
+ instance's values for the node's data. This is usually the case, but
+ when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+ t WHERE id = x;"), we copy the values from the primary instance to
+ the cursor's instance so that they are fixed for the duration of the
+ cursor, and set 'indirection' to NULL. If we did not, the value of
+ 'x' could change between fetches and things would break horribly.
+
+ TODO: It would be cleaner to make 'indirection' a boolean field and
+ always use 'alias' to refer to the primary node. */
+
+ sym_node_t* indirection; /*!< pointer to
+ another symbol table
+ node which contains
+ the value for this
+ node, NULL otherwise */
+ sym_node_t* alias; /*!< pointer to
+ another symbol table
+ node for which this
+ node is an alias,
+ NULL otherwise */
+ UT_LIST_NODE_T(sym_node_t) col_var_list; /*!< list of table
+ columns or a list of
+ input variables for an
+ explicit cursor */
+ ibool copy_val; /*!< TRUE if a column
+ and its value should
+ be copied to dynamic
+ memory when fetched */
+ ulint field_nos[2]; /*!< if a column, in
+ the position
+ SYM_CLUST_FIELD_NO is
+ the field number in the
+ clustered index; in
+ the position
+ SYM_SEC_FIELD_NO
+ the field number in the
+ non-clustered index to
+ use first; if not found
+ from the index, then
+ ULINT_UNDEFINED */
+ ibool resolved; /*!< TRUE if the
+ meaning of a variable
+ or a column has been
+ resolved; for literals
+ this is always TRUE */
+ enum sym_tab_entry token_type; /*!< type of the
+ parsed token */
+ const char* name; /*!< name of an id */
+ ulint name_len; /*!< id name length */
+ dict_table_t* table; /*!< table definition
+ if a table id or a
+ column id */
+ ulint col_no; /*!< column number if a
+ column */
+ sel_buf_t* prefetch_buf; /*!< NULL, or a buffer
+ for cached column
+ values for prefetched
+ rows */
+ sel_node_t* cursor_def; /*!< cursor definition
+ select node if a
+ named cursor */
+ ulint param_type; /*!< PARS_INPUT,
+ PARS_OUTPUT, or
+ PARS_NOT_PARAM if not a
+ procedure parameter */
+ sym_tab_t* sym_table; /*!< back pointer to
+ the symbol table */
+ UT_LIST_NODE_T(sym_node_t) sym_list; /*!< list of symbol
+ nodes */
+};
+
+/** Symbol table */
+struct sym_tab_struct{
+ que_t* query_graph;
+ /*!< query graph generated by the
+ parser */
+ const char* sql_string;
+ /*!< SQL string to parse */
+ size_t string_len;
+ /*!< SQL string length */
+ int next_char_pos;
+ /*!< position of the next character in
+ sql_string to give to the lexical
+ analyzer */
+ pars_info_t* info; /*!< extra information, or NULL */
+ sym_node_list_t sym_list;
+ /*!< list of symbol nodes in the symbol
+ table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ func_node_list;
+ /*!< list of function nodes in the
+ parsed query graph */
+ mem_heap_t* heap; /*!< memory heap from which we can
+ allocate space */
+};
+
+#ifndef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0sym.ic b/storage/xtradb/include/pars0sym.ic
new file mode 100644
index 00000000000..9eb09db3a47
--- /dev/null
+++ b/storage/xtradb/include/pars0sym.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.ic
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/pars0types.h b/storage/xtradb/include/pars0types.h
new file mode 100644
index 00000000000..e0a8a86bf07
--- /dev/null
+++ b/storage/xtradb/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+typedef struct pars_info_struct pars_info_t;
+typedef struct pars_user_func_struct pars_user_func_t;
+typedef struct pars_bound_lit_struct pars_bound_lit_t;
+typedef struct pars_bound_id_struct pars_bound_id_t;
+typedef struct sym_node_struct sym_node_t;
+typedef struct sym_tab_struct sym_tab_t;
+typedef struct pars_res_word_struct pars_res_word_t;
+typedef struct func_node_struct func_node_t;
+typedef struct order_node_struct order_node_t;
+typedef struct proc_node_struct proc_node_t;
+typedef struct elsif_node_struct elsif_node_t;
+typedef struct if_node_struct if_node_t;
+typedef struct while_node_struct while_node_t;
+typedef struct for_node_struct for_node_t;
+typedef struct exit_node_struct exit_node_t;
+typedef struct return_node_struct return_node_t;
+typedef struct assign_node_struct assign_node_t;
+typedef struct col_assign_node_struct col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t) sym_node_list_t;
+
+#endif
diff --git a/storage/xtradb/include/que0que.h b/storage/xtradb/include/que0que.h
new file mode 100644
index 00000000000..ed48f980294
--- /dev/null
+++ b/storage/xtradb/include/que0que.h
@@ -0,0 +1,529 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/* If the following flag is set TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+extern ibool que_trace_on;
+
+/***********************************************************************//**
+Adds a query graph to the session's list of graphs. */
+UNIV_INTERN
+void
+que_graph_publish(
+/*==============*/
+ que_t* graph, /*!< in: graph */
+ sess_t* sess); /*!< in: session */
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork); /*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent);/*!< in: parent */
+/***********************************************************************//**
+Creates a query graph thread node.
+@return own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+ que_fork_t* parent, /*!< in: parent node, i.e., a fork node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node); /*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+ que_t* graph); /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
+to be reserved.
+@return TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+ que_thr_t* thr, /*!< in: a query thread */
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /*!< in: query thread */
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
+query thread is stopped and made inactive, except in the case where
+it was put to the lock wait state in lock0lock.c, but the lock has already
+been granted or the transaction chosen as a victim in deadlock resolution. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+After signal handling is finished, returns control to a query graph error
+handling routine. (Currently, just returns the control to the root of the
+graph so that the graph can communicate an error message to the client.) */
+UNIV_INTERN
+void
+que_fork_error_handle(
+/*==================*/
+ trx_t* trx, /*!< in: trx */
+ que_t* fork); /*!< in: query graph which was run before signal
+ handling started, NULL not allowed */
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a single worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion. */
+UNIV_INTERN
+void
+que_thr_end_wait(
+/*=============*/
+ que_thr_t* thr, /*!< in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/**********************************************************************//**
+Same as que_thr_end_wait, but without the next_thr parameter. */
+UNIV_INTERN
+void
+que_thr_end_wait_no_next_thr(
+/*=========================*/
+ que_thr_t* thr); /*!< in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork); /*!< in: a query fork */
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr); /*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node); /*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size); /*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node); /*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node); /*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Catenates a query graph node to a possibly empty list of them.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node); /*!< in: node */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, or 0 for a NULL list */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list); /*!< in: node list, or NULL */
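+
+/* Usage sketch: node lists are singly linked through the 'brother' pointer
+of que_common_t (see que0types.h), so a list is built and measured as below,
+where node_a and node_b stand for previously created graph nodes:
+
+ que_node_t* list = NULL;
+
+ list = que_node_list_add_last(list, node_a);
+ list = que_node_list_add_last(list, node_b);
+
+ ut_ad(que_node_list_get_len(list) == 2);
+*/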
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the kernel mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
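+
+/* Usage sketch: as the note above says, a positive peek made without the
+kernel mutex is only a hint, so a caller would typically re-check and stop
+under the mutex, e.g.
+
+ if (que_thr_peek_stop(thr)) {
+ ibool stopped;
+
+ mutex_enter(&kernel_mutex);
+ stopped = que_thr_stop(thr);
+ mutex_exit(&kernel_mutex);
+ ...
+ }
+
+kernel_mutex comes from srv0srv.h (included by this header), and
+que_thr_stop(), declared above, requires the mutex to be held. */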
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph); /*!< in: graph */
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node); /*!< in: query graph node */
+/*********************************************************************//**
+Evaluate the given SQL
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ ibool reserve_dict_mutex,
+ /*!< in: if TRUE, acquire/release
+ dict_sys->mutex around call to pars_sql. */
+ trx_t* trx); /*!< in: trx */
+
+/* Query graph query thread node: the fields are protected by the kernel
+mutex with the exceptions named below */
+
+struct que_thr_struct{
+ que_common_t common; /*!< type: QUE_NODE_THR */
+ ulint magic_n; /*!< magic number to catch memory
+ corruption */
+ que_node_t* child; /*!< graph child node */
+ que_t* graph; /*!< graph where this node belongs */
+ ibool is_active; /*!< TRUE if the thread has been set
+ to the run state in
+ que_thr_move_to_run_state, but not
+ deactivated in
+ que_thr_dec_reference_count */
+ ulint state; /*!< state of the query thread */
+ UT_LIST_NODE_T(que_thr_t)
+ thrs; /*!< list of thread nodes of the fork
+ node */
+ UT_LIST_NODE_T(que_thr_t)
+ trx_thrs; /*!< lists of threads in wait list of
+ the trx */
+ UT_LIST_NODE_T(que_thr_t)
+ queue; /*!< list of runnable thread nodes in
+ the server task queue */
+ /*------------------------------*/
+ /* The following fields are private to the OS thread executing the
+ query thread, and are not protected by the kernel mutex: */
+
+ que_node_t* run_node; /*!< pointer to the node where the
+ subgraph down from this node is
+ currently executed */
+ que_node_t* prev_node; /*!< pointer to the node from which
+ the control came */
+ ulint resource; /*!< resource usage of the query thread
+ thus far */
+ ulint lock_state; /*!< lock state of thread (table or
+ row) */
+ ulint fk_cascade_depth; /*!< maximum cascading call depth
+ supported for foreign key constraint
+ related delete/updates */
+};
+
+#define QUE_THR_MAGIC_N 8476583
+#define QUE_THR_MAGIC_FREED 123461526
+
+/* Query graph fork node: its fields are protected by the kernel mutex */
+struct que_fork_struct{
+ que_common_t common; /*!< type: QUE_NODE_FORK */
+ que_t* graph; /*!< query graph of this node */
+ ulint fork_type; /*!< fork type */
+ ulint n_active_thrs; /*!< if this is the root of a graph, the
+ number of query threads that have been
+ started in que_thr_move_to_run_state
+ but for which que_thr_dec_refer_count
+ has not yet been called */
+ trx_t* trx; /*!< transaction: this is set only in
+ the root node */
+ ulint state; /*!< state of the fork node */
+ que_thr_t* caller; /*!< pointer to a possible calling query
+ thread */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ thrs; /*!< list of query threads */
+ /*------------------------------*/
+ /* The fields in this section are defined only in the root node */
+ sym_tab_t* sym_tab; /*!< symbol table of the query,
+ generated by the parser, or NULL
+ if the graph was created 'by hand' */
+ pars_info_t* info; /*!< info struct, or NULL */
+ /* The following cur_... fields are relevant only in a select graph */
+
+ ulint cur_end; /*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START,
+ QUE_CUR_END */
+ ulint cur_pos; /*!< if there are n rows in the result
+ set, values 0 and n + 1 mean before
+ first row, or after last row, depending
+ on cur_end; values 1...n mean a row
+ index */
+ ibool cur_on_row; /*!< TRUE if cursor is on a row, i.e.,
+ it is not before the first row or
+ after the last row */
+ dulint n_inserts; /*!< number of rows inserted */
+ dulint n_updates; /*!< number of rows updated */
+ dulint n_deletes; /*!< number of rows deleted */
+ sel_node_t* last_sel_node; /*!< last executed select node, or NULL
+ if none */
+ UT_LIST_NODE_T(que_fork_t)
+ graphs; /*!< list of query graphs of a session
+ or a stored procedure */
+ /*------------------------------*/
+ mem_heap_t* heap; /*!< memory heap where the fork was
+ created */
+
+};
+
+/* Query fork (or graph) types */
+#define QUE_FORK_SELECT_NON_SCROLL 1 /* forward-only cursor */
+#define QUE_FORK_SELECT_SCROLL 2 /* scrollable cursor */
+#define QUE_FORK_INSERT 3
+#define QUE_FORK_UPDATE 4
+#define QUE_FORK_ROLLBACK 5
+ /* This is really the undo graph used in rollback,
+ no signal-sending roll_node in this graph */
+#define QUE_FORK_PURGE 6
+#define QUE_FORK_EXECUTE 7
+#define QUE_FORK_PROCEDURE 8
+#define QUE_FORK_PROCEDURE_CALL 9
+#define QUE_FORK_MYSQL_INTERFACE 10
+#define QUE_FORK_RECOVERY 11
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE 1
+#define QUE_FORK_COMMAND_WAIT 2
+#define QUE_FORK_INVALID 3
+#define QUE_FORK_BEING_FREED 4
+
+/* Flag which is ORed into the node type codes of control structure statements */
+#define QUE_NODE_CONTROL_STAT 1024
+
+/* Query graph node types */
+#define QUE_NODE_LOCK 1
+#define QUE_NODE_INSERT 2
+#define QUE_NODE_UPDATE 4
+#define QUE_NODE_CURSOR 5
+#define QUE_NODE_SELECT 6
+#define QUE_NODE_AGGREGATE 7
+#define QUE_NODE_FORK 8
+#define QUE_NODE_THR 9
+#define QUE_NODE_UNDO 10
+#define QUE_NODE_COMMIT 11
+#define QUE_NODE_ROLLBACK 12
+#define QUE_NODE_PURGE 13
+#define QUE_NODE_CREATE_TABLE 14
+#define QUE_NODE_CREATE_INDEX 15
+#define QUE_NODE_SYMBOL 16
+#define QUE_NODE_RES_WORD 17
+#define QUE_NODE_FUNC 18
+#define QUE_NODE_ORDER 19
+#define QUE_NODE_PROC (20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF (21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE (22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT 23
+#define QUE_NODE_FETCH 24
+#define QUE_NODE_OPEN 25
+#define QUE_NODE_COL_ASSIGNMENT 26
+#define QUE_NODE_FOR (27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN 28
+#define QUE_NODE_ROW_PRINTF 29
+#define QUE_NODE_ELSIF 30
+#define QUE_NODE_CALL 31
+#define QUE_NODE_EXIT 32
+
+#define QUE_NODE_INSERT_STATS 34
+
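+/* Usage sketch: QUE_NODE_CONTROL_STAT (1024) is a power of two larger than
+any of the base type codes above, so a node can be tested for being a
+control structure statement with a single bit test:
+
+ if (que_node_get_type(node) & QUE_NODE_CONTROL_STAT) {
+ ...
+ }
+
+instead of comparing against QUE_NODE_PROC, QUE_NODE_IF, etc. in turn. */
+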
+/* Query thread states */
+#define QUE_THR_RUNNING 1
+#define QUE_THR_PROCEDURE_WAIT 2
+#define QUE_THR_COMPLETED 3 /* in selects this means that the
+ thread is at the end of its result set
+ (or start, in case of a scroll cursor);
+ in other statements, this means the
+ thread has done its task */
+#define QUE_THR_COMMAND_WAIT 4
+#define QUE_THR_LOCK_WAIT 5
+#define QUE_THR_SIG_REPLY_WAIT 6
+#define QUE_THR_SUSPENDED 7
+#define QUE_THR_ERROR 8
+
+/* Query thread lock states */
+#define QUE_THR_LOCK_NOLOCK 0
+#define QUE_THR_LOCK_ROW 1
+#define QUE_THR_LOCK_TABLE 2
+
+/* From where the cursor position is counted */
+#define QUE_CUR_NOT_DEFINED 1
+#define QUE_CUR_START 2
+#define QUE_CUR_END 3
+
+
+#ifndef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/que0que.ic b/storage/xtradb/include/que0que.ic
new file mode 100644
index 00000000000..bd936670e1e
--- /dev/null
+++ b/storage/xtradb/include/que0que.ic
@@ -0,0 +1,287 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.ic
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+
+ return(thr->graph->trx);
+}
+
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+ const que_thr_t* thr) /*!< in: query thread */
+{
+ return(trx_is_recv(thr->graph->trx));
+}
+
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+ que_fork_t* fork) /*!< in: query fork */
+{
+ que_thr_t* thr;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ return(thr->child);
+}
+
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*)node)->type);
+}
+
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(&(((que_common_t*)node)->val));
+}
+
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(((que_common_t*)node)->val_buf_size);
+}
+
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+ que_node_t* node, /*!< in: graph node */
+ ulint size) /*!< in: size */
+{
+ ut_ad(node);
+
+ ((que_common_t*)node)->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+ que_node_t* node, /*!< in: graph node */
+ que_node_t* parent) /*!< in: parent */
+{
+ ut_ad(node);
+
+ ((que_common_t*)node)->parent = parent;
+}
+
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+ que_node_t* node) /*!< in: graph node */
+{
+ ut_ad(node);
+
+ return(dfield_get_type(&((que_common_t*) node)->val));
+}
+
+/*********************************************************************//**
+Catenates a query graph node to a possibly empty list of them.
+@return one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+ que_node_t* node_list, /*!< in: node list, or NULL */
+ que_node_t* node) /*!< in: node */
+{
+ que_common_t* cnode;
+ que_common_t* cnode2;
+
+ cnode = (que_common_t*) node;
+
+ cnode->brother = NULL;
+
+ if (node_list == NULL) {
+
+ return(node);
+ }
+
+ cnode2 = (que_common_t*) node_list;
+
+ while (cnode2->brother != NULL) {
+ cnode2 = (que_common_t*) cnode2->brother;
+ }
+
+ cnode2->brother = node;
+
+ return(node_list);
+}
+
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes.
+@return next node in a list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+ que_node_t* node) /*!< in: node in a list */
+{
+ return(((que_common_t*)node)->brother);
+}
+
+/*********************************************************************//**
+Gets a query graph node list length.
+@return length, or 0 for a NULL list */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+ que_node_t* node_list) /*!< in: node list, or NULL */
+{
+ const que_common_t* cnode;
+ ulint len;
+
+ cnode = (const que_common_t*) node_list;
+ len = 0;
+
+ while (cnode != NULL) {
+ len++;
+ cnode = (const que_common_t*) cnode->brother;
+ }
+
+ return(len);
+}
+
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+ que_node_t* node) /*!< in: node */
+{
+ return(((que_common_t*)node)->parent);
+}
+
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the kernel mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_t* graph;
+
+ graph = thr->graph;
+ trx = graph->trx;
+
+ if (graph->state != QUE_FORK_ACTIVE
+ || trx->que_state == TRX_QUE_LOCK_WAIT
+ || (UT_LIST_GET_LEN(trx->signals) > 0
+ && trx->que_state == TRX_QUE_RUNNING)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+ que_t* graph) /*!< in: graph */
+{
+ if (graph->fork_type == QUE_FORK_SELECT_SCROLL
+ || graph->fork_type == QUE_FORK_SELECT_NON_SCROLL) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/storage/xtradb/include/que0types.h b/storage/xtradb/include/que0types.h
new file mode 100644
index 00000000000..ea976074768
--- /dev/null
+++ b/storage/xtradb/include/que0types.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+#include "dict0types.h"
+
+/* Pseudotype for all graph nodes */
+typedef void que_node_t;
+
+typedef struct que_fork_struct que_fork_t;
+
+/* Query graph root is a fork node */
+typedef que_fork_t que_t;
+
+typedef struct que_thr_struct que_thr_t;
+typedef struct que_common_struct que_common_t;
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_struct{
+ ulint type; /*!< query node type */
+ que_node_t* parent; /*!< back pointer to parent node, or NULL */
+ que_node_t* brother;/* pointer to a possible brother node */
+ dfield_t val; /*!< evaluated value for an expression */
+ ulint val_buf_size;
+ /* buffer size for the evaluated value data,
+ if the buffer has been allocated dynamically:
+ if this field is != 0, and the node is a
+ symbol node or a function node, then we
+ have to free the data field in val
+ explicitly */
+};
+
+#endif
diff --git a/storage/xtradb/include/read0read.h b/storage/xtradb/include/read0read.h
new file mode 100644
index 00000000000..4d9a9fade36
--- /dev/null
+++ b/storage/xtradb/include/read0read.h
@@ -0,0 +1,194 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0read_h
+#define read0read_h
+
+#include "univ.i"
+
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "trx0trx.h"
+#include "read0types.h"
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or ut_dulint_zero
+ used in purge */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ allocated */
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, or opens a new one. The view
+must be closed with ..._close.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_oldest_copy_or_open_new(
+/*==============================*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or ut_dulint_zero
+ used in purge */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ allocated */
+/*********************************************************************//**
+Closes a read view. */
+UNIV_INTERN
+void
+read_view_close(
+/*============*/
+ read_view_t* view); /*!< in: read view */
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx); /*!< in: trx which has a read view */
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return TRUE if sees */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+ const read_view_t* view, /*!< in: read view */
+ trx_id_t trx_id);/*!< in: trx id */
+/*********************************************************************//**
+Prints a read view to stderr. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+ const read_view_t* view); /*!< in: read view */
+/*********************************************************************//**
+Creates a consistent cursor view for MySQL to be used in cursors. In this
+consistent read view, modifications done by the creating transaction or by
+future transactions are not visible. */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+ trx_t* cr_trx);/*!< in: trx where cursor view is created */
+/*********************************************************************//**
+Close a given consistent cursor view for mysql and restore global read view
+back to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+ trx_t* trx, /*!< in: trx */
+ cursor_view_t* curview); /*!< in: cursor view to be closed */
+/*********************************************************************//**
+This function sets the given consistent cursor view as the transaction's
+read view, if the given cursor view is not NULL. Otherwise, it restores the
+transaction's global read view as its read view. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+ trx_t* trx, /*!< in: transaction where cursor is set */
+ cursor_view_t* curview);/*!< in: consistent cursor view to be set */
+
+/** A read view lists the trx ids of those transactions for which a
+consistent read should not see the modifications to the database. */
+
+struct read_view_struct{
+ ulint type; /*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */
+	undo_no_t	undo_no;/*!< ut_dulint_zero, or if type is
+				VIEW_HIGH_GRANULARITY, the transaction
+				undo_no at the time this high-granularity
+				consistent read view was created */
+ trx_id_t low_limit_no;
+ /*!< The view does not need to see the undo
+ logs for transactions whose transaction number
+ is strictly smaller (<) than this value: they
+ can be removed in purge if not needed by other
+ views */
+ trx_id_t low_limit_id;
+ /*!< The read should not see any transaction
+ with trx id >= this value. In other words,
+ this is the "high water mark". */
+ trx_id_t up_limit_id;
+ /*!< The read should see all trx ids which
+ are strictly smaller (<) than this value.
+ In other words,
+ this is the "low water mark". */
+ ulint n_trx_ids;
+ /*!< Number of cells in the trx_ids array */
+ trx_id_t* trx_ids;/*!< Additional trx ids which the read should
+ not see: typically, these are the active
+ transactions at the time when the read is
+ serialized, except the reading transaction
+ itself; the trx ids in this array are in a
+ descending order. These trx_ids should be
+ between the "low" and "high" water marks,
+ that is, up_limit_id and low_limit_id. */
+ trx_id_t creator_trx_id;
+ /*!< trx id of creating transaction, or
+ ut_dulint_zero used in purge */
+ UT_LIST_NODE_T(read_view_t) view_list;
+ /*!< List of read views in trx_sys */
+};
+
+/** Read view types @{ */
+#define VIEW_NORMAL		1	/*!< Normal consistent read view
+					where the transaction does not see
+					changes made by active transactions,
+					except those of the creating
+					transaction itself. */
+#define VIEW_HIGH_GRANULARITY	2	/*!< High-granularity read view where
+					the transaction does not see changes
+					made by active transactions, nor its
+					own changes made after the point in
+					time when this read view was created. */
+/* @} */
+
+/** Implements the InnoDB framework to support consistent read views in
+cursors. This struct holds both the heap where the consistent read view
+is allocated and a pointer to the read view. */
+
+struct cursor_view_struct{
+ mem_heap_t* heap;
+ /*!< Memory heap for the cursor view */
+ read_view_t* read_view;
+ /*!< Consistent read view of the cursor*/
+ ulint n_mysql_tables_in_use;
+ /*!< number of Innobase tables used in the
+ processing of this cursor */
+};
+
+#ifndef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#endif
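A minimal usage sketch of the read-view API declared above (illustrative only): the heap handling, the trx and rec_trx_id variables, and the surrounding synchronization that trx_sys requires are assumptions, not something this header prescribes.

	mem_heap_t*	heap = mem_heap_create(256);	/* heap that will own the view */
	read_view_t*	view;

	/* snapshot the set of transactions serialized so far;
	trx->id identifies the creating transaction */
	view = read_view_open_now(trx->id, heap);

	if (read_view_sees_trx_id(view, rec_trx_id)) {
		/* the record version written by rec_trx_id is
		visible to this consistent read */
	}

	read_view_close(view);	/* unlink the view again */
	mem_heap_free(heap);	/* release the memory of the view */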
diff --git a/storage/xtradb/include/read0read.ic b/storage/xtradb/include/read0read.ic
new file mode 100644
index 00000000000..9924967cc2d
--- /dev/null
+++ b/storage/xtradb/include/read0read.ic
@@ -0,0 +1,98 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.ic
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+/*********************************************************************//**
+Gets the nth trx id in a read view.
+@return trx id */
+UNIV_INLINE
+trx_id_t
+read_view_get_nth_trx_id(
+/*=====================*/
+ const read_view_t* view, /*!< in: read view */
+ ulint n) /*!< in: position */
+{
+ ut_ad(n < view->n_trx_ids);
+
+ return(*(view->trx_ids + n));
+}
+
+/*********************************************************************//**
+Sets the nth trx id in a read view. */
+UNIV_INLINE
+void
+read_view_set_nth_trx_id(
+/*=====================*/
+ read_view_t* view, /*!< in: read view */
+ ulint n, /*!< in: position */
+ trx_id_t trx_id) /*!< in: trx id to set */
+{
+ ut_ad(n < view->n_trx_ids);
+
+ *(view->trx_ids + n) = trx_id;
+}
+
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return TRUE if sees */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+ const read_view_t* view, /*!< in: read view */
+ trx_id_t trx_id) /*!< in: trx id */
+{
+ ulint n_ids;
+ int cmp;
+ ulint i;
+
+ if (ut_dulint_cmp(trx_id, view->up_limit_id) < 0) {
+
+ return(TRUE);
+ }
+
+ if (ut_dulint_cmp(trx_id, view->low_limit_id) >= 0) {
+
+ return(FALSE);
+ }
+
+ /* We go through the trx ids in the array smallest first: this order
+ may save CPU time, because if there was a very long running
+ transaction in the trx id array, its trx id is looked at first, and
+ the first two comparisons may well decide the visibility of trx_id. */
+
+ n_ids = view->n_trx_ids;
+
+ for (i = 0; i < n_ids; i++) {
+
+ cmp = ut_dulint_cmp(
+ trx_id,
+ read_view_get_nth_trx_id(view, n_ids - i - 1));
+ if (cmp <= 0) {
+ return(cmp < 0);
+ }
+ }
+
+ return(TRUE);
+}
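To make the water marks above concrete with illustrative numbers (real trx ids are dulints compared through ut_dulint_cmp()): for a view with up_limit_id = 100, low_limit_id = 200 and trx_ids = {180, 150}, trx id 90 is seen because it lies below the low water mark, 210 is not seen because it lies at or above the high water mark, 150 is not seen because it appears in the array, and 130 is seen because it lies between the water marks but is absent from the array.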
diff --git a/storage/xtradb/include/read0types.h b/storage/xtradb/include/read0types.h
new file mode 100644
index 00000000000..caf69e3fb51
--- /dev/null
+++ b/storage/xtradb/include/read0types.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0types_h
+#define read0types_h
+
+typedef struct read_view_struct read_view_t;
+typedef struct cursor_view_struct cursor_view_t;
+
+#endif
diff --git a/storage/xtradb/include/rem0cmp.h b/storage/xtradb/include/rem0cmp.h
new file mode 100644
index 00000000000..fcea62ad486
--- /dev/null
+++ b/storage/xtradb/include/rem0cmp.h
@@ -0,0 +1,197 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef rem0cmp_h
+#define rem0cmp_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets);
+ /*!< in: whether to check charsets */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2); /*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2);/*!< in: data field */
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only the first dtuple->n_fields_cmp fields of the data tuple are taken
+into account! If we denote n = n_fields_cmp, then rec must either have
+m >= n fields, or it must differ from dtuple in some of the m fields it
+has. If rec has an externally stored field, we do not compare it but
+return 0 if such a comparison would be needed.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current comparison */
+ ulint* matched_bytes); /*!< in/out: number of already matched
+ bytes within the first field not completely
+				matched; when the function returns, contains
+				the value for the current comparison */
+/**************************************************************//**
+Compares a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+	ibool*			null_eq);/*!< out: set to TRUE if
+					matching null values were found */
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index, /*!< in: data dictionary index */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when the function returns,
+				contains the value for the current
+ comparison */
+ ulint* matched_bytes, /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when the function returns, contains
+ the value for the current comparison */
+ ulint stats_method);
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index); /*!< in: data dictionary index */
+
+
+#ifndef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#endif
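A minimal sketch of how the in/out match counters of cmp_dtuple_rec_with_match() are typically primed; the dtuple, rec and offsets variables are assumed to come from the caller.

	ulint	matched_fields = 0;	/* nothing matched yet */
	ulint	matched_bytes = 0;
	int	cmp;

	cmp = cmp_dtuple_rec_with_match(dtuple, rec, offsets,
					&matched_fields, &matched_bytes);

	if (cmp == 0) {
		/* dtuple equals rec on the compared prefix */
	} else {
		/* cmp is 1 or -1; matched_fields and matched_bytes
		report how far the comparison got */
	}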
diff --git a/storage/xtradb/include/rem0cmp.ic b/storage/xtradb/include/rem0cmp.ic
new file mode 100644
index 00000000000..d5185ec94af
--- /dev/null
+++ b/storage/xtradb/include/rem0cmp.ic
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.ic
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /*!< in: data field length or UNIV_SQL_NULL */
+{
+ return(cmp_data_data_slow(mtype, prtype, data1, len1, data2, len2));
+}
+
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+ const dfield_t* dfield1,/*!< in: data field; must have type field set */
+ const dfield_t* dfield2)/*!< in: data field */
+{
+ const dtype_t* type;
+
+ ut_ad(dfield_check_typed(dfield1));
+
+ type = dfield_get_type(dfield1);
+
+ return(cmp_data_data(type->mtype, type->prtype,
+ (const byte*) dfield_get_data(dfield1),
+ dfield_get_len(dfield1),
+ (const byte*) dfield_get_data(dfield2),
+ dfield_get_len(dfield2)));
+}
+
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index) /*!< in: data dictionary index */
+{
+ ulint match_f = 0;
+ ulint match_b = 0;
+
+ return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index,
+ &match_f, &match_b, 0));
+}
diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h
new file mode 100644
index 00000000000..17d08afabb9
--- /dev/null
+++ b/storage/xtradb/include/rem0rec.h
@@ -0,0 +1,824 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG 0x10UL
+/* The deleted flag in info bits */
+#define REC_INFO_DELETED_FLAG	0x20UL	/* when this bit is set to 1, it means
+					the record has been delete marked */
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES 6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES 5
+
+/* Record status values */
+#define REC_STATUS_ORDINARY 0
+#define REC_STATUS_NODE_PTR 1
+#define REC_STATUS_INFIMUM 2
+#define REC_STATUS_SUPREMUM 3
+
+/* The following constants are needed in page0zip.c in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO 4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define REC_HEAP_NO_SHIFT 3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE 4
+
+#ifdef UNIV_DEBUG
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE 4
+#else /* UNIV_DEBUG */
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE 2
+#endif /* UNIV_DEBUG */
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+#define REC_OFFS_NORMAL_SIZE 100
+#define REC_OFFS_SMALL_SIZE 10
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next); /*!< in: offset of the next record */
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next); /*!< in: offset of the next record */
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index); /*!< in: record descriptor */
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec); /*!< in: old-style physical record */
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+ rec_t* rec, /*!< in: old-style physical record */
+	ulint	n_owned);	/*!< in: the number of owned records */
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec); /*!< in: new-style physical record */
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		n_owned);/*!< in: the number of owned records */
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint bits); /*!< in: info bits */
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint bits); /*!< in: info bits */
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits); /*!< in: info bits */
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: compact physical record */
+ ulint bits); /*!< in: info bits */
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp); /*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint flag); /*!< in: nonzero if delete marked */
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint flag); /*!< in: nonzero if delete marked */
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint heap_no);/*!< in: the heap number */
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec); /*!< in: physical record */
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint heap_no);/*!< in: the heap number */
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n); /*!< in: number of columns to scan */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array.
+@return the new offsets */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: array consisting of
+ offsets[0] allocated elements,
+ or an array from rec_get_offsets(),
+ or NULL */
+ ulint n_fields,/*!< in: maximum number of
+ initialized fields
+ (ULINT_UNDEFINED if all fields) */
+ mem_heap_t** heap, /*!< in/out: memory heap */
+ const char* file, /*!< in: file name where called */
+ ulint line); /*!< in: line number where called */
+
+#define rec_get_offsets(rec,index,offsets,n,heap) \
+ rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__)
+
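REC_OFFS_NORMAL_SIZE and the rec_get_offsets() macro above are normally used together with rec_offs_init(), declared further down in this header: the caller reserves a small array, and rec_get_offsets() falls back to a memory heap only when the record has more fields than the array can describe. A minimal sketch of that pattern, with rec and index assumed to come from the caller:

	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);

	/* ... use the offsets, e.g. rec_offs_n_fields(offsets) ... */

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);	/* a heap was only created on overflow */
	}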
+/******************************************************//**
+Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT. This is a special case of
+rec_init_offsets() and rec_get_offsets_func(). */
+UNIV_INTERN
+void
+rec_init_offsets_comp_ordinary(
+/*===========================*/
+ const rec_t* rec, /*!< in: physical record in
+ ROW_FORMAT=COMPACT */
+ ulint extra, /*!< in: number of bytes to reserve
+ between the record header and
+ the data payload
+ (usually REC_N_NEW_EXTRA_BYTES) */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets);/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ ulint* offsets);/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: record or NULL */
+ const dict_index_t* index, /*!< in: record descriptor or NULL */
+ const ulint* offsets);/*!< in: array returned by
+ rec_get_offsets() */
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets);/*!< in: array returned by
+ rec_get_offsets() */
+#else
+# define rec_offs_make_valid(rec, index, offsets) ((void) 0)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len); /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+An SQL null may also occupy a field of size > 0,
+if the data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n); /*!< in: index of the field */
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len); /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/******************************************************//**
+Returns nonzero if the extern bit is set in the nth field of rec.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n); /*!< in: nth field */
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in the nth field of rec.
+@return nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n); /*!< in: nth field */
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n); /*!< in: nth field */
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index number of the field */
+ const void* data, /*!< in: pointer to the data if not SQL null */
+ ulint len); /*!< in: length of the data or UNIV_SQL_NULL */
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec); /*!< in: physical record */
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+ const ulint* offsets);/*!< in: array for rec_get_offsets() */
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ ulint* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc); /*!< in: number of elements */
+#define rec_offs_init(offsets) \
+ rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns the total size of the record minus the data size of the record.
+The value returned by the function is the distance from record
+start to record origin in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ void* buf, /*!< in: buffer */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size); /*!< in/out: buffer size */
+/************************************************************//**
+Folds a prefix of a physical record to a ulint.
+@return the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ const rec_t* rec, /*!< in: the physical record */
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint n_fields, /*!< in: number of complete
+ fields to fold */
+ ulint n_bytes, /*!< in: number of bytes to fold
+ in an incomplete last field */
+ dulint tree_id) /*!< in: index tree id */
+ __attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************//**
+Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_rec_comp(
+/*===========================*/
+ rec_t* rec, /*!< in: origin of record */
+ ulint extra, /*!< in: number of bytes to
+ reserve between the record
+ header and the data payload
+ (normally REC_N_NEW_EXTRA_BYTES) */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields);/*!< in: number of data fields */
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext); /*!< in: number of
+ externally stored columns */
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+ __attribute__((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra); /*!< out: extra size */
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra); /*!< out: extra size */
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext); /*!< in: number of externally stored columns */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a data tuple.
+The fields are copied to the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+ dtuple_t* tuple, /*!< out: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ mem_heap_t* heap); /*!< in: memory heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints an old-style physical record. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec); /*!< in: physical record */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT. Ignores the
+record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ dict_index_t* index); /*!< in: record descriptor */
+#endif /* !UNIV_HOTBACKUP */
+
+#define REC_INFO_BITS		6	/* This is a single-byte bit-field */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT 0x7FUL
+#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL
+
+/* The data size of a record must be smaller than this, because we reserve
+the two uppermost bits in a two-byte offset for special purposes */
+#define REC_MAX_DATA_SIZE (16 * 1024)
+
+#ifndef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#endif
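A minimal sketch of reading one field through the offsets array, using the rec_get_nth_field() macro and the UNIV_SQL_NULL convention documented above; rec, offsets and n are assumed to come from the caller.

	ulint		len;
	const byte*	field;

	field = rec_get_nth_field(rec, offsets, n, &len);

	if (len == UNIV_SQL_NULL) {
		/* the field is SQL NULL: no data bytes to read */
	} else {
		/* len bytes of data start at field */
	}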
diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic
new file mode 100644
index 00000000000..fa96c97f95e
--- /dev/null
+++ b/storage/xtradb/include/rem0rec.ic
@@ -0,0 +1,1647 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.ic
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0dict.h"
+
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */
+#define REC_OFFS_COMPACT ((ulint) 1 << 31)
+/* SQL NULL flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_SQL_NULL ((ulint) 1 << 31)
+/* External flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_EXTERNAL ((ulint) 1 << 30)
+/* Mask for offsets returned by rec_get_offsets() */
+#define REC_OFFS_MASK (REC_OFFS_EXTERNAL - 1)
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits pointer to next record
+ 2 8 bits pointer to next record
+ 3 1 bit short flag
+ 7 bits number of fields
+ 4 3 bits number of fields
+ 5 bits heap number
+ 5 8 bits heap number
+ 6 4 bits n_owned
+ 4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+ (1) byte offset (2) bit usage within byte
+ downward from
+ origin -> 1 8 bits relative offset of next record
+ 2 8 bits relative offset of next record
+ the relative offset is an unsigned 16-bit
+ integer:
+ (offset_of_next_record
+ - offset_of_this_record) mod 64Ki,
+ where mod is the modulo as a non-negative
+ number;
+ we can calculate the offset of the next
+ record with the formula:
+ relative_offset + offset_of_this_record
+ mod UNIV_PAGE_SIZE
+ 3 3 bits status:
+ 000=conventional record
+ 001=node pointer record (inside B-tree)
+ 010=infimum record
+ 011=supremum record
+ 1xx=reserved
+ 5 bits heap number
+ 4 8 bits heap number
+ 5 4 bits n_owned
+ 4 bits info bits
+*/
+
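As a worked example of the relative-offset rule above (illustrative numbers, assuming the default UNIV_PAGE_SIZE of 16384): if a record sits at page offset 16000 and its successor at offset 120, the stored field is (120 - 16000) mod 65536 = 49656; reading it back, (16000 + 49656) mod 16384 = 120, which recovers the successor's offset.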
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT 2
+#define REC_NEXT_MASK 0xFFFFUL
+#define REC_NEXT_SHIFT 0
+
+#define REC_OLD_SHORT	3	/* This is a single-byte bit-field */
+#define REC_OLD_SHORT_MASK 0x1UL
+#define REC_OLD_SHORT_SHIFT 0
+
+#define REC_OLD_N_FIELDS 4
+#define REC_OLD_N_FIELDS_MASK 0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT 1
+
+#define REC_NEW_STATUS	3	/* This is a single-byte bit-field */
+#define REC_NEW_STATUS_MASK 0x7UL
+#define REC_NEW_STATUS_SHIFT 0
+
+#define REC_OLD_HEAP_NO 5
+#define REC_HEAP_NO_MASK 0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.c */
+#define REC_NEW_HEAP_NO 4
+#define REC_HEAP_NO_SHIFT 3
+#endif
+
+#define REC_OLD_N_OWNED	6	/* This is a single-byte bit-field */
+#define REC_NEW_N_OWNED	5	/* This is a single-byte bit-field */
+#define REC_N_OWNED_MASK 0xFUL
+#define REC_N_OWNED_SHIFT 0
+
+#define REC_OLD_INFO_BITS	6	/* This is a single-byte bit-field */
+#define REC_NEW_INFO_BITS	5	/* This is a single-byte bit-field */
+#define REC_INFO_BITS_MASK 0xF0UL
+#define REC_INFO_BITS_SHIFT 0
+
+/* The following masks are used to filter the SQL null bit from
+one-byte and two-byte offsets */
+
+#define REC_1BYTE_SQL_NULL_MASK 0x80UL
+#define REC_2BYTE_SQL_NULL_MASK 0x8000UL
+
+/* In a 2-byte offset the second most significant bit denotes
+a field stored on another page: */
+
+#define REC_2BYTE_EXTERN_MASK 0x4000UL
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+ ^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+ ^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+ ^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+ ^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+ ^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+ ^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/***********************************************************//**
+Sets the value of the ith field SQL null bit of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint i, /*!< in: ith field */
+ ibool val); /*!< in: value to set */
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint n); /*!< in: index of the field */
+
+/******************************************************//**
+Gets a bit field from within 1 byte. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_1(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_1(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 1 byte. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask);
+ ut_ad(mask <= 0xFFUL);
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_1(rec - offs,
+ (mach_read_from_1(rec - offs) & ~mask)
+ | (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from within 2 bytes. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+ const rec_t* rec, /*!< in: pointer to record origin */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+
+ return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within 2 bytes. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+ rec_t* rec, /*!< in: pointer to record origin */
+ ulint val, /*!< in: value to set */
+ ulint offs, /*!< in: offset from the origin down */
+ ulint mask, /*!< in: mask used to filter bits */
+ ulint shift) /*!< in: shift right applied after masking */
+{
+ ut_ad(rec);
+ ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+ ut_ad(mask > 0xFFUL);
+ ut_ad(mask <= 0xFFFFUL);
+ ut_ad((mask >> shift) & 1);
+ ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+ ut_ad(((mask >> shift) << shift) == mask);
+ ut_ad(((val << shift) & mask) == (val << shift));
+
+ mach_write_to_2(rec - offs,
+ (mach_read_from_2(rec - offs) & ~mask)
+ | (val << shift));
+}
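
The four helpers above implement the low-level header access: the extra bytes
of a record grow downwards from the record origin, and each header field is
read or written with a mask and a shift. A standalone sketch of the same
arithmetic, using plain uint8_t storage and a hypothetical 4-bit field; the
real REC_* masks and the mach_read/mach_write routines are not used here.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical layout: one header byte, 3 bytes below the origin, holds a
   4-bit counter in bits 0..3, mirroring rec_{get,set}_bit_field_1(). */
#define DEMO_OFFS  3U
#define DEMO_MASK  0x0FU
#define DEMO_SHIFT 0U

static unsigned get_bit_field_1(const uint8_t *origin, unsigned offs,
                                unsigned mask, unsigned shift)
{
        /* The header lies below the origin, hence "origin - offs". */
        return (origin[-(int) offs] & mask) >> shift;
}

static void set_bit_field_1(uint8_t *origin, unsigned val, unsigned offs,
                            unsigned mask, unsigned shift)
{
        assert(((val << shift) & mask) == (val << shift));
        origin[-(int) offs] = (uint8_t)
                ((origin[-(int) offs] & ~mask) | (val << shift));
}

int main(void)
{
        uint8_t  buf[8] = {0};
        uint8_t* origin = buf + 4;      /* record origin; header precedes it */

        set_bit_field_1(origin, 5, DEMO_OFFS, DEMO_MASK, DEMO_SHIFT);
        printf("stored value = %u\n",
               get_bit_field_1(origin, DEMO_OFFS, DEMO_MASK, DEMO_SHIFT));
        return 0;                       /* prints: stored value = 5 */
}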
+
+/******************************************************//**
+The following function is used to get a pointer to the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+
+ ut_ad(REC_NEXT_MASK == 0xFFFFUL);
+ ut_ad(REC_NEXT_SHIFT == 0);
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (UNIV_UNLIKELY(field_value == 0)) {
+
+ return(NULL);
+ }
+
+ if (UNIV_LIKELY(comp != 0)) {
+#if UNIV_PAGE_SIZE <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as signed 16-bit integer in 2's complement arithmetics.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, UNIV_PAGE_SIZE)
+ < UNIV_PAGE_SIZE);
+#endif
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+ + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(field_value < UNIV_PAGE_SIZE);
+
+ return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+ + field_value);
+ }
+}
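
In the compact format the next-record field stores a 16-bit offset relative to
the current record, so the absolute position is recovered modulo the page
size; pointers to earlier records simply wrap around. A standalone arithmetic
sketch of that wrap-around, assuming a hypothetical 16 KiB page and plain
integers in place of ut_align_down()/ut_align_offset().

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 16384U        /* assumption: UNIV_PAGE_SIZE of 16 KiB */

/* Mirror of the compact-format branch above: page start plus
   ((record offset + relative offset) mod PAGE_SIZE). */
static uint32_t next_rec_offset(uint32_t rec_offset_in_page,
                                uint16_t field_value)
{
        return (rec_offset_in_page + field_value) % PAGE_SIZE;
}

int main(void)
{
        /* A record near the page end pointing "backwards": the 16-bit field
           holds the negative distance in two's complement. */
        uint32_t rec_offs = 16000;
        uint16_t rel      = (uint16_t) -15872;  /* next record at offset 128 */

        printf("next record at page offset %u\n",
               (unsigned) next_rec_offset(rec_offs, rel)); /* prints 128 */
        return 0;
}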
+
+/******************************************************//**
+The following function is used to get a pointer to the next chained record
+on the same page.
+@return pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+ rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return((rec_t*) rec_get_next_ptr_const(rec, comp));
+}
+
+/******************************************************//**
+The following function is used to get the offset of the next chained record
+on the same page.
+@return the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint field_value;
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+ field_value = mach_read_from_2(rec - REC_NEXT);
+
+ if (UNIV_LIKELY(comp != 0)) {
+#if UNIV_PAGE_SIZE <= 32768
+ /* Note that for 64 KiB pages, field_value can 'wrap around'
+ and the debug assertion is not valid */
+
+ /* In the following assertion, field_value is interpreted
+ as signed 16-bit integer in 2's complement arithmetics.
+ If all platforms defined int16_t in the standard headers,
+ the expression could be written simpler as
+ (int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+ */
+ ut_ad((field_value >= 32768
+ ? field_value - 65536
+ : field_value)
+ + ut_align_offset(rec, UNIV_PAGE_SIZE)
+ < UNIV_PAGE_SIZE);
+#endif
+ if (UNIV_UNLIKELY(field_value == 0)) {
+
+ return(0);
+ }
+
+ /* There must be at least REC_N_NEW_EXTRA_BYTES + 1
+ between each record. */
+ ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+ && field_value < 32768)
+ || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+ return(ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+ } else {
+ ut_ad(field_value < UNIV_PAGE_SIZE);
+
+ return(field_value);
+ }
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+ mach_write_to_2(rec - REC_NEXT, next);
+}
+
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint next) /*!< in: offset of the next record */
+{
+ ulint field_value;
+
+ ut_ad(rec);
+ ut_ad(UNIV_PAGE_SIZE > next);
+
+ if (UNIV_UNLIKELY(!next)) {
+ field_value = 0;
+ } else {
+ /* The following two statements calculate
+ next - offset_of_rec mod 64Ki, where mod is the modulo
+ as a non-negative number */
+
+ field_value = (ulint)
+ ((lint) next
+ - (lint) ut_align_offset(rec, UNIV_PAGE_SIZE));
+ field_value &= REC_NEXT_MASK;
+ }
+
+ mach_write_to_2(rec - REC_NEXT, field_value);
+}
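
The setter stores the inverse quantity, (next - offset_of_rec) mod 64 Ki, in
the 16-bit field. A minimal round-trip check with plain integers standing in
for ulint/lint and a hypothetical pair of page offsets.

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 16384U        /* assumption: 16 KiB pages */

int main(void)
{
        uint32_t rec_offs  = 16000;     /* page offset of this record */
        uint32_t next_offs = 128;       /* page offset of the next record */

        /* Encode: the difference, truncated to 16 bits (REC_NEXT_MASK). */
        uint16_t field_value = (uint16_t) (next_offs - rec_offs);

        /* Decode, as rec_get_next_offs() does for the compact format. */
        uint32_t decoded = (rec_offs + field_value) % PAGE_SIZE;

        assert(decoded == next_offs);
        return 0;
}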
+
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_2(rec, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK,
+ REC_OLD_N_FIELDS_SHIFT);
+ ut_ad(ret <= REC_MAX_N_FIELDS);
+ ut_ad(ret > 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to set the number of fields
+in an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint n_fields) /*!< in: the number of fields */
+{
+ ut_ad(rec);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields > 0);
+
+ rec_set_bit_field_2(rec, n_fields, REC_OLD_N_FIELDS,
+ REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ulint ret;
+
+ ut_ad(rec);
+
+ ret = rec_get_bit_field_1(rec, REC_NEW_STATUS,
+ REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+ ut_ad((ret & ~REC_NEW_STATUS_MASK) == 0);
+
+ return(ret);
+}
+
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(rec);
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ return(rec_get_n_fields_old(rec));
+ }
+
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ return(dict_index_get_n_fields(index));
+ case REC_STATUS_NODE_PTR:
+ return(dict_index_get_n_unique_in_tree(index) + 1);
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ return(1);
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ }
+}
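
For compact records the field count is thus derived from the status bits
rather than stored in the record header. A standalone sketch of the same
dispatch, with hypothetical per-index counts standing in for
dict_index_get_n_fields() and dict_index_get_n_unique_in_tree().

#include <stdio.h>

enum rec_status { ORDINARY, NODE_PTR, INFIMUM, SUPREMUM };

/* Mirror of the switch in rec_get_n_fields(). */
static unsigned n_fields_for(enum rec_status status, unsigned n_index_fields,
                             unsigned n_unique_in_tree)
{
        switch (status) {
        case ORDINARY:  return n_index_fields;
        case NODE_PTR:  return n_unique_in_tree + 1; /* + child page no. */
        default:        return 1;                    /* infimum/supremum */
        }
}

int main(void)
{
        /* Hypothetical index: 5 fields, 2 of them unique in the tree. */
        printf("%u %u %u\n",
               n_fields_for(ORDINARY, 5, 2),
               n_fields_for(NODE_PTR, 5, 2),
               n_fields_for(SUPREMUM, 5, 2));   /* prints: 5 3 1 */
        return 0;
}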
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+ const rec_t* rec) /*!< in: old-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint n_owned) /*!< in: the number of owned */
+{
+ rec_set_bit_field_1(rec, n_owned, REC_OLD_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+ const rec_t* rec) /*!< in: new-style physical record */
+{
+ return(rec_get_bit_field_1(rec, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint n_owned)/*!< in: the number of owned */
+{
+ rec_set_bit_field_1(rec, n_owned, REC_NEW_N_OWNED,
+ REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+ if (UNIV_LIKELY_NULL(page_zip)
+ && UNIV_LIKELY(rec_get_status(rec)
+ != REC_STATUS_SUPREMUM)) {
+ page_zip_rec_set_owned(page_zip, rec, n_owned);
+ }
+}
+
+/******************************************************//**
+The following function is used to retrieve the info bits of a record.
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ return(rec_get_bit_field_1(
+ rec, comp ? REC_NEW_INFO_BITS : REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_OLD_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_NEW_INFO_BITS,
+ REC_INFO_BITS_MASK, REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info bits */
+{
+ rec_set_bit_field_1(rec, bits, REC_NEW_STATUS,
+ REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record. (Only compact records have status bits.)
+@return info bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ ulint bits;
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+ if (UNIV_LIKELY(comp != 0)) {
+ bits = rec_get_info_bits(rec, TRUE) | rec_get_status(rec);
+ } else {
+ bits = rec_get_info_bits(rec, FALSE);
+ ut_ad(!(bits & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+ }
+ return(bits);
+}
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record. (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint bits) /*!< in: info bits */
+{
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+ rec_set_status(rec, bits & REC_NEW_STATUS_MASK);
+ rec_set_info_bits_new(rec, bits & ~REC_NEW_STATUS_MASK);
+}
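
Because the status mask and the info-bits mask occupy disjoint bits (the
preprocessor check above guarantees it), the combined value can be split back
with plain masking. A standalone sketch using assumed mask values that are
merely consistent with that check; the real constants live in rem0rec.h.

#include <assert.h>
#include <stdio.h>

#define STATUS_MASK 0x07U       /* assumption: status in the low 3 bits */
#define INFO_MASK   0xF0U       /* assumption: info bits in the high nibble */

int main(void)
{
        unsigned status = 0x01;         /* hypothetical node-pointer status */
        unsigned info   = 0x20;         /* hypothetical delete-mark bit */

        unsigned combined = info | status;

        /* Split the combined value back, as rec_set_info_and_status_bits()
           does with REC_NEW_STATUS_MASK. */
        assert((combined & STATUS_MASK) == status);
        assert((combined & ~STATUS_MASK & INFO_MASK) == info);

        printf("combined bits = 0x%02X\n", combined);
        return 0;
}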
+
+/******************************************************//**
+The following function tells if a record is delete marked.
+@return nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ if (UNIV_LIKELY(comp != 0)) {
+ return(UNIV_UNLIKELY(
+ rec_get_bit_field_1(rec, REC_NEW_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT)));
+ } else {
+ return(UNIV_UNLIKELY(
+ rec_get_bit_field_1(rec, REC_OLD_INFO_BITS,
+ REC_INFO_DELETED_FLAG,
+ REC_INFO_BITS_SHIFT)));
+ }
+}
+
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+ rec_t* rec, /*!< in: old-style physical record */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ ulint val;
+
+ val = rec_get_info_bits(rec, FALSE);
+
+ if (flag) {
+ val |= REC_INFO_DELETED_FLAG;
+ } else {
+ val &= ~REC_INFO_DELETED_FLAG;
+ }
+
+ rec_set_info_bits_old(rec, val);
+}
+
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+ rec_t* rec, /*!< in/out: new-style physical record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint flag) /*!< in: nonzero if delete marked */
+{
+ ulint val;
+
+ val = rec_get_info_bits(rec, TRUE);
+
+ if (flag) {
+ val |= REC_INFO_DELETED_FLAG;
+ } else {
+ val &= ~REC_INFO_DELETED_FLAG;
+ }
+
+ rec_set_info_bits_new(rec, val);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_rec_set_deleted(page_zip, rec, flag);
+ }
+}
+
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(REC_STATUS_NODE_PTR == rec_get_status(rec));
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+ rec_t* rec, /*!< in: physical record */
+ ulint heap_no)/*!< in: the heap number */
+{
+ rec_set_bit_field_2(rec, heap_no, REC_OLD_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ return(rec_get_bit_field_2(rec, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+ rec_t* rec, /*!< in/out: physical record */
+ ulint heap_no)/*!< in: the heap number */
+{
+ rec_set_bit_field_2(rec, heap_no, REC_NEW_HEAP_NO,
+ REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+The following function is used to test whether the data offsets in the record
+are stored in one-byte or two-byte format.
+@return TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+
+ return(rec_get_bit_field_1(rec, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT));
+}
+
+/******************************************************//**
+The following function is used to set the 1-byte offsets flag. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+ rec_t* rec, /*!< in: physical record */
+ ibool flag) /*!< in: TRUE if 1byte form */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+ ut_ad(flag <= TRUE);
+
+ rec_set_bit_field_1(rec, flag, REC_OLD_SHORT, REC_OLD_SHORT_MASK,
+ REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 1-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1)));
+}
+
+/******************************************************//**
+Returns the offset of nth field end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2)));
+}
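
In the old-style format these end offsets are stored backwards, immediately
below the fixed extra bytes: one byte per field in the short form, two bytes
otherwise, with the SQL-null flag (and, in the 2-byte form, the extern flag)
ORed into the stored value. A standalone sketch of the backwards 1-byte
layout, with hypothetical stand-ins for REC_N_OLD_EXTRA_BYTES and
REC_1BYTE_SQL_NULL_MASK.

#include <stdint.h>
#include <stdio.h>

#define EXTRA_BYTES   6         /* stands in for REC_N_OLD_EXTRA_BYTES */
#define SQL_NULL_FLAG 0x80U     /* stands in for REC_1BYTE_SQL_NULL_MASK */

int main(void)
{
        uint8_t  page[64] = {0};
        uint8_t* rec = page + 32;       /* record origin */

        /* Store 1-byte end infos backwards, as rec_1_set_field_end_info()
           does: the nth info lives at rec - (EXTRA_BYTES + n + 1).
           Field 1 is SQL NULL, so its end offset equals field 0's. */
        rec[-(EXTRA_BYTES + 0 + 1)] = 4;
        rec[-(EXTRA_BYTES + 1 + 1)] = 4 | SQL_NULL_FLAG;
        rec[-(EXTRA_BYTES + 2 + 1)] = 10;

        for (int n = 0; n < 3; n++) {
                unsigned info = rec[-(EXTRA_BYTES + n + 1)];

                printf("field %d: end offset %u%s\n", n,
                       info & ~SQL_NULL_FLAG,
                       (info & SQL_NULL_FLAG) ? " (SQL NULL)" : "");
        }
        return 0;
}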
+
+/* Get the base address of offsets. The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields. */
+#define rec_offs_base(offsets) (offsets + REC_OFFS_HEADER_SIZE)
+
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+ const ulint* offsets)/*!< in: array for rec_get_offsets() */
+{
+ ulint n_alloc;
+ ut_ad(offsets);
+ n_alloc = offsets[0];
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ UNIV_MEM_ASSERT_W(offsets, n_alloc * sizeof *offsets);
+ return(n_alloc);
+}
+
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+ ulint* offsets, /*!< out: array for rec_get_offsets(),
+ must be allocated */
+ ulint n_alloc) /*!< in: number of elements */
+{
+ ut_ad(offsets);
+ ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+ UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof *offsets);
+ offsets[0] = n_alloc;
+}
+
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ ut_ad(offsets);
+ n_fields = offsets[1];
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ return(n_fields);
+}
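
The offsets array returned by rec_get_offsets() thus begins with a small
header (allocation size, field count and, in debug builds, bookkeeping
pointers) followed by the extra size and one cumulative end offset per field;
rec_offs_base() simply skips the header. A standalone sketch of that layout,
with a hypothetical header size standing in for REC_OFFS_HEADER_SIZE.

#include <stdio.h>

#define HEADER_SIZE 4   /* assumption: stands in for REC_OFFS_HEADER_SIZE */

/* Mirror of rec_offs_base(): per-field data starts after the header. */
#define offs_base(offsets) ((offsets) + HEADER_SIZE)

int main(void)
{
        /* Hypothetical array for a 3-field record. */
        unsigned long offsets[HEADER_SIZE + 1 + 3] = {0};

        offsets[0] = sizeof(offsets) / sizeof(*offsets);        /* n_alloc */
        offsets[1] = 3;                                         /* n_fields */
        offs_base(offsets)[0] = 6;      /* extra size (header bytes) */
        offs_base(offsets)[1] = 4;      /* end of field 0 */
        offs_base(offsets)[2] = 8;      /* end of field 1 */
        offs_base(offsets)[3] = 20;     /* end of field 2 == data size */

        printf("n_fields=%lu data_size=%lu\n",
               offsets[1], offs_base(offsets)[offsets[1]]);     /* 3, 20 */
        return 0;
}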
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+ const rec_t* rec, /*!< in: record or NULL */
+ const dict_index_t* index, /*!< in: record descriptor or NULL */
+ const ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+{
+ ulint i = rec_offs_n_fields(offsets);
+ ulint last = ULINT_MAX;
+ ulint comp = *rec_offs_base(offsets) & REC_OFFS_COMPACT;
+
+ if (rec) {
+ ut_ad((ulint) rec == offsets[2]);
+ if (!comp) {
+ ut_a(rec_get_n_fields_old(rec) >= i);
+ }
+ }
+ if (index) {
+ ulint max_n_fields;
+ ut_ad((ulint) index == offsets[3]);
+ max_n_fields = ut_max(
+ dict_index_get_n_fields(index),
+ dict_index_get_n_unique_in_tree(index) + 1);
+ if (comp && rec) {
+ switch (rec_get_status(rec)) {
+ case REC_STATUS_ORDINARY:
+ break;
+ case REC_STATUS_NODE_PTR:
+ max_n_fields = dict_index_get_n_unique_in_tree(
+ index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ max_n_fields = 1;
+ break;
+ default:
+ ut_error;
+ }
+ }
+ /* index->n_def == 0 for dummy indexes if !comp */
+ ut_a(!comp || index->n_def);
+ ut_a(!index->n_def || i <= max_n_fields);
+ }
+ while (i--) {
+ ulint curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK;
+ ut_a(curr <= last);
+ last = curr;
+ }
+ return(TRUE);
+}
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+ const rec_t* rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in: array returned by
+ rec_get_offsets() */
+{
+ ut_ad(rec);
+ ut_ad(index);
+ ut_ad(offsets);
+ ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets));
+ offsets[2] = (ulint) rec;
+ offsets[3] = (ulint) index;
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field; UNIV_SQL_NULL
+ if SQL null */
+{
+ ulint offs;
+ ulint length;
+ ut_ad(n < rec_offs_n_fields(offsets));
+ ut_ad(len);
+
+ if (UNIV_UNLIKELY(n == 0)) {
+ offs = 0;
+ } else {
+ offs = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
+ }
+
+ length = rec_offs_base(offsets)[1 + n];
+
+ if (length & REC_OFFS_SQL_NULL) {
+ length = UNIV_SQL_NULL;
+ } else {
+ length &= REC_OFFS_MASK;
+ length -= offs;
+ }
+
+ *len = length;
+ return(offs);
+}
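
In other words, the start of field n is the masked end offset of field n - 1
(or 0 for the first field), and the length is the difference of consecutive
end offsets unless the SQL-null flag is set. A standalone worked example with
hypothetical flag values standing in for REC_OFFS_SQL_NULL and REC_OFFS_MASK.

#include <stdio.h>

#define SQL_NULL_FLAG 0x80000000UL      /* stands in for REC_OFFS_SQL_NULL */
#define OFFS_MASK     0x3FFFFFFFUL      /* stands in for REC_OFFS_MASK */

int main(void)
{
        /* Cumulative end offsets for 3 fields; field 1 is SQL NULL. */
        unsigned long ends[3] = {4, 4 | SQL_NULL_FLAG, 20};

        for (int n = 0; n < 3; n++) {
                unsigned long start = n ? (ends[n - 1] & OFFS_MASK) : 0;
                unsigned long end   = ends[n];

                if (end & SQL_NULL_FLAG) {
                        printf("field %d: start %lu, SQL NULL\n", n, start);
                } else {
                        printf("field %d: start %lu, len %lu\n",
                               n, start, (end & OFFS_MASK) - start);
                }
        }
        return 0;
}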
+
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return(*rec_offs_base(offsets) & REC_OFFS_COMPACT);
+}
+
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ return(UNIV_UNLIKELY(*rec_offs_base(offsets) & REC_OFFS_EXTERNAL));
+}
+
+/******************************************************//**
+Returns nonzero if the extern bit is set in nth field of rec.
+@return nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
+ & REC_OFFS_EXTERNAL));
+}
+
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in nth field of rec.
+@return nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ return(UNIV_UNLIKELY(rec_offs_base(offsets)[1 + n]
+ & REC_OFFS_SQL_NULL));
+}
+
+/******************************************************//**
+Gets the physical size of a field.
+@return length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: nth field */
+{
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ ut_ad(n < rec_offs_n_fields(offsets));
+ if (!n) {
+ return(rec_offs_base(offsets)[1 + n] & REC_OFFS_MASK);
+ }
+ return((rec_offs_base(offsets)[1 + n] - rec_offs_base(offsets)[n])
+ & REC_OFFS_MASK);
+}
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n = 0;
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ if (rec_offs_nth_extern(offsets, i)) {
+ n++;
+ }
+ }
+ }
+
+ return(n);
+}
+
+/******************************************************//**
+Returns the offset of the (n - 1)th field's end if the record is stored in
+the 1-byte offsets form. If the field is SQL null, the flag is ORed in the
+returned value. This function and the 2-byte counterpart are defined here
+because the C compiler was not able to sum negative and positive constant
+offsets, and warned of constant arithmetic overflow.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_1(rec - (REC_N_OLD_EXTRA_BYTES + n)));
+}
+
+/******************************************************//**
+Returns the offset of the (n - 1)th field's end if the record is stored in the 2-byte
+offsets form. If the field is SQL null, the flag is ORed in the returned
+value.
+@return offset of the start of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ return(mach_read_from_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n)));
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+1-byte format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_1(rec - (REC_N_OLD_EXTRA_BYTES + n + 1), info);
+}
+
+/******************************************************//**
+Sets the field end info for the nth field if the record is stored in the
+2-byte format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+ rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: field index */
+ ulint info) /*!< in: value to set */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n < rec_get_n_fields_old(rec));
+
+ mach_write_to_2(rec - (REC_N_OLD_EXTRA_BYTES + 2 * n + 2), info);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 1-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_1_get_prev_field_end_info(rec, n)
+ & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Returns the offset of nth field start if the record is stored in the 2-byte
+offsets form.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(!rec_get_1byte_offs_flag(rec));
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ return(rec_2_get_prev_field_end_info(rec, n)
+ & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+The following function is used to read the offset of the start of a data field
+in the record. The start of an SQL null field is the end offset of the
+previous non-null field, or 0, if none exists. If n is the number of the last
+field + 1, then the end offset of the last field is returned.
+@return offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: field index */
+{
+ ut_ad(rec);
+ ut_ad(n <= rec_get_n_fields_old(rec));
+
+ if (n == 0) {
+
+ return(0);
+ }
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ return(rec_1_get_field_start_offs(rec, n));
+ }
+
+ return(rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Gets the physical size of an old-style field.
+An SQL null field may also have a size > 0
+if its data type is of a fixed size.
+@return field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint os;
+ ulint next_os;
+
+ os = rec_get_field_start_offs(rec, n);
+ next_os = rec_get_field_start_offs(rec, n + 1);
+
+ ut_ad(next_os - os < UNIV_PAGE_SIZE);
+
+ return(next_os - os);
+}
+
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+ rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n, /*!< in: index number of the field */
+ const void* data, /*!< in: pointer to the data
+ if not SQL null */
+ ulint len) /*!< in: length of the data or UNIV_SQL_NULL */
+{
+ byte* data2;
+ ulint len2;
+
+ ut_ad(rec);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) {
+ if (!rec_offs_nth_sql_null(offsets, n)) {
+ ut_a(!rec_offs_comp(offsets));
+ rec_set_nth_field_sql_null(rec, n);
+ }
+
+ return;
+ }
+
+ data2 = rec_get_nth_field(rec, offsets, n, &len2);
+ if (len2 == UNIV_SQL_NULL) {
+ ut_ad(!rec_offs_comp(offsets));
+ rec_set_nth_field_null_bit(rec, n, FALSE);
+ ut_ad(len == rec_get_nth_field_size(rec, n));
+ } else {
+ ut_ad(len2 == len);
+ }
+
+ ut_memcpy(data2, data, len);
+}
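
A hedged usage sketch of rec_set_nth_field(), assuming the rec_get_offsets()
and rec_offs_init() interface declared in rem0rec.h and an existing rec/index
pair whose field 2 happens to be 4 bytes long; this is a fragment, not a
standalone program, and the field number and length are purely illustrative.

	mem_heap_t*	heap = NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets = offsets_;
	byte		new_val[4] = {0, 0, 0, 42};

	rec_offs_init(offsets_);
	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);

	/* Overwrite field 2 in place; the new value must have exactly
	the physical size of the old one. */
	rec_set_nth_field(rec, offsets, 2, new_val, sizeof new_val);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}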
+
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ ut_ad(rec);
+
+ return(rec_get_field_start_offs(rec, rec_get_n_fields_old(rec)));
+}
+
+/**********************************************************//**
+The following function sets the number of fields in offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+ ulint* offsets, /*!< in/out: array returned by
+ rec_get_offsets() */
+ ulint n_fields) /*!< in: number of fields */
+{
+ ut_ad(offsets);
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= REC_MAX_N_FIELDS);
+ ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+ <= rec_offs_get_n_alloc(offsets));
+ offsets[1] = n_fields;
+}
+
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = rec_offs_base(offsets)[rec_offs_n_fields(offsets)]
+ & REC_OFFS_MASK;
+ ut_ad(size < UNIV_PAGE_SIZE);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of record minus data size of record. The value
+returned by the function is the distance from record start to record origin
+in bytes.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint size;
+ ut_ad(rec_offs_validate(NULL, NULL, offsets));
+ size = *rec_offs_base(offsets) & ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL);
+ ut_ad(size < UNIV_PAGE_SIZE);
+ return(size);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(rec + rec_offs_data_size(offsets));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+ rec_t* rec, /*!< in: pointer to record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(rec - rec_offs_extra_size(offsets));
+}
+
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+ void* buf, /*!< in: buffer */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint extra_len;
+ ulint data_len;
+
+ ut_ad(rec && buf);
+ ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+
+ extra_len = rec_offs_extra_size(offsets);
+ data_len = rec_offs_data_size(offsets);
+
+ ut_memcpy(buf, rec - extra_len, extra_len + data_len);
+
+ return((byte*)buf + extra_len);
+}
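
Note that rec_copy() returns the origin of the copy, not the start of the
buffer. A hedged usage sketch, assuming an InnoDB context in which rec,
offsets and a mem_heap_t* heap are already available; not a standalone
program.

	ulint	size = rec_offs_size(offsets);	/* extra bytes + data bytes */
	byte*	buf  = mem_heap_alloc(heap, size);
	rec_t*	copy = rec_copy(buf, rec, offsets);

	/* The returned pointer is the record origin of the copy, i.e. buf
	advanced by the extra size, so the usual rec_* accessors can be
	applied to it directly. */
	ut_ad(copy == buf + rec_offs_extra_size(offsets));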
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+ ulint data_size, /*!< in: data size */
+ ulint n_fields, /*!< in: number of fields */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ return(REC_N_OLD_EXTRA_BYTES + n_fields);
+ }
+
+ return(REC_N_OLD_EXTRA_BYTES + 2 * n_fields);
+}
+
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+ dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ ulint data_size;
+ ulint extra_size;
+
+ ut_ad(index);
+ ut_ad(dtuple);
+ ut_ad(dtuple_check_typed(dtuple));
+
+ ut_ad(index->type & DICT_UNIVERSAL
+ || dtuple_get_n_fields(dtuple)
+ == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+ == REC_STATUS_NODE_PTR)
+ ? dict_index_get_n_unique_in_tree(index) + 1
+ : dict_index_get_n_fields(index)));
+
+ if (dict_table_is_comp(index->table)) {
+ return(rec_get_converted_size_comp(index,
+ dtuple_get_info_bits(dtuple)
+ & REC_NEW_STATUS_MASK,
+ dtuple->fields,
+ dtuple->n_fields, NULL));
+ }
+
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ extra_size = rec_get_converted_extra_size(
+ data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+ return(data_size + extra_size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Folds a prefix of a physical record to a ulint. Folds only existing fields,
+that is, checks that we do not run out of the record.
+@return the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+ const rec_t* rec, /*!< in: the physical record */
+ const ulint* offsets, /*!< in: array returned by
+ rec_get_offsets() */
+ ulint n_fields, /*!< in: number of complete
+ fields to fold */
+ ulint n_bytes, /*!< in: number of bytes to fold
+ in an incomplete last field */
+ dulint tree_id) /*!< in: index tree id */
+{
+ ulint i;
+ const byte* data;
+ ulint len;
+ ulint fold;
+ ulint n_fields_rec;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(n_fields + n_bytes > 0);
+
+ n_fields_rec = rec_offs_n_fields(offsets);
+ ut_ad(n_fields <= n_fields_rec);
+ ut_ad(n_fields < n_fields_rec || n_bytes == 0);
+
+ if (n_fields > n_fields_rec) {
+ n_fields = n_fields_rec;
+ }
+
+ if (n_fields == n_fields_rec) {
+ n_bytes = 0;
+ }
+
+ fold = ut_fold_dulint(tree_id);
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ if (n_bytes > 0) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len > n_bytes) {
+ len = n_bytes;
+ }
+
+ fold = ut_fold_ulint_pair(fold,
+ ut_fold_binary(data, len));
+ }
+ }
+
+ return(fold);
+}
+#endif /* !UNIV_HOTBACKUP */
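
A hedged usage sketch of rec_fold(), assuming valid rec, offsets and index
values; in real use the n_fields and n_bytes arguments come from the adaptive
hash index settings rather than being chosen freely, and index->id is assumed
to serve as the tree id.

	ulint	fold = rec_fold(rec, offsets,
				dict_index_get_n_unique(index),	/* fields */
				0,		/* no partial last field */
				index->id);	/* index tree id */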
diff --git a/storage/xtradb/include/rem0types.h b/storage/xtradb/include/rem0types.h
new file mode 100644
index 00000000000..8b84d4af233
--- /dev/null
+++ b/storage/xtradb/include/rem0types.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte rec_t;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS (1024 - 1)
+#define REC_MAX_HEAP_NO (2 * 8192 - 1)
+#define REC_MAX_N_OWNED (16 - 1)
+
+/* REC_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed column length (or indexed prefix length). It is set to 3*256,
+so that one can create a column prefix index on 256 characters of a
+TEXT or VARCHAR column also in the UTF-8 charset. In that charset,
+a character may take at most 3 bytes.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_MAX_INDEX_COL_LEN 768
+
+#endif
diff --git a/storage/xtradb/include/row0ext.h b/storage/xtradb/include/row0ext.h
new file mode 100644
index 00000000000..43d82d644e6
--- /dev/null
+++ b/storage/xtradb/include/row0ext.h
@@ -0,0 +1,95 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "univ.i"
+#include "row0types.h"
+#include "data0types.h"
+#include "mem0mem.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+ mem_heap_t* heap); /*!< in: heap where created */
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len); /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+
+/** Prefixes of externally stored columns */
+struct row_ext_struct{
+ ulint n_ext; /*!< number of externally stored columns */
+ const ulint* ext; /*!< col_no's of externally stored columns */
+ byte* buf; /*!< backing store of the column prefix cache */
+ ulint len[1]; /*!< prefix lengths; 0 if not cached */
+};
+
+#ifndef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0ext.ic b/storage/xtradb/include/row0ext.ic
new file mode 100644
index 00000000000..82771a9312a
--- /dev/null
+++ b/storage/xtradb/include/row0ext.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.ic
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+ const row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+{
+ ut_ad(ext);
+ ut_ad(len);
+ ut_ad(i < ext->n_ext);
+
+ *len = ext->len[i];
+
+ if (UNIV_UNLIKELY(*len == 0)) {
+ /* The BLOB could not be fetched to the cache. */
+ return(field_ref_zero);
+ } else {
+ return(ext->buf + i * REC_MAX_INDEX_COL_LEN);
+ }
+}
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+ const row_ext_t* ext, /*!< in: column prefix cache */
+ ulint col, /*!< in: column number in the InnoDB
+ table object, as reported by
+ dict_col_get_no(); NOT relative to the
+ records in the clustered index */
+ ulint* len) /*!< out: length of prefix, in bytes,
+ at most REC_MAX_INDEX_COL_LEN */
+{
+ ulint i;
+
+ ut_ad(ext);
+ ut_ad(len);
+
+ for (i = 0; i < ext->n_ext; i++) {
+ if (col == ext->ext[i]) {
+ return(row_ext_lookup_ith(ext, i, len));
+ }
+ }
+
+ return(NULL);
+}
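
A hedged usage sketch of the lookup, assuming a populated row_ext_t (for
example one built by row_ext_create()) and a column number col_no taken from
the table object; not a standalone program.

	ulint		len;
	const byte*	prefix = row_ext_lookup(ext, col_no, &len);

	if (prefix == NULL) {
		/* The column is not stored externally. */
	} else if (prefix == field_ref_zero) {
		/* The BLOB pointer is unset; len is 0. */
	} else {
		/* Up to REC_MAX_INDEX_COL_LEN bytes of the column prefix
		are cached at prefix[0 .. len - 1]. */
	}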
diff --git a/storage/xtradb/include/row0ins.h b/storage/xtradb/include/row0ins.h
new file mode 100644
index 00000000000..9f93565ddb7
--- /dev/null
+++ b/storage/xtradb/include/row0ins.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+	ibool		check_ref,/*!< in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr); /*!< in: query thread */
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+ ulint ins_type, /*!< in: INS_VALUES, ... */
+ dict_table_t* table, /*!< in: table where to insert */
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row); /*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Inserts an index entry into an index. Tries first an optimistic, then a
+pessimistic descent down the tree. If the entry sufficiently matches a
+delete-marked record, performs the insert by updating or delete-unmarking
+that record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+ulint
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ ibool foreign,/*!< in: TRUE=check foreign key constraints */
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Inserts a row into a table. This is a high-level function used in
+SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/***********************************************************//**
+Creates an entry template for each index of a table. */
+UNIV_INTERN
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node); /*!< in: row insert node */
+
+/* Insert node structure */
+
+struct ins_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_INSERT */
+ ulint ins_type;/* INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+ dtuple_t* row; /*!< row to insert */
+ dict_table_t* table; /*!< table where to insert */
+ sel_node_t* select; /*!< select in searched insert */
+ que_node_t* values_list;/* list of expressions to evaluate and
+ insert in an INS_VALUES insert */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index where the index
+ entry should be inserted */
+ dtuple_t* entry; /*!< NULL, or entry to insert in the index;
+ after a successful insert of the entry,
+ this should be reset to NULL */
+ UT_LIST_BASE_NODE_T(dtuple_t)
+ entry_list;/* list of entries, one for each index */
+ byte* row_id_buf;/* buffer for the row id sys field in row */
+ trx_id_t trx_id; /*!< trx id or the last trx which executed the
+ node */
+ byte* trx_id_buf;/* buffer for the trx id sys field in row */
+ mem_heap_t* entry_sys_heap;
+ /* memory heap used as auxiliary storage;
+ entry_list and sys fields are stored here;
+ if this is NULL, entry list should be created
+ and buffers for sys fields in row allocated */
+ ulint magic_n;
+};
+
+#define INS_NODE_MAGIC_N 15849075
+
+/* Insert node types */
+#define INS_SEARCHED 0 /* INSERT INTO ... SELECT ... */
+#define INS_VALUES 1 /* INSERT INTO ... VALUES ... */
+#define INS_DIRECT 2 /* this is for internal use in dict0crea:
+ insert the row directly */
+
+/* Node execution states */
+#define INS_NODE_SET_IX_LOCK 1 /* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID 2 /* row id should be allocated */
+#define INS_NODE_INSERT_ENTRIES 3 /* index entries should be built and
+ inserted */
+
+#ifndef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#endif
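
A hedged sketch of the call sequence suggested by these declarations, assuming
a dict_table_t, a separately constructed dtuple_t row and a mem_heap_t; the
query-graph and transaction plumbing that a real caller needs is omitted.

	ins_node_t*	node = ins_node_create(INS_DIRECT, table, heap);

	/* Install the separately built row; per the comments above this is
	the rare, slow INS_DIRECT path. The entry list and the buffers for
	the system fields are allocated later, while entry_sys_heap is
	still NULL. */
	ins_node_set_new_row(node, row);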
diff --git a/storage/xtradb/include/row0ins.ic b/storage/xtradb/include/row0ins.ic
new file mode 100644
index 00000000000..84f6da255bf
--- /dev/null
+++ b/storage/xtradb/include/row0ins.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.ic
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/xtradb/include/row0merge.h b/storage/xtradb/include/row0merge.h
new file mode 100644
index 00000000000..62a5efd11f7
--- /dev/null
+++ b/storage/xtradb/include/row0merge.h
@@ -0,0 +1,197 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#ifndef row0merge_h
+#define row0merge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "read0types.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+
+/** Index field definition */
+struct merge_index_field_struct {
+ ulint prefix_len; /*!< column prefix length, or 0
+ if indexing the whole column */
+ const char* field_name; /*!< field name */
+};
+
+/** Index field definition */
+typedef struct merge_index_field_struct merge_index_field_t;
+
+/** Definition of an index being created */
+struct merge_index_def_struct {
+ const char* name; /*!< index name */
+ ulint ind_type; /*!< 0, DICT_UNIQUE,
+ or DICT_CLUSTERED */
+ ulint n_fields; /*!< number of fields
+ in index */
+ merge_index_field_t* fields; /*!< field definitions */
+};
+
+/** Definition of an index being created */
+typedef struct merge_index_def_struct merge_index_def_t;
+
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode); /*!< in: LOCK_X or LOCK_S */
+/*********************************************************************//**
+Drop an index from the InnoDB system tables. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_index(
+/*=================*/
+ dict_index_t* index, /*!< in: index to be removed */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred when
+building an index. The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table containing the indexes */
+ dict_index_t** index, /*!< in: indexes to drop */
+ ulint num_created); /*!< in: number of elements in index[] */
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void);
+/*=============================*/
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_rename_tables(
+/*====================*/
+ dict_table_t* old_table, /*!< in/out: old table, renamed to
+ tmp_name */
+ dict_table_t* new_table, /*!< in/out: new table, renamed to
+ old_table->name */
+ const char* tmp_name, /*!< in: new name for old_table */
+ trx_t* trx); /*!< in: transaction handle */
+
+/*********************************************************************//**
+Create a temporary table for creating a primary key, using the definition
+of an existing table.
+@return table, or NULL on error */
+UNIV_INTERN
+dict_table_t*
+row_merge_create_temporary_table(
+/*=============================*/
+ const char* table_name, /*!< in: new table name */
+ const merge_index_def_t*index_def, /*!< in: the index definition
+ of the primary key */
+ const dict_table_t* table, /*!< in: old table definition */
+ trx_t* trx); /*!< in/out: transaction
+ (sets error_state) */
+/*********************************************************************//**
+Rename the temporary indexes in the dictionary to permanent ones. The
+data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+row_merge_rename_indexes(
+/*=====================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table); /*!< in/out: table with new indexes */
+/*********************************************************************//**
+Create the index and load in to the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+ trx_t* trx, /*!< in/out: trx (sets error_state) */
+ dict_table_t* table, /*!< in: the index is on this table */
+ const merge_index_def_t*index_def);
+ /*!< in: the index definition */
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return TRUE if index can be used by the transaction else FALSE */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index); /*!< in: index to check */
+/*********************************************************************//**
+If there are views that refer to the old table name, then we "attach" to
+the new instance of the table; otherwise, we drop it immediately.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table); /*!< in: table instance to drop */
+
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting sorted index entries to indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_build_indexes(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* old_table, /*!< in: table where rows are
+ read from */
+ dict_table_t* new_table, /*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ dict_index_t** indexes, /*!< in: indexes to be created */
+ ulint n_indexes, /*!< in: size of indexes[] */
+ TABLE* table); /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+#endif /* row0merge.h */
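The declarations above make up the merge-sort index build API. The outline below is an illustrative sketch only (not taken from the server sources) of how they could be combined to add secondary indexes: lock the table, register each index in the dictionary, build them all from the clustered index with the merge sort, then make the temporary index names permanent. It assumes the surrounding InnoDB build environment, a caller-prepared trx, index_defs[] and MySQL TABLE handle, and it omits the dictionary locking and fuller error handling a real caller needs.

    #include "row0merge.h"
    #include "db0err.h"

    /* Illustrative sketch: build n new secondary indexes on "table".
    Assumes n is small and that the caller did all required setup. */
    static ulint
    add_indexes_sketch(
        trx_t*                   trx,
        dict_table_t*            table,
        const merge_index_def_t* index_defs,
        ulint                    n,
        TABLE*                   mysql_table)
    {
        dict_index_t*  indexes[8];  /* assume n <= 8 for the sketch */
        ulint          err;
        ulint          i;

        /* 1. Lock the table exclusively for the duration of the build. */
        err = row_merge_lock_table(trx, table, LOCK_X);
        if (err != DB_SUCCESS) {
            return(err);
        }

        /* 2. Register each new index in the data dictionary. */
        for (i = 0; i < n; i++) {
            indexes[i] = row_merge_create_index(trx, table, &index_defs[i]);
            if (!indexes[i]) {
                row_merge_drop_indexes(trx, table, indexes, i);
                return(DB_ERROR);
            }
        }

        /* 3. Read the clustered index, merge sort the entries and insert
        them into the new indexes; old and new table are the same here
        because no new PRIMARY KEY is being built. */
        err = row_merge_build_indexes(trx, table, table, indexes, n,
                                      mysql_table);
        if (err != DB_SUCCESS) {
            row_merge_drop_indexes(trx, table, indexes, n);
            return(err);
        }

        /* 4. Rename the temporary indexes to their final names. */
        return(row_merge_rename_indexes(trx, table));
    }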
diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h
new file mode 100644
index 00000000000..9090e476bfd
--- /dev/null
+++ b/storage/xtradb/include/row0mysql.h
@@ -0,0 +1,807 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+
+extern ibool row_rollback_on_timeout;
+
+typedef struct row_prebuilt_struct row_prebuilt_t;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen);/*!< in: storage length of len: either 1
+ or 2 bytes */
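The two declarations above handle the length prefix of a >= 5.0.3 true VARCHAR in the MySQL row format: the length occupies 1 or 2 bytes immediately before the data. The standalone sketch below illustrates that layout, assuming low-byte-first order for the 2-byte case; it demonstrates the format only and is not the InnoDB implementation:

    #include <stdio.h>
    #include <string.h>

    typedef unsigned char byte;

    /* Store "len" in 1 or 2 bytes at dest (low byte first in the 2-byte
    case) and return a pointer just past the length bytes, where the
    VARCHAR payload begins. */
    static byte*
    store_true_var_len(byte* dest, unsigned len, unsigned lenlen)
    {
        if (lenlen == 2) {
            dest[0] = (byte) (len & 0xFF);
            dest[1] = (byte) (len >> 8);
        } else {
            dest[0] = (byte) len;   /* lenlen == 1 */
        }
        return(dest + lenlen);
    }

    /* Inverse: read the length and return a pointer to the payload. */
    static const byte*
    read_true_varchar(unsigned* len, const byte* field, unsigned lenlen)
    {
        if (lenlen == 2) {
            *len = field[0] | ((unsigned) field[1] << 8);
        } else {
            *len = field[0];
        }
        return(field + lenlen);
    }

    int main(void)
    {
        byte        row[64];
        unsigned    len;

        byte* payload = store_true_var_len(row, 11, 2);
        memcpy(payload, "hello world", 11);

        const byte* data = read_true_varchar(&len, row, 2);
        printf("len=%u data=%.*s\n", len, (int) len, data);
        return 0;
    }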
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len); /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len); /*!< in: BLOB reference length
+ (not BLOB length) */
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c.
+@return up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp); /*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running the
+query thread */
+UNIV_INTERN
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ ulint* new_err,/*!< out: possible new error encountered in
+ rollback, or the old error which was
+ during the function entry */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_savept_t* savept);/*!< in: savepoint */
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table); /*!< in: Innobase table handle */
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked); /*!< in: TRUE=data dictionary locked */
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in the MySQL
+ table handle */
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /*!< in: table to lock, or NULL
+ if prebuilt->table should be
+ locked as
+ prebuilt->select_lock_type */
+ ulint mode); /*!< in: lock mode of table
+ (ignored if table==NULL) */
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: row in the MySQL format */
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ const dict_table_t* table); /*!< in: table */
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_update_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt); /*!< in: prebuilt struct in MySQL
+ handle */
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
+session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_unlock_for_mysql(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL
+ handle */
+ ibool has_latches_on_recs);/*!< in: TRUE if called
+ so that we have the latches on
+ the records under pcur and
+ clust_pcur, and we do not need
+ to reposition the cursors. */
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap); /*!< in: mem heap from which allocated */
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table); /*!< in: table where we do the operation */
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line); /*!< in: line number */
+#define row_mysql_lock_data_dictionary(trx) \
+ row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx); /*!< in/out: transaction */
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line); /*!< in: line number */
+#define row_mysql_freeze_data_dictionary(trx) \
+ row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx); /*!< in/out: transaction */
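row_mysql_lock_data_dictionary() and row_mysql_freeze_data_dictionary() above are macros that forward the caller's __FILE__ and __LINE__ to the *_func variants, so lock diagnostics can report where a dictionary latch was taken. A minimal standalone illustration of the same wrapper pattern (made-up names, not InnoDB code):

    #include <stdio.h>

    /* The _func variant receives the call site recorded by the macro. */
    static void
    dict_lock_func(const char* file, unsigned line)
    {
        printf("data dictionary locked at %s:%u\n", file, line);
    }

    /* Callers use the short name; the macro injects the call site. */
    #define dict_lock() dict_lock_func(__FILE__, __LINE__)

    int main(void)
    {
        dict_lock();   /* prints this file name and line number */
        return 0;
    }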
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate().
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed) */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+@return error number or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths); /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+/*********************************************************************//**
+Inserts statistics for an index. */
+UNIV_INTERN
+int
+row_insert_stats_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks); /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+
+/*********************************************************************//**
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void);
+/*=========================================*/
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void);
+/*======================================*/
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+ dict_table_t* table, /*!< in: table handle */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Drops a table for MySQL. If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread. If the data dictionary was not already locked
+by the transaction, the transaction will be committed. Otherwise, the
+data dictionary will remain locked.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool drop_db);/*!< in: TRUE=dropping whole database */
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void);
+/*============================*/
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx); /*!< in: transaction handle */
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+ const char* name, /*!< in: database name which ends in '/' */
+ trx_t* trx); /*!< in: transaction handle */
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool commit); /*!< in: if TRUE then commit trx */
+/*********************************************************************//**
+Checks that the index contains entries in ascending order, that the unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return DB_SUCCESS if ok */
+UNIV_INTERN
+ulint
+row_check_index_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows); /*!< out: number of entries
+ seen in the consistent read */
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return TRUE if monitor table */
+UNIV_INTERN
+ibool
+row_is_magic_monitor_table(
+/*=======================*/
+ const char* table_name); /*!< in: name of the table, in the
+ form database/table_name */
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+typedef struct mysql_row_templ_struct mysql_row_templ_t;
+struct mysql_row_templ_struct {
+ ulint col_no; /*!< column number of the column */
+ ulint rec_field_no; /*!< field number of the column in an
+ Innobase record in the current index;
+ not defined if template_type is
+ ROW_MYSQL_WHOLE_ROW */
+ ulint mysql_col_offset; /*!< offset of the column in the MySQL
+ row format */
+ ulint mysql_col_len; /*!< length of the column in the MySQL
+ row format */
+ ulint mysql_null_byte_offset; /*!< MySQL NULL bit byte offset in a
+ MySQL record */
+ ulint mysql_null_bit_mask; /*!< bit mask to get the NULL bit,
+ zero if column cannot be NULL */
+ ulint type; /*!< column type in Innobase mtype
+ numbers DATA_CHAR... */
+ ulint mysql_type; /*!< MySQL type code; this is always
+ < 256 */
+ ulint mysql_length_bytes; /*!< if mysql_type
+ == DATA_MYSQL_TRUE_VARCHAR, this tells
+ whether we should use 1 or 2 bytes to
+ store the MySQL true VARCHAR data
+ length at the start of row in the MySQL
+ format (NOTE that the MySQL key value
+ format always uses 2 bytes for the data
+ len) */
+ ulint charset; /*!< MySQL charset-collation code
+ of the column, or zero */
+ ulint mbminlen; /*!< minimum length of a char, in bytes,
+ or zero if not a char type */
+ ulint mbmaxlen; /*!< maximum length of a char, in bytes,
+ or zero if not a char type */
+ ulint is_unsigned; /*!< if a column type is an integer
+ type and this field is != 0, then
+ it is an unsigned integer type */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE 8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD 4
+
+#define ROW_PREBUILT_ALLOCATED 78540783
+#define ROW_PREBUILT_FREED 26423527
+
+typedef int (*index_cond_func_t)(void *param);
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+struct row_prebuilt_struct {
+ ulint magic_n; /*!< this magic number is set to
+ ROW_PREBUILT_ALLOCATED when created,
+ or ROW_PREBUILT_FREED when the
+ struct has been freed */
+ dict_table_t* table; /*!< Innobase table handle */
+ dict_index_t* index; /*!< current index for a search, if
+ any */
+ trx_t* trx; /*!< current transaction handle */
+ unsigned sql_stat_start:1;/*!< TRUE when we start processing of
+ an SQL statement: we may have to set
+ an intention lock on the table,
+ create a consistent read view etc. */
+ unsigned mysql_has_locked:1;/*!< this is set TRUE when MySQL
+ calls external_lock on this handle
+ with a lock flag, and set FALSE when
+ with the F_UNLOCK flag */
+ unsigned clust_index_was_generated:1;
+ /*!< if the user did not define a
+ primary key in MySQL, then Innobase
+ automatically generated a clustered
+ index where the ordering column is
+ the row id: in this case this flag
+ is set to TRUE */
+ unsigned index_usable:1; /*!< caches the value of
+ row_merge_is_index_usable(trx,index) */
+ unsigned read_just_key:1;/*!< set to 1 when MySQL calls
+ ha_innobase::extra with the
+ argument HA_EXTRA_KEYREAD; it is enough
+ to read just columns defined in
+ the index (i.e., no read of the
+ clustered index record necessary) */
+ unsigned used_in_HANDLER:1;/*!< TRUE if we have been using this
+ handle in a MySQL HANDLER low level
+ index cursor command: then we must
+ store the pcur position even in a
+ unique search from a clustered index,
+ because HANDLER allows NEXT and PREV
+ in such a situation */
+ unsigned template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+ ROW_MYSQL_REC_FIELDS,
+ ROW_MYSQL_DUMMY_TEMPLATE, or
+ ROW_MYSQL_NO_TEMPLATE */
+ unsigned n_template:10; /*!< number of elements in the
+ template */
+ unsigned null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+ bitmap at the start of a row in the
+ MySQL format */
+ unsigned need_to_access_clustered:1; /*!< if we are fetching
+ columns through a secondary index
+ and at least one column is not in
+ the secondary index, then this is
+ set to TRUE */
+ unsigned templ_contains_blob:1;/*!< TRUE if the template contains
+ a column with DATA_BLOB ==
+ get_innobase_type_from_mysql_type();
+ not to be confused with InnoDB
+ externally stored columns
+ (VARCHAR can be off-page too) */
+ mysql_row_templ_t* mysql_template;/*!< template used to transform
+ rows fast between MySQL and Innobase
+ formats; memory for this template
+ is not allocated from 'heap' */
+ mem_heap_t* heap; /*!< memory heap from which
+ these auxiliary structures are
+ allocated when needed */
+ ins_node_t* ins_node; /*!< Innobase SQL insert node
+ used to perform inserts
+ to the table */
+ byte* ins_upd_rec_buff;/*!< buffer for storing data converted
+ to the Innobase format from the MySQL
+ format */
+ const byte* default_rec; /*!< the default values of all columns
+ (a "default row") in MySQL format */
+ ulint hint_need_to_fetch_extra_cols;
+ /*!< normally this is set to 0; if this
+ is set to ROW_RETRIEVE_PRIMARY_KEY,
+ then we should at least retrieve all
+ columns in the primary key; if this
+ is set to ROW_RETRIEVE_ALL_COLS, then
+ we must retrieve all columns in the
+ key (if read_just_key == 1), or all
+ columns in the table */
+ upd_node_t* upd_node; /*!< Innobase SQL update node used
+ to perform updates and deletes */
+ que_fork_t* ins_graph; /*!< Innobase SQL query graph used
+ in inserts */
+ que_fork_t* upd_graph; /*!< Innobase SQL query graph used
+ in updates or deletes */
+ btr_pcur_t* pcur; /*!< persistent cursor used in selects
+ and updates */
+ btr_pcur_t* clust_pcur; /*!< persistent cursor used in
+ some selects and updates */
+ que_fork_t* sel_graph; /*!< dummy query graph used in
+ selects */
+ dtuple_t* search_tuple; /*!< prebuilt dtuple used in selects */
+ byte row_id[DATA_ROW_ID_LEN];
+ /*!< if the clustered index was
+ generated, the row id of the
+ last row fetched is stored
+ here */
+ dtuple_t* clust_ref; /*!< prebuilt dtuple used in
+ sel/upd/del */
+ ulint select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+ ulint stored_select_lock_type;/*!< this field is used to
+ remember the original select_lock_type
+ that was decided in ha_innodb.cc,
+ ::store_lock(), ::external_lock(),
+ etc. */
+ ulint row_read_type; /*!< ROW_READ_WITH_LOCKS if row locks
+ should be obtained for records
+ under an UPDATE or DELETE cursor.
+ If innodb_locks_unsafe_for_binlog
+ is TRUE, this can be set to
+ ROW_READ_TRY_SEMI_CONSISTENT, so that
+ if the row under an UPDATE or DELETE
+ cursor was locked by another
+ transaction, InnoDB will resort
+ to reading the last committed value
+ ('semi-consistent read'). Then,
+ this field will be set to
+ ROW_READ_DID_SEMI_CONSISTENT to
+ indicate that. If the row does not
+ match the WHERE condition, MySQL will
+ invoke handler::unlock_row() to
+ clear the flag back to
+ ROW_READ_TRY_SEMI_CONSISTENT and
+ to simply skip the row. If
+ the row matches, the next call to
+ row_search_for_mysql() will lock
+ the row.
+ This eliminates lock waits in some
+ cases; note that this breaks
+ serializability. */
+ ulint new_rec_locks; /*!< normally 0; if
+ srv_locks_unsafe_for_binlog is
+ TRUE or session is using READ
+ COMMITTED or READ UNCOMMITTED
+ isolation level, set in
+ row_search_for_mysql() if we set a new
+ record lock on the secondary
+ or clustered index; this is
+ used in row_unlock_for_mysql()
+ when releasing the lock under
+ the cursor if we determine
+ after retrieving the row that
+ it does not need to be locked
+ ('mini-rollback') */
+ ulint mysql_prefix_len;/*!< byte offset of the end of
+ the last requested column */
+ ulint mysql_row_len; /*!< length in bytes of a row in the
+ MySQL format */
+ ulint n_rows_fetched; /*!< number of rows fetched after
+ positioning the current cursor */
+ ulint fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+ byte* fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+ /*!< a cache for fetched rows if we
+ fetch many rows from the same cursor:
+ it saves CPU time to fetch them in a
+ batch; we reserve mysql_row_len
+ bytes for each such row; these
+ pointers point 4 bytes past the
+ allocated mem buf start, because
+ there is a 4 byte magic number at the
+ start and at the end */
+ ibool keep_other_fields_on_keyread; /*!< when using fetch
+ cache with HA_EXTRA_KEYREAD, don't
+ overwrite other fields in the MySQL
+ row buffer.*/
+ ulint fetch_cache_first;/*!< position of the first not yet
+ fetched row in fetch_cache */
+ ulint n_fetch_cached; /*!< number of not yet fetched rows
+ in fetch_cache */
+ mem_heap_t* blob_heap; /*!< in SELECTS BLOB fields are copied
+ to this heap */
+ mem_heap_t* old_vers_heap; /*!< memory heap where a previous
+ version is built in consistent read */
+ /*----------------------*/
+ ulonglong autoinc_last_value;
+ /*!< last value of AUTO-INC interval */
+ ulonglong autoinc_increment;/*!< The increment step of the auto
+ increment column. Value must be
+ greater than or equal to 1. Required to
+ calculate the next value */
+ ulonglong autoinc_offset; /*!< The offset passed to
+ get_auto_increment() by MySQL. Required
+ to calculate the next value */
+ ulint autoinc_error; /*!< The actual error code encountered
+ while trying to init or read the
+ autoinc value from the table. We
+ store it here so that we can return
+ it to MySQL */
+ /*----------------------*/
+ ulint magic_n2; /*!< this should be the same as
+ magic_n */
+ /*----------------------*/
+ index_cond_func_t idx_cond_func;/* Index Condition Pushdown function,
+ or NULL if there is none set */
+ void* idx_cond_func_arg;/* ICP function argument */
+ ulint n_index_fields; /* Number of fields at the start of
+ mysql_template. Valid only when using
+ ICP. */
+ /*----------------------*/
+};
+
+#define ROW_PREBUILT_FETCH_MAGIC_N 465765687
+
+#define ROW_MYSQL_WHOLE_ROW 0
+#define ROW_MYSQL_REC_FIELDS 1
+#define ROW_MYSQL_NO_TEMPLATE 2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3 /* dummy template used in
+ row_scan_and_check_index */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY 1
+#define ROW_RETRIEVE_ALL_COLS 2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS 0
+#define ROW_READ_TRY_SEMI_CONSISTENT 1
+#define ROW_READ_DID_SEMI_CONSISTENT 2
+
+#ifndef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#endif
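The magic_n / magic_n2 fields of row_prebuilt_struct, together with the ROW_PREBUILT_ALLOCATED and ROW_PREBUILT_FREED constants above, follow a common defensive pattern: known magic values at both ends of the struct let cheap assertions catch overruns, stale pointers and double frees. A standalone sketch of the idea (not the InnoDB code; only the two magic constants are taken from this header):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PREBUILT_ALLOCATED  78540783
    #define PREBUILT_FREED      26423527

    /* Magic values at both ends guard against overruns and use after free. */
    struct prebuilt_sketch {
        unsigned long  magic_n;     /* PREBUILT_ALLOCATED while live */
        int            some_state;  /* payload fields would go here */
        unsigned long  magic_n2;    /* must always equal magic_n */
    };

    static struct prebuilt_sketch*
    prebuilt_create(void)
    {
        struct prebuilt_sketch* p = malloc(sizeof(*p));
        if (!p) {
            abort();
        }
        p->magic_n    = PREBUILT_ALLOCATED;
        p->some_state = 0;
        p->magic_n2   = PREBUILT_ALLOCATED;
        return(p);
    }

    static void
    prebuilt_check(const struct prebuilt_sketch* p)
    {
        /* A mismatch here means corruption or use after free. */
        assert(p->magic_n == PREBUILT_ALLOCATED);
        assert(p->magic_n == p->magic_n2);
    }

    static void
    prebuilt_free(struct prebuilt_sketch* p)
    {
        prebuilt_check(p);
        p->magic_n  = PREBUILT_FREED;   /* poison before freeing */
        p->magic_n2 = PREBUILT_FREED;
        free(p);
    }

    int main(void)
    {
        struct prebuilt_sketch* p = prebuilt_create();
        prebuilt_check(p);
        printf("prebuilt sketch passes its magic checks\n");
        prebuilt_free(p);
        return 0;
    }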
diff --git a/storage/xtradb/include/row0mysql.ic b/storage/xtradb/include/row0mysql.ic
new file mode 100644
index 00000000000..35033aa2ad1
--- /dev/null
+++ b/storage/xtradb/include/row0mysql.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.ic
+MySQL interface for Innobase
+
+Created 1/23/2001 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0purge.h b/storage/xtradb/include/row0purge.h
new file mode 100644
index 00000000000..89ec54fb54a
--- /dev/null
+++ b/storage/xtradb/include/row0purge.h
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0purge_h
+#define row0purge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a purge node to a query graph.
+@return own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* Purge node structure */
+
+struct purge_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_PURGE */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ roll_ptr_t roll_ptr;/* roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/* undo log record */
+ trx_undo_inf_t* reservation;/* reservation for the undo log record in
+ the purge array */
+ undo_no_t undo_no;/* undo number of the record */
+ ulint rec_type;/* undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ ibool found_clust;/* TRUE if the clustered index record
+ determined by ref was found in the clustered
+ index, and we were able to position pcur on
+ it */
+ dict_table_t* table; /*!< table where purge is done */
+ ulint cmpl_info;/* compiler analysis info of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< NULL, or row reference to the next row to
+ handle */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the indexed fields of the row to
+ handle */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after a successful
+ purge of a row */
+};
+
+#ifndef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0purge.ic b/storage/xtradb/include/row0purge.ic
new file mode 100644
index 00000000000..23d7d3845a4
--- /dev/null
+++ b/storage/xtradb/include/row0purge.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+
+/**************************************************//**
+@file include/row0purge.ic
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h
new file mode 100644
index 00000000000..723b7b53395
--- /dev/null
+++ b/storage/xtradb/include/row0row.h
@@ -0,0 +1,310 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the trx id field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INTERN
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ row_ext_t* ext, /*!< in: externally stored column prefixes,
+ or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory for
+ the index entry is allocated */
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the former only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead; the user
+ columns in this table should be
+ the same columns as in index->table */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ ulint type, /*!< in: ROW_COPY_DATA, or
+ ROW_COPY_POINTERS: the former
+ copies also the data fields to
+ heap as the latter only places
+ pointers to data fields on the
+ index page */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the dtuple is used! */
+ const dict_index_t* index, /*!< in: index */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap); /*!< in: memory heap from which
+ the memory needed is allocated */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: record in the index; must be
+ preserved while ref is used, as we do
+ not copy field values to heap */
+ const ulint* offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr); /*!< in/out: mtr */
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Searches an index record.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr); /*!< in: mtr */
+
+
+#define ROW_COPY_DATA 1
+#define ROW_COPY_POINTERS 2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record.
+
+No new latches may be obtained while the kernel mutex is reserved.
+However, the kernel mutex can be reserved while latches are owned. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+#ifndef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#endif
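The row_raw_format() contract described just above (write at most buf_size bytes, always NUL-terminate when buf_size is positive, return the number of bytes written including the terminating NUL) is easiest to see in a small standalone sketch. The helper below is hypothetical and only demonstrates the stated buffer semantics, not the real formatting of InnoDB on-disk data:

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical helper: copy raw bytes into buf, truncating as needed,
    always NUL-terminating, and returning bytes written incl. the NUL. */
    static unsigned
    raw_format_sketch(const char* data, unsigned data_len,
                      char* buf, unsigned buf_size)
    {
        unsigned n;

        if (buf_size == 0) {
            return(0);   /* nothing written, not even a NUL */
        }

        n = data_len < buf_size - 1 ? data_len : buf_size - 1;
        memcpy(buf, data, n);
        buf[n] = '\0';

        return(n + 1);   /* bytes written, including the NUL */
    }

    int main(void)
    {
        char buf[8];
        unsigned written = raw_format_sketch("abcdefghij", 10,
                                             buf, sizeof(buf));

        /* Prints: wrote 8 bytes: "abcdefg" */
        printf("wrote %u bytes: \"%s\"\n", written, buf);
        return 0;
    }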
diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic
new file mode 100644
index 00000000000..05c007641af
--- /dev/null
+++ b/storage/xtradb/include/row0row.ic
@@ -0,0 +1,120 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.ic
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+
+ return(trx_read_trx_id(rec + offset));
+}
+
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint offset;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+
+ return(trx_read_roll_ptr(rec + offset + DATA_TRX_ID_LEN));
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+ dtuple_t* ref, /*!< in/out: typed data tuple where the
+ reference is built */
+ const ulint* map, /*!< in: array of field numbers in rec
+ telling how ref should be built from
+ the fields of rec */
+ const rec_t* rec, /*!< in: record in the index; must be
+ preserved while ref is used, as we do
+ not copy field values to heap */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint field_no;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dtuple_get_n_fields(ref);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ field_no = *(map + i);
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ field = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+ dfield_set_data(dfield, field, len);
+ }
+ }
+}
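
A hypothetical sketch of how the two inline readers above are used together; rec, clust_index and offsets are assumed to describe a latched clustered index record, with offsets coming from rec_get_offsets():

	/* Hypothetical sketch: read the system columns of a clustered
	index record whose offsets have already been computed. */
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;

	trx_id = row_get_rec_trx_id(rec, clust_index, offsets);
	roll_ptr = row_get_rec_roll_ptr(rec, clust_index, offsets);

	/* trx_id tells which transaction last modified the record;
	roll_ptr locates the undo log record for its previous version. */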
diff --git a/storage/xtradb/include/row0sel.h b/storage/xtradb/include/row0sel.h
new file mode 100644
index 00000000000..8544b9d08ba
--- /dev/null
+++ b/storage/xtradb/include/row0sel.h
@@ -0,0 +1,402 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0sel_h
+#define row0sel_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "read0read.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed;
+it does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node); /*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf); /*!< in, own: prefetch buffer */
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i); /*!< in: get ith plan node */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr); /*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr); /*!< in: query thread */
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg); /*!< in: not used */
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. Currently, however, we do not allow search keys where
+the last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len, /*!< in: MySQL key value length */
+ trx_t* trx); /*!< in: transaction */
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position, and fetch next or fetch prev must not be attempted on the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+ulint
+row_search_for_mysql(
+/*=================*/
+ byte* buf, /*!< in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction); /*!< in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+/*******************************************************************//**
+Checks whether MySQL is currently allowed to retrieve a consistent read
+result for this table, or to store one in the query cache.
+@return TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ trx_t* trx, /*!< in: transaction object */
+ const char* norm_name); /*!< in: concatenation of database name,
+ '/' char, table name */
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+row_search_max_autoinc(
+/*===================*/
+ dict_index_t* index, /*!< in: index to search */
+ const char* col_name, /*!< in: autoinc column name */
+ ib_uint64_t* value); /*!< out: AUTOINC value read */
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_struct{
+ byte* data; /*!< data, or NULL; if not NULL, this field
+ has allocated memory which must be explicitly
+ freed; can be != NULL even when len is
+ UNIV_SQL_NULL */
+ ulint len; /*!< data length or UNIV_SQL_NULL */
+ ulint val_buf_size;
+ /*!< size of memory buffer allocated for data:
+ this can be more than len; this is defined
+ when data != NULL */
+};
+
+/** Query plan */
+struct plan_struct{
+ dict_table_t* table; /*!< table struct in the dictionary
+ cache */
+ dict_index_t* index; /*!< table index used in the search */
+ btr_pcur_t pcur; /*!< persistent cursor used to search
+ the index */
+ ibool asc; /*!< TRUE if cursor traveling upwards */
+ ibool pcur_is_open; /*!< TRUE if pcur has been positioned
+ and we can try to fetch new rows */
+ ibool cursor_at_end; /*!< TRUE if the cursor is open but
+ we know that there are no more
+ qualifying rows left to retrieve from
+ the index tree; NOTE though, that
+ there may still be unprocessed rows in
+ the prefetch stack; always FALSE when
+ pcur_is_open is FALSE */
+ ibool stored_cursor_rec_processed;
+ /*!< TRUE if the pcur position has been
+ stored and the record it is positioned
+ on has already been processed */
+ que_node_t** tuple_exps; /*!< array of expressions
+ which are used to calculate
+ the field values in the search
+ tuple: there is one expression
+ for each field in the search
+ tuple */
+ dtuple_t* tuple; /*!< search tuple */
+ ulint mode; /*!< search mode: PAGE_CUR_G, ... */
+ ulint n_exact_match; /*!< number of first fields in
+ the search tuple which must be
+ exactly matched */
+ ibool unique_search; /*!< TRUE if we are searching an
+ index record with a unique key */
+ ulint n_rows_fetched; /*!< number of rows fetched using pcur
+ after it was opened */
+ ulint n_rows_prefetched;/*!< number of prefetched rows cached
+ for fetch: fetching several rows in
+ the same mtr saves CPU time */
+ ulint first_prefetched;/*!< index of the first cached row in
+ select buffer arrays for each column */
+ ibool no_prefetch; /*!< no prefetch for this table */
+ sym_node_list_t columns; /*!< symbol table nodes for the columns
+ to retrieve from the table */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ end_conds; /*!< conditions which determine the
+ fetch limit of the index segment we
+ have to look at: when one of these
+ fails, the result set has been
+ exhausted for the cursor in this
+ index; these conditions are normalized
+ so that in a comparison the column
+ for this table is the first argument */
+ UT_LIST_BASE_NODE_T(func_node_t)
+ other_conds; /*!< the rest of search conditions we can
+ test at this table in a join */
+ ibool must_get_clust; /*!< TRUE if index is a non-clustered
+ index and we must also fetch the
+ clustered index record; this is the
+ case if the non-clustered record does
+ not contain all the needed columns, or
+ if this is a single-table explicit
+ cursor, or a searched update or
+ delete */
+ ulint* clust_map; /*!< map telling how clust_ref is built
+ from the fields of a non-clustered
+ record */
+ dtuple_t* clust_ref; /*!< the reference to the clustered
+ index entry is built here if index is
+ a non-clustered index */
+ btr_pcur_t clust_pcur; /*!< if index is non-clustered, we use
+ this pcur to search the clustered
+ index */
+ mem_heap_t* old_vers_heap; /*!< memory heap used in building an old
+ version of a row, or NULL */
+};
+
+/** Select node states */
+enum sel_node_state {
+ SEL_NODE_CLOSED, /*!< it is a declared cursor which is not
+ currently open */
+ SEL_NODE_OPEN, /*!< intention locks not yet set on tables */
+ SEL_NODE_FETCH, /*!< intention locks have been set */
+ SEL_NODE_NO_MORE_ROWS /*!< cursor has reached the result set end */
+};
+
+/** Select statement node */
+struct sel_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_SELECT */
+ enum sel_node_state
+ state; /*!< node state */
+ que_node_t* select_list; /*!< select list */
+ sym_node_t* into_list; /*!< variables list or NULL */
+ sym_node_t* table_list; /*!< table list */
+ ibool asc; /*!< TRUE if the rows should be fetched
+ in an ascending order */
+ ibool set_x_locks; /*!< TRUE if the cursor is for update or
+ delete, which means that a row x-lock
+ should be placed on the cursor row */
+ ulint row_lock_mode; /*!< LOCK_X or LOCK_S */
+ ulint n_tables; /*!< number of tables */
+ ulint fetch_table; /*!< number of the next table to access
+ in the join */
+ plan_t* plans; /*!< array of n_tables many plan nodes
+ containing the search plan and the
+ search data structures */
+ que_node_t* search_cond; /*!< search condition */
+ read_view_t* read_view; /*!< if the query is a non-locking
+ consistent read, its read view is
+ placed here, otherwise NULL */
+ ibool consistent_read;/*!< TRUE if the select is a consistent,
+ non-locking read */
+ order_node_t* order_by; /*!< order by column definition, or
+ NULL */
+ ibool is_aggregate; /*!< TRUE if the select list consists of
+ aggregate functions */
+ ibool aggregate_already_fetched;
+ /*!< TRUE if the aggregate row has
+ already been fetched for the current
+ cursor */
+ ibool can_get_updated;/*!< this is TRUE if the select
+ is in a single-table explicit
+ cursor which can get updated
+ within the stored procedure,
+ or in a searched update or
+ delete; NOTE that to determine
+ whether an explicit cursor
+ can get updated, the parser
+ checks whether the stored
+ procedure contains positioned
+ update or delete statements */
+ sym_node_t* explicit_cursor;/*!< not NULL if an explicit cursor */
+ UT_LIST_BASE_NODE_T(sym_node_t)
+ copy_variables; /*!< variables whose values we have to
+ copy when an explicit cursor is opened,
+ so that they do not change between
+ fetches */
+};
+
+/** Fetch statement node */
+struct fetch_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_FETCH */
+ sel_node_t* cursor_def; /*!< cursor definition */
+ sym_node_t* into_list; /*!< variables to set */
+
+ pars_user_func_t*
+ func; /*!< User callback function or NULL.
+ The first argument to the function
+ is a sel_node_t*, containing the
+ results of the SELECT operation for
+ one row. If the function returns
+ NULL, it is not interested in
+ further rows and the cursor is
+ modified so (cursor % NOTFOUND) is
+ true. If it returns not-NULL,
+ continue normally. See
+ row_fetch_print() for an example
+ (and a useful debugging tool). */
+};
+
+/** Open or close cursor operation type */
+enum open_node_op {
+ ROW_SEL_OPEN_CURSOR, /*!< open cursor */
+ ROW_SEL_CLOSE_CURSOR /*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_OPEN */
+ enum open_node_op
+ op_type; /*!< operation type: open or
+ close cursor */
+ sel_node_t* cursor_def; /*!< cursor definition */
+};
+
+/** Row printf statement node */
+struct row_printf_node_struct{
+ que_common_t common; /*!< type: QUE_NODE_ROW_PRINTF */
+ sel_node_t* sel_node; /*!< select */
+};
+
+/** Search direction for the MySQL interface */
+enum row_sel_direction {
+ ROW_SEL_NEXT = 1, /*!< ascending direction */
+ ROW_SEL_PREV = 2 /*!< descending direction */
+};
+
+/** Match mode for the MySQL interface */
+enum row_sel_match_mode {
+ ROW_SEL_EXACT = 1, /*!< search using a complete key value */
+ ROW_SEL_EXACT_PREFIX /*!< search using a key prefix which
+ must match rows: the prefix may
+ contain an incomplete field (the last
+ field in prefix may be just a prefix
+ of a fixed length column) */
+};
+
+#ifndef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#endif
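
Among the functions declared above, row_search_max_autoinc() has the simplest calling convention; a hedged usage sketch, where index and the column name "id" are placeholders:

	/* Hypothetical sketch: read the current maximum AUTOINC value
	from an index on the autoinc column. */
	ib_uint64_t	max_value = 0;
	ulint		err;

	err = row_search_max_autoinc(index, "id", &max_value);

	if (err != DB_SUCCESS) {
		/* DB_SUCCESS is expected when the column is found in the
		index; otherwise report the error code. */
		fprintf(stderr, "InnoDB: autoinc read failed: %lu\n",
			(ulong) err);
	}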
diff --git a/storage/xtradb/include/row0sel.ic b/storage/xtradb/include/row0sel.ic
new file mode 100644
index 00000000000..5907f9913da
--- /dev/null
+++ b/storage/xtradb/include/row0sel.ic
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.ic
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+ sel_node_t* node, /*!< in: select node */
+ ulint i) /*!< in: get ith plan node */
+{
+ ut_ad(i < node->n_tables);
+
+ return(node->plans + i);
+}
+
+/*********************************************************************//**
+Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means
+that it will start fetching from the start of the result set again, regardless
+of where it was before, and it will set intention locks on the tables. */
+UNIV_INLINE
+void
+sel_node_reset_cursor(
+/*==================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ node->state = SEL_NODE_OPEN;
+}
+
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ open_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = (open_node_t*) thr->run_node;
+ ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+
+ sel_node = node->cursor_def;
+
+ err = DB_SUCCESS;
+
+ if (node->op_type == ROW_SEL_OPEN_CURSOR) {
+
+ /* if (sel_node->state == SEL_NODE_CLOSED) { */
+
+ sel_node_reset_cursor(sel_node);
+ /* } else {
+ err = DB_ERROR;
+ } */
+ } else {
+ if (sel_node->state != SEL_NODE_CLOSED) {
+
+ sel_node->state = SEL_NODE_CLOSED;
+ } else {
+ err = DB_ERROR;
+ }
+ }
+
+ if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) {
+ /* SQL error detected */
+ fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+ ut_error;
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
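
A hypothetical sketch of iterating the per-table plans of a select node with sel_node_get_nth_plan(); node is assumed to be a valid sel_node_t* taken from a query graph:

	/* Hypothetical sketch: inspect how each table in the join is
	accessed. */
	ulint	i;

	for (i = 0; i < node->n_tables; i++) {
		plan_t*	plan = sel_node_get_nth_plan(node, i);

		ut_ad(plan->table);
		/* plan->index and plan->mode describe the index and the
		search mode used for the i-th table in the join. */
	}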
diff --git a/storage/xtradb/include/row0types.h b/storage/xtradb/include/row0types.h
new file mode 100644
index 00000000000..7920fd75061
--- /dev/null
+++ b/storage/xtradb/include/row0types.h
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0types_h
+#define row0types_h
+
+typedef struct plan_struct plan_t;
+
+typedef struct upd_struct upd_t;
+
+typedef struct upd_field_struct upd_field_t;
+
+typedef struct upd_node_struct upd_node_t;
+
+typedef struct del_node_struct del_node_t;
+
+typedef struct ins_node_struct ins_node_t;
+
+typedef struct sel_node_struct sel_node_t;
+
+typedef struct open_node_struct open_node_t;
+
+typedef struct fetch_node_struct fetch_node_t;
+
+typedef struct row_printf_node_struct row_printf_node_t;
+typedef struct sel_buf_struct sel_buf_t;
+
+typedef struct undo_node_struct undo_node_t;
+
+typedef struct purge_node_struct purge_node_t;
+
+typedef struct row_ext_struct row_ext_t;
+
+/* MySQL data types */
+typedef struct st_table TABLE;
+
+#endif
diff --git a/storage/xtradb/include/row0uins.h b/storage/xtradb/include/row0uins.h
new file mode 100644
index 00000000000..77b071c3a6b
--- /dev/null
+++ b/storage/xtradb/include/row0uins.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, not even a
+delete-marked one, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_undo_ins(
+/*=========*/
+ undo_node_t* node); /*!< in: row undo node */
+
+#ifndef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0uins.ic b/storage/xtradb/include/row0uins.ic
new file mode 100644
index 00000000000..27606150d8e
--- /dev/null
+++ b/storage/xtradb/include/row0uins.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.ic
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/row0umod.h b/storage/xtradb/include/row0umod.h
new file mode 100644
index 00000000000..ed44cc8d601
--- /dev/null
+++ b/storage/xtradb/include/row0umod.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr); /*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0umod.ic b/storage/xtradb/include/row0umod.ic
new file mode 100644
index 00000000000..ea3fd3b43c7
--- /dev/null
+++ b/storage/xtradb/include/row0umod.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.ic
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0undo.h b/storage/xtradb/include/row0undo.h
new file mode 100644
index 00000000000..6eb4ca448b3
--- /dev/null
+++ b/storage/xtradb/include/row0undo.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node for a query graph.
+@return own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap); /*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row in node,
+stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node); /*!< in: row undo node */
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution that is being rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+ If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next one. This situation can
+occur only in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Execution state of an undo node */
+enum undo_exec {
+ UNDO_NODE_FETCH_NEXT = 1, /*!< we should fetch the next
+ undo log record */
+ UNDO_NODE_PREV_VERS, /*!< the roll ptr to previous
+ version of a row is stored in
+ node, and undo should be done
+ based on it */
+ UNDO_NODE_INSERT, /*!< undo a fresh insert of a
+ row to a table */
+ UNDO_NODE_MODIFY /*!< undo a modify operation
+ (DELETE or UPDATE) on a row
+ of a table */
+};
+
+/** Undo node structure */
+struct undo_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_UNDO */
+ enum undo_exec state; /*!< node execution state */
+ trx_t* trx; /*!< trx for which undo is done */
+ roll_ptr_t roll_ptr;/*!< roll pointer to undo log record */
+ trx_undo_rec_t* undo_rec;/*!< undo log record */
+ undo_no_t undo_no;/*!< undo number of the record */
+ ulint rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+ ... */
+ roll_ptr_t new_roll_ptr;
+ /*!< roll ptr to restore to clustered index
+ record */
+ trx_id_t new_trx_id; /*!< trx id to restore to clustered index
+ record */
+ btr_pcur_t pcur; /*!< persistent cursor used in searching the
+ clustered index record */
+ dict_table_t* table; /*!< table where undo is done */
+ ulint cmpl_info;/*!< compiler analysis of an update */
+ upd_t* update; /*!< update vector for a clustered index
+ record */
+ dtuple_t* ref; /*!< row reference to the next row to handle */
+ dtuple_t* row; /*!< a copy (also fields copied to heap) of the
+ row to handle */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns of the row */
+ dtuple_t* undo_row;/*!< NULL, or the row after undo */
+ row_ext_t* undo_ext;/*!< NULL, or prefixes of the externally
+ stored columns of undo_row */
+ dict_index_t* index; /*!< the next index whose record should be
+ handled */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage for
+ row; this must be emptied after undo is tried
+ on a row */
+};
+
+
+#ifndef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#endif
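
A simplified, hypothetical sketch of how the undo_exec states above can be dispatched; err, node and thr are assumed to be in scope, and the real logic in row0undo.c also handles record fetching and version chasing:

	/* Hypothetical dispatch sketch over enum undo_exec. */
	switch (node->state) {
	case UNDO_NODE_INSERT:
		/* row_undo_ins() is declared in row0uins.h */
		err = row_undo_ins(node);
		break;
	case UNDO_NODE_MODIFY:
		/* row_undo_mod() is declared in row0umod.h */
		err = row_undo_mod(node, thr);
		break;
	default:
		/* UNDO_NODE_FETCH_NEXT / UNDO_NODE_PREV_VERS: a further
		undo log record or row version must be fetched first. */
		break;
	}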
diff --git a/storage/xtradb/include/row0undo.ic b/storage/xtradb/include/row0undo.ic
new file mode 100644
index 00000000000..dc788debc14
--- /dev/null
+++ b/storage/xtradb/include/row0undo.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.ic
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0upd.h b/storage/xtradb/include/row0upd.h
new file mode 100644
index 00000000000..635d746d5a1
--- /dev/null
+++ b/storage/xtradb/include/row0upd.h
@@ -0,0 +1,483 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "btr0pcur.h"
+# include "que0types.h"
+# include "pars0types.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap); /*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector, i.e. the number of
+columns to be updated by it.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update); /*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n); /*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ ulint field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx); /*!< in: transaction */
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ ulint no) /*!< in: field_no */
+ __attribute__((nonnull, pure));
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ dict_index_t* index, /*!< in: clustered index */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */
+ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ const dtuple_t* entry, /*!< in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val); /*!< in: value to write */
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+ const upd_t* update, /*!< in: update vector */
+ byte* log_ptr,/*!< in: pointer to mlog buffer: must
+ contain at least MLOG_BUF_MARGIN bytes
+ of free space; the buffer is closed
+ within this function */
+ mtr_t* mtr); /*!< in: mtr into whose log to write */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update);/*!< in: update vector */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Replaces the column values in the given record with the new values stored
+in the update vector. No field size changes are allowed. */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /*!< in/out: record where replaced */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ page_zip_des_t* page_zip);/*!< in: compressed page with enough space
+ available, or NULL */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: secondary index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap); /*!< in: memory heap from which allocated */
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: clustered index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap); /*!< in: memory heap from which allocated */
+/***********************************************************//**
+Replaces the column values in the given index entry with the new values
+stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ ibool order_only,
+ /*!< in: if TRUE, limit the replacement to
+ ordering fields of index; note that this
+ does not work for non-clustered indexes. */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+ __attribute__((nonnull));
+/***********************************************************//**
+Replaces the column values in the given index entry with the new values
+stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+ __attribute__((nonnull));
+/***********************************************************//**
+Replaces the column values in the row with the new values stored in the
+update vector. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap); /*!< in: memory heap */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update);/*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr); /*!< in: query thread */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses the log data of system field values.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint* pos, /*!< out: TRX_ID position in record */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr);/*!< out: roll ptr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint pos, /*!< in: TRX_ID position in rec */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ mem_heap_t* heap, /*!< in: memory heap where update vector is
+ built */
+ upd_t** update_out);/*!< out: update vector */
+
+
+/* Update vector field */
+struct upd_field_struct{
+ unsigned field_no:16; /*!< field number in an index, usually
+ the clustered index, but in updating
+ a secondary index record in btr0cur.c
+ this is the position in the secondary
+ index */
+#ifndef UNIV_HOTBACKUP
+ unsigned orig_len:16; /*!< original length of the locally
+ stored part of an externally stored
+ column, or 0 */
+ que_node_t* exp; /*!< expression for calculating a new
+ value: it refers to column values and
+ constants in the symbol table of the
+ query graph */
+#endif /* !UNIV_HOTBACKUP */
+ dfield_t new_val; /*!< new value for the column */
+};
+
+/* Update vector structure */
+struct upd_struct{
+ ulint info_bits; /*!< new value of info bits to record;
+ default is 0 */
+ ulint n_fields; /*!< number of update fields */
+ upd_field_t* fields; /*!< array of update fields */
+};
+
+#ifndef UNIV_HOTBACKUP
+/* Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_UPDATE */
+ ibool is_delete;/* TRUE if delete, FALSE if update */
+ ibool searched_update;
+ /* TRUE if searched update, FALSE if
+ positioned */
+ ibool in_mysql_interface;
+ /* TRUE if the update node was created
+ for the MySQL interface */
+ dict_foreign_t* foreign;/* NULL or pointer to a foreign key
+ constraint if this update node is used in
+ doing an ON DELETE or ON UPDATE operation */
+ upd_node_t* cascade_node;/* NULL or an update node template which
+ is used to implement ON DELETE/UPDATE CASCADE
+ or ... SET NULL for foreign keys */
+ mem_heap_t* cascade_heap;/* NULL or a mem heap where the cascade
+ node is created */
+ sel_node_t* select; /*!< query graph subtree implementing a base
+ table cursor: the rows returned will be
+ updated */
+ btr_pcur_t* pcur; /*!< persistent cursor placed on the clustered
+ index record which should be updated or
+ deleted; the cursor is stored in the graph
+ of 'select' field above, except in the case
+ of the MySQL interface */
+ dict_table_t* table; /*!< table where updated */
+ upd_t* update; /*!< update vector for the row */
+ ulint update_n_fields;
+ /* when this struct is used to implement
+ a cascade operation for foreign keys, we store
+ here the size of the buffer allocated for use
+ as the update vector */
+ sym_node_list_t columns;/* symbol table nodes for the columns
+ to retrieve from the table */
+ ibool has_clust_rec_x_lock;
+ /* TRUE if the select which retrieves the
+ records to update already sets an x-lock on
+ the clustered record; note that it must always
+ set at least an s-lock */
+ ulint cmpl_info;/* information extracted during query
+ compilation; speeds up execution:
+ UPD_NODE_NO_ORD_CHANGE and
+ UPD_NODE_NO_SIZE_CHANGE, ORed */
+ /*----------------------*/
+ /* Local storage for this graph node */
+ ulint state; /*!< node execution state */
+ dict_index_t* index; /*!< NULL, or the next index whose record should
+ be updated */
+ dtuple_t* row; /*!< NULL, or a copy (also fields copied to
+ heap) of the row to update; this must be reset
+ to NULL after a successful update */
+ row_ext_t* ext; /*!< NULL, or prefixes of the externally
+ stored columns in the old row */
+ dtuple_t* upd_row;/* NULL, or a copy of the updated row */
+ row_ext_t* upd_ext;/* NULL, or prefixes of the externally
+ stored columns in upd_row */
+ mem_heap_t* heap; /*!< memory heap used as auxiliary storage;
+ this must be emptied after a successful
+ update */
+ /*----------------------*/
+ sym_node_t* table_sym;/* table node in symbol table */
+ que_node_t* col_assign_list;
+ /* column assignment list */
+ ulint magic_n;
+};
+
+#define UPD_NODE_MAGIC_N 1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK 1 /* execution came to the node from
+ a node above and if the field
+ has_clust_rec_x_lock is FALSE, we
+ should set an intention x-lock on
+ the table */
+#define UPD_NODE_UPDATE_CLUSTERED 2 /* clustered index record should be
+ updated */
+#define UPD_NODE_INSERT_CLUSTERED 3 /* clustered index record should be
+ inserted, old record is already delete
+ marked */
+#define UPD_NODE_UPDATE_ALL_SEC 4 /* an ordering field of the clustered
+ index record was changed, or this is
+ a delete operation: should update
+ all the secondary index records */
+#define UPD_NODE_UPDATE_SOME_SEC 5 /* secondary index entries should be
+ looked at and updated if an ordering
+ field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE 1 /* no secondary index record will be
+ changed in the update and no ordering
+ field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE 2 /* no record field size will be
+ changed in the update */
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#endif
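
A hedged sketch of building a one-field update vector with the API declared above; heap management (mem_heap_create()/mem_heap_free()), field_no, index, trx, new_data and new_len are assumptions taken from the surrounding InnoDB environment rather than from this header:

	/* Hypothetical sketch: allocate an update vector and set the new
	value of one clustered index field. */
	mem_heap_t*	heap = mem_heap_create(256);
	upd_t*		update = upd_create(1, heap);
	upd_field_t*	ufield = upd_get_nth_field(update, 0);

	/* Bind the update field to a clustered index field number and
	store its new value. */
	upd_field_set_field_no(ufield, field_no, index, trx);
	dfield_set_data(&ufield->new_val, new_data, new_len);

	/* ... the vector can now be applied, e.g. with
	row_upd_index_replace_new_col_vals() ... */

	mem_heap_free(heap);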
diff --git a/storage/xtradb/include/row0upd.ic b/storage/xtradb/include/row0upd.ic
new file mode 100644
index 00000000000..18e22f1eca9
--- /dev/null
+++ b/storage/xtradb/include/row0upd.ic
@@ -0,0 +1,184 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.ic
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "trx0undo.h"
+# include "row0row.h"
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "page0zip.h"
+
+/*********************************************************************//**
+Creates an update vector object.
+@return own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+ ulint n, /*!< in: number of fields */
+ mem_heap_t* heap) /*!< in: heap from which memory allocated */
+{
+ upd_t* update;
+
+ update = (upd_t*) mem_heap_alloc(heap, sizeof(upd_t));
+
+ update->info_bits = 0;
+ update->n_fields = n;
+ update->fields = (upd_field_t*)
+ mem_heap_alloc(heap, sizeof(upd_field_t) * n);
+
+ return(update);
+}
+
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+ const upd_t* update) /*!< in: update vector */
+{
+ ut_ad(update);
+
+ return(update->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+ const upd_t* update, /*!< in: update vector */
+ ulint n) /*!< in: field position in update vector */
+{
+ ut_ad(update);
+ ut_ad(n < update->n_fields);
+
+ return((upd_field_t*) update->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+ upd_field_t* upd_field, /*!< in: update vector field */
+ ulint field_no, /*!< in: field number in a clustered
+ index */
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ upd_field->field_no = field_no;
+ upd_field->orig_len = 0;
+
+ if (UNIV_UNLIKELY(field_no >= dict_index_get_n_fields(index))) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to access field %lu in ",
+ (ulong) field_no);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: but index only has %lu fields\n",
+ (ulong) dict_index_get_n_fields(index));
+ }
+
+ dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+ dfield_get_type(&upd_field->new_val));
+}
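+
+/* Usage sketch (illustrative only): building a one-field update vector
+with the functions above. The heap is assumed to come from
+mem_heap_create() in mem0mem.h, and "index", "trx", "field_no", "buf"
+and "len" are assumed to be supplied by the caller.
+
+	mem_heap_t*	heap	= mem_heap_create(256);
+	upd_t*		update	= upd_create(1, heap);
+	upd_field_t*	ufield	= upd_get_nth_field(update, 0);
+
+	upd_field_set_field_no(ufield, field_no, index, trx);
+	dfield_set_data(&ufield->new_val, buf, len);
+
+	(... pass "update" to the row update code, then free the heap ...)
+
+	mem_heap_free(heap);
+*/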
+
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+ const upd_t* update, /*!< in: update vector */
+ ulint no) /*!< in: field_no */
+{
+ ulint i;
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+ const upd_field_t* uf = upd_get_nth_field(update, i);
+
+ if (uf->field_no == no) {
+
+ return(uf);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be updated, or NULL */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+#ifdef UNIV_SYNC_DEBUG
+ if (!rw_lock_own(&btr_search_latch, RW_LOCK_EX)) {
+ ut_ad(!buf_block_align(rec)->is_hashed);
+ }
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ ulint pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+ page_zip_write_trx_id_and_roll_ptr(page_zip, rec, offsets,
+ pos, trx->id, roll_ptr);
+ } else {
+ ulint offset = index->trx_id_offset;
+
+ if (!offset) {
+ offset = row_get_trx_id_offset(rec, index, offsets);
+ }
+
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ trx_write_trx_id(rec + offset, trx->id);
+ trx_write_roll_ptr(rec + offset + DATA_TRX_ID_LEN, roll_ptr);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/row0vers.h b/storage/xtradb/include/row0vers.h
new file mode 100644
index 00000000000..5a2e38230d5
--- /dev/null
+++ b/storage/xtradb/include/row0vers.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "read0types.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function!
+@return NULL if committed, else the active transaction */
+UNIV_INTERN
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr); /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@return TRUE if earlier version should have the secondary index entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry);/*!< in: the secondary index entry */
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers);/*!< out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
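+
+/* Usage sketch (illustrative only): a consistent read that must not see
+the current version of "rec" rebuilds the version visible to its read
+view. "mtr", "view", "offsets", "offset_heap" and "vers_heap" are assumed
+to be set up by the caller as described in the parameter comments above.
+If DB_SUCCESS is returned and old_vers is NULL, the record is invisible
+to the view, i.e. it was inserted after the view was created.
+
+	rec_t*	old_vers;
+	ulint	err;
+
+	err = row_vers_build_for_consistent_read(
+		rec, mtr, index, &offsets, view,
+		&offset_heap, vers_heap, &old_vers);
+*/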
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+				of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers);/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0vers.ic b/storage/xtradb/include/row0vers.ic
new file mode 100644
index 00000000000..8bb3a5c0cb3
--- /dev/null
+++ b/storage/xtradb/include/row0vers.ic
@@ -0,0 +1,30 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.ic
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "dict0dict.h"
+#include "read0read.h"
+#include "page0page.h"
+#include "log0recv.h"
diff --git a/storage/xtradb/include/srv0que.h b/storage/xtradb/include/srv0que.h
new file mode 100644
index 00000000000..82ee7739ef7
--- /dev/null
+++ b/storage/xtradb/include/srv0que.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0que.h
+Server query execution
+
+Created 6/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0que_h
+#define srv0que_h
+
+#include "univ.i"
+#include "que0types.h"
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread */
+
+#endif
+
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
new file mode 100644
index 00000000000..8c64d5cee71
--- /dev/null
+++ b/storage/xtradb/include/srv0srv.h
@@ -0,0 +1,733 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0srv_h
+#define srv0srv_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "os0sync.h"
+#include "que0types.h"
+#include "trx0types.h"
+
+extern const char* srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char srv_mysql50_table_name_prefix[9];
+
+/* When this event is set the lock timeout and InnoDB monitor
+thread starts running */
+extern os_event_t srv_lock_timeout_thread_event;
+
+/* This event is set to tell the purge thread to shut down */
+extern os_event_t srv_purge_thread_event;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT \
+ (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
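+/* For example, srv_auto_extend_increment = 8 and a 16 kB UNIV_PAGE_SIZE
+give 8 * 64 = 512 pages, i.e. the last data file is extended by 8 MB at
+a time. */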
+
+/* prototypes for new functions added to ha_innodb.cc */
+ibool innobase_get_slow_log();
+
+/* This is set to TRUE if the MySQL user has set it in MySQL */
+extern ibool srv_lower_case_table_names;
+
+/* Mutex for locking srv_monitor_file */
+extern mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE* srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+extern mutex_t srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+extern FILE* srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE* srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char* srv_data_home;
+#ifdef UNIV_LOG_ARCHIVE
+extern char* srv_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+#ifndef UNIV_HOTBACKUP
+extern my_bool srv_file_per_table;
+#else
+extern ibool srv_file_per_table;
+#endif /* UNIV_HOTBACKUP */
+/** The file format to use on new *.ibd files. */
+extern ulint srv_file_format;
+/** Whether to check file format during startup. A value of
+DICT_TF_FORMAT_MAX + 1 means no checking, i.e. FALSE. The default is to
+set it to the highest format we support. */
+extern ulint srv_check_file_format_at_startup;
+/** Place locks on records only, i.e. do not use next-key locking, except
+in duplicate key checking and foreign key checking */
+extern ibool srv_locks_unsafe_for_binlog;
+#endif /* !UNIV_HOTBACKUP */
+
+extern ulint srv_n_data_files;
+extern char** srv_data_file_names;
+extern ulint* srv_data_file_sizes;
+extern ulint* srv_data_file_is_raw_partition;
+
+extern char* srv_doublewrite_file;
+
+extern ibool srv_extra_undoslots;
+
+extern ibool srv_recovery_stats;
+
+extern ulint srv_use_purge_thread;
+
+extern ibool srv_auto_extend_last_data_file;
+extern ulint srv_last_file_size_max;
+extern char** srv_log_group_home_dirs;
+#ifndef UNIV_HOTBACKUP
+extern ulong srv_auto_extend_increment;
+
+extern ibool srv_created_new_raw;
+
+extern ulint srv_n_log_groups;
+extern ulint srv_n_log_files;
+extern ulint srv_log_file_size;
+extern ulint srv_log_buffer_size;
+extern ulong srv_flush_log_at_trx_commit;
+extern char srv_adaptive_flushing;
+
+
+extern ulong srv_show_locks_held;
+extern ulong srv_show_verbose_locks;
+
+/* The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+extern const byte* srv_latin1_ordering;
+#ifndef UNIV_HOTBACKUP
+extern my_bool srv_use_sys_malloc;
+#else
+extern ibool srv_use_sys_malloc;
+#endif /* UNIV_HOTBACKUP */
+extern ulint srv_buf_pool_size; /*!< requested size in bytes */
+extern ulint srv_buf_pool_old_size; /*!< previously requested size */
+extern ulint srv_buf_pool_curr_size; /*!< current size in bytes */
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+extern uint srv_buffer_pool_shm_key;
+extern ibool srv_buffer_pool_shm_is_reused;
+extern ibool srv_buffer_pool_shm_checksum;
+
+extern ibool srv_thread_concurrency_timer_based;
+
+extern ulint srv_n_file_io_threads;
+extern ulong srv_read_ahead_threshold;
+extern ulint srv_n_read_io_threads;
+extern ulint srv_n_write_io_threads;
+
+/* Number of IO operations per second the server can do */
+extern ulong srv_io_capacity;
+/* Returns the number of IO operations that is X percent of the
+capacity. PCT_IO(5) -> returns the number of IO operations that
+is 5% of the max where max is srv_io_capacity. */
+#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) p / 100.0)))
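+/* For example, with srv_io_capacity = 200, PCT_IO(10) evaluates to
+(ulong) (200 * 0.10) = 20 i/o operations. */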
+
+#ifdef UNIV_LOG_ARCHIVE
+extern ibool srv_log_archive_on;
+extern ibool srv_archive_recovery;
+extern dulint srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+extern char* srv_file_flush_method_str;
+extern ulint srv_unix_file_flush_method;
+extern ulint srv_win_file_flush_method;
+
+extern ulint srv_max_n_open_files;
+
+extern ulint srv_max_dirty_pages_pct;
+
+extern ulint srv_force_recovery;
+extern ulong srv_thread_concurrency;
+
+extern ulint srv_max_n_threads;
+
+extern lint srv_conc_n_threads;
+
+extern ulint	srv_fast_shutdown;	/* If this is 1, do not do a
+					purge and insert buffer merge.
+					If this is 2, do not even flush the
+ buffer pool to data files at the
+ shutdown: we effectively 'crash'
+ InnoDB (but lose no committed
+ transactions). */
+extern ibool srv_innodb_status;
+
+extern unsigned long long srv_stats_sample_pages;
+extern ulong srv_stats_method;
+#define SRV_STATS_METHOD_NULLS_EQUAL 0
+#define SRV_STATS_METHOD_NULLS_NOT_EQUAL 1
+#define SRV_STATS_METHOD_IGNORE_NULLS 2
+extern ulong srv_stats_auto_update;
+extern ulint srv_stats_update_need_lock;
+extern ibool srv_use_sys_stats_table;
+
+extern ibool srv_use_doublewrite_buf;
+extern ibool srv_use_checksums;
+extern ibool srv_fast_checksum;
+
+extern ibool srv_set_thread_priorities;
+extern int srv_query_thread_priority;
+
+extern ulong srv_max_buf_pool_modified_pct;
+extern ulong srv_max_purge_lag;
+
+extern ulong srv_replication_delay;
+
+extern long long srv_ibuf_max_size;
+extern ulong srv_ibuf_active_contract;
+extern ulong srv_ibuf_accel_rate;
+extern ulint srv_checkpoint_age_target;
+extern ulong srv_flush_neighbor_pages;
+extern ulong srv_enable_unsafe_group_commit;
+extern ulong srv_read_ahead;
+extern ulong srv_adaptive_checkpoint;
+
+extern ulong srv_expand_import;
+extern ulint srv_pass_corrupt_table;
+
+extern ulong srv_extra_rsegments;
+extern ulong srv_dict_size_limit;
+/*-------------------------------------------*/
+
+extern ulint srv_n_rows_inserted;
+extern ulint srv_n_rows_updated;
+extern ulint srv_n_rows_deleted;
+extern ulint srv_n_rows_read;
+
+extern ibool srv_print_innodb_monitor;
+extern ibool srv_print_innodb_lock_monitor;
+extern ibool srv_print_innodb_tablespace_monitor;
+extern ibool srv_print_verbose_log;
+extern ibool srv_print_innodb_table_monitor;
+
+extern ibool srv_lock_timeout_active;
+extern ibool srv_monitor_active;
+extern ibool srv_error_monitor_active;
+
+extern ulong srv_n_spin_wait_rounds;
+extern ulong srv_n_free_tickets_to_enter;
+extern ulong srv_thread_sleep_delay;
+extern ulong srv_spin_wait_delay;
+extern ibool srv_priority_boost;
+
+extern ulint srv_mem_pool_size;
+extern ulint srv_lock_table_size;
+
+#ifdef UNIV_DEBUG
+extern ibool srv_print_thread_releases;
+extern ibool srv_print_lock_waits;
+extern ibool srv_print_buf_io;
+extern ibool srv_print_log_io;
+extern ibool srv_print_latch_waits;
+#else /* UNIV_DEBUG */
+# define srv_print_thread_releases FALSE
+# define srv_print_lock_waits FALSE
+# define srv_print_buf_io FALSE
+# define srv_print_log_io FALSE
+# define srv_print_latch_waits FALSE
+#endif /* UNIV_DEBUG */
+
+extern ulint srv_activity_count;
+extern ulint srv_fatal_semaphore_wait_threshold;
+extern ulint srv_dml_needed_delay;
+
+extern mutex_t* kernel_mutex_temp;/* mutex protecting the server, trx structs,
+ query threads, and lock table: we allocate
+ it from dynamic memory to get it to the
+ same DRAM page as other hotspot semaphores */
+#define kernel_mutex (*kernel_mutex_temp)
+
+#define SRV_MAX_N_IO_THREADS 130
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+extern const char* srv_io_thread_op_info[];
+extern const char* srv_io_thread_function[];
+
+/* the number of the log write requests done */
+extern ulint srv_log_write_requests;
+
+/* the number of physical writes to the log performed */
+extern ulint srv_log_writes;
+
+/* amount of data written to the log files in bytes */
+extern ulint srv_os_log_written;
+
+/* amount of writes being done to the log files */
+extern ulint srv_os_log_pending_writes;
+
+/* we increase this counter when we do not have enough space in the
+log buffer and have to flush it */
+extern ulint srv_log_waits;
+
+/* variable that counts amount of data read in total (in bytes) */
+extern ulint srv_data_read;
+
+/* here we count the amount of data written in total (in bytes) */
+extern ulint srv_data_written;
+
+/* this variable counts the number of times the doublewrite buffer
+was flushed */
+extern ulint srv_dblwr_writes;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+extern ulint srv_dblwr_pages_written;
+
+/* in this variable we store the number of write requests issued */
+extern ulint srv_buf_pool_write_requests;
+
+/* here we store the number of times we had to wait for a free page in
+the buffer pool; this happens when the buffer pool is full and we must
+flush a page before we can read or create one */
+extern ulint srv_buf_pool_wait_free;
+
+/* variable to count the number of pages that were written from the
+buffer pool to disk */
+extern ulint srv_buf_pool_flushed;
+
+/** Number of buffer pool reads that led to the
+reading of a disk page */
+extern ulint srv_buf_pool_reads;
+
+/** Time in seconds between automatic buffer pool dumps */
+extern uint srv_auto_lru_dump;
+
+/** Status variables to be passed to MySQL */
+typedef struct export_var_struct export_struc;
+
+/** Status variables to be passed to MySQL */
+extern export_struc export_vars;
+
+/** The server system */
+typedef struct srv_sys_struct srv_sys_t;
+
+/** The server system */
+extern srv_sys_t* srv_sys;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Types of raw partitions in innodb_data_file_path */
+enum {
+ SRV_NOT_RAW = 0, /*!< Not a raw partition */
+ SRV_NEW_RAW, /*!< A 'newraw' partition, only to be
+ initialized */
+ SRV_OLD_RAW /*!< An initialized raw partition */
+};
+
+/** Alternatives for the file flush option in Unix; see the InnoDB manual
+about what these mean */
+enum {
+ SRV_UNIX_FSYNC = 1, /*!< fsync, the default */
+ SRV_UNIX_O_DSYNC, /*!< open log files in O_SYNC mode */
+ SRV_UNIX_LITTLESYNC, /*!< do not call os_file_flush()
+ when writing data files, but do flush
+ after writing to log files */
+ SRV_UNIX_NOSYNC, /*!< do not flush after writing */
+ SRV_UNIX_O_DIRECT, /*!< invoke os_file_set_nocache() on
+ data files */
+	SRV_UNIX_ALL_O_DIRECT	/*!< experimental: open log files, in
+				addition to data files, with O_DIRECT */
+};
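+/* These values are selected through the innodb_flush_method startup
+option (stored as a string in srv_file_flush_method_str above); for
+example, innodb_flush_method=O_DIRECT selects SRV_UNIX_O_DIRECT. The
+string-to-enum mapping itself is not shown in this header. */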
+
+/** Alternatives for file i/o in Windows */
+enum {
+ SRV_WIN_IO_NORMAL = 1, /*!< buffered I/O */
+ SRV_WIN_IO_UNBUFFERED /*!< unbuffered I/O; this is the default */
+};
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that he can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+ SRV_FORCE_IGNORE_CORRUPT = 1, /*!< let the server run even if it
+ detects a corrupt page */
+ SRV_FORCE_NO_BACKGROUND = 2, /*!< prevent the main thread from
+ running: if a crash would occur
+ in purge, this prevents it */
+ SRV_FORCE_NO_TRX_UNDO = 3, /*!< do not run trx rollback after
+ recovery */
+ SRV_FORCE_NO_IBUF_MERGE = 4, /*!< prevent also ibuf operations:
+ if they would cause a crash, better
+ not do them */
+ SRV_FORCE_NO_UNDO_LOG_SCAN = 5, /*!< do not look at undo logs when
+ starting the database: InnoDB will
+ treat even incomplete transactions
+ as committed */
+ SRV_FORCE_NO_LOG_REDO = 6 /*!< do not do the log roll-forward
+ in connection with recovery */
+};
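+/* For example, starting the server with innodb_force_recovery=4
+corresponds to SRV_FORCE_NO_IBUF_MERGE and also includes the
+precautions of levels 1-3 above. */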
+
+#ifndef UNIV_HOTBACKUP
+/** Types of threads existing in the system. */
+enum srv_thread_type {
+ SRV_COM = 1, /**< threads serving communication and queries */
+ SRV_CONSOLE, /**< thread serving console */
+ SRV_WORKER, /**< threads serving parallelized queries and
+ queries released from lock wait */
+#if 0
+ /* Utility threads */
+ SRV_BUFFER, /**< thread flushing dirty buffer blocks */
+ SRV_RECOVERY, /**< threads finishing a recovery */
+ SRV_INSERT, /**< thread flushing the insert buffer to disk */
+#endif
+	SRV_PURGE,		/**< thread purging undo records */
+	SRV_PURGE_WORKER,	/**< worker thread purging undo records */
+ SRV_MASTER /**< the master thread, (whose type number must
+ be biggest) */
+};
+
+/*********************************************************************//**
+Boots Innobase server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void);
+/*==================*/
+/*********************************************************************//**
+Gets the number of threads in the system.
+@return sum of srv_n_threads[] */
+UNIV_INTERN
+ulint
+srv_get_n_threads(void);
+/*===================*/
+/*********************************************************************//**
+Returns the calling thread type.
+@return SRV_COM, ... */
+
+enum srv_thread_type
+srv_get_thread_type(void);
+/*=====================*/
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+ ulint i, /*!< in: the 'segment' of the i/o thread */
+ const char* str); /*!< in: constant char string describing the
+ state */
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+ enum srv_thread_type type, /*!< in: thread type */
+ ulint n); /*!< in: number of threads to release */
+/*********************************************************************//**
+The master thread controlling the server.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*************************************************************************
+The undo purge thread. */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+ void* arg); /* in: a dummy parameter required by
+ os_thread_create */
+/*************************************************************************
+The undo purge worker thread. */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_worker_thread(
+/*====================*/
+ void* arg);
+/*******************************************************************//**
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void);
+/*========================*/
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/*********************************************************************//**
+This must be called when a thread exits InnoDB. */
+UNIV_INTERN
+void
+srv_conc_exit_innodb(
+/*=================*/
+ trx_t* trx); /*!< in: transaction object associated with the
+ thread */
+/***************************************************************//**
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait, trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+srv_suspend_mysql_thread(
+/*=====================*/
+ que_thr_t* thr); /*!< in: query thread associated with the MySQL
+ OS thread */
+/********************************************************************//**
+Releases a MySQL OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr); /*!< in: query thread associated with the
+ MySQL OS thread */
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_thread(
+/*====================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_monitor_thread(
+/*===============*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*************************************************************************
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_error_monitor_thread(
+/*=====================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/*********************************************************************//**
+A thread which restores the buffer pool from a dump file on startup and does
+periodic buffer pool dumps.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_LRU_dump_restore_thread(
+/*====================*/
+ void* arg); /*!< in: a dummy parameter required by
+ os_thread_create */
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for kernel mutex */
+ ulint* trx_start, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end); /*!< out: file position of the end of
+ the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void);
+/*==========================*/
+
+/** Thread slot in the thread table */
+typedef struct srv_slot_struct srv_slot_t;
+
+/** Thread table is an array of slots */
+typedef srv_slot_t srv_table_t;
+
+/** Status variables to be passed to MySQL */
+struct export_var_struct{
+ ulint innodb_data_pending_reads; /*!< Pending reads */
+ ulint innodb_data_pending_writes; /*!< Pending writes */
+ ulint innodb_data_pending_fsyncs; /*!< Pending fsyncs */
+ ulint innodb_data_fsyncs; /*!< Number of fsyncs so far */
+ ulint innodb_data_read; /*!< Data bytes read */
+ ulint innodb_data_writes; /*!< I/O write requests */
+ ulint innodb_data_written; /*!< Data bytes written */
+ ulint innodb_data_reads; /*!< I/O read requests */
+ ulint innodb_dict_tables;
+ ulint innodb_buffer_pool_pages_total; /*!< Buffer pool size */
+ ulint innodb_buffer_pool_pages_data; /*!< Data pages */
+ ulint innodb_buffer_pool_pages_dirty; /*!< Dirty data pages */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+ ulint innodb_buffer_pool_pages_free; /*!< Free pages */
+#ifdef UNIV_DEBUG
+ ulint innodb_buffer_pool_pages_latched; /*!< Latched pages */
+#endif /* UNIV_DEBUG */
+ ulint innodb_buffer_pool_read_requests; /*!< buf_pool->stat.n_page_gets */
+ ulint innodb_buffer_pool_reads; /*!< srv_buf_pool_reads */
+ ulint innodb_buffer_pool_wait_free; /*!< srv_buf_pool_wait_free */
+ ulint innodb_buffer_pool_pages_flushed; /*!< srv_buf_pool_flushed */
+ ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+ ulint innodb_buffer_pool_read_ahead; /*!< srv_read_ahead */
+ ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+ ulint innodb_deadlocks; /* ??? */
+ ulint innodb_dblwr_pages_written; /*!< srv_dblwr_pages_written */
+ ulint innodb_dblwr_writes; /*!< srv_dblwr_writes */
+ ibool innodb_have_atomic_builtins; /*!< HAVE_ATOMIC_BUILTINS */
+ ulint innodb_log_waits; /*!< srv_log_waits */
+ ulint innodb_log_write_requests; /*!< srv_log_write_requests */
+ ulint innodb_log_writes; /*!< srv_log_writes */
+ ulint innodb_os_log_written; /*!< srv_os_log_written */
+ ulint innodb_os_log_fsyncs; /*!< fil_n_log_flushes */
+ ulint innodb_os_log_pending_writes; /*!< srv_os_log_pending_writes */
+ ulint innodb_os_log_pending_fsyncs; /*!< fil_n_pending_log_flushes */
+ ulint innodb_page_size; /*!< UNIV_PAGE_SIZE */
+ ulint innodb_pages_created; /*!< buf_pool->stat.n_pages_created */
+ ulint innodb_pages_read; /*!< buf_pool->stat.n_pages_read */
+ ulint innodb_pages_written; /*!< buf_pool->stat.n_pages_written */
+ ulint innodb_row_lock_waits; /*!< srv_n_lock_wait_count */
+ ulint innodb_row_lock_current_waits; /*!< srv_n_lock_wait_current_count */
+ ib_int64_t innodb_row_lock_time; /*!< srv_n_lock_wait_time
+ / 1000 */
+ ulint innodb_row_lock_time_avg; /*!< srv_n_lock_wait_time
+ / 1000
+ / srv_n_lock_wait_count */
+ ulint innodb_row_lock_time_max; /*!< srv_n_lock_max_wait_time
+ / 1000 */
+ ulint innodb_rows_read; /*!< srv_n_rows_read */
+ ulint innodb_rows_inserted; /*!< srv_n_rows_inserted */
+ ulint innodb_rows_updated; /*!< srv_n_rows_updated */
+ ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */
+};
+
+/** The server system struct */
+struct srv_sys_struct{
+ srv_table_t* threads; /*!< server thread table */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ tasks; /*!< task queue */
+};
+
+extern ulint srv_n_threads_active[];
+#else /* !UNIV_HOTBACKUP */
+# define srv_use_checksums TRUE
+# define srv_use_adaptive_hash_indexes FALSE
+# define srv_force_recovery 0UL
+# define srv_set_io_thread_op_info(t,info) ((void) 0)
+# define srv_is_being_started 0
+# define srv_win_file_flush_method SRV_WIN_IO_UNBUFFERED
+# define srv_unix_file_flush_method SRV_UNIX_O_DSYNC
+# define srv_start_raw_disk_in_use 0
+# define srv_file_per_table 1
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/srv0srv.ic b/storage/xtradb/include/srv0srv.ic
new file mode 100644
index 00000000000..8a1a678a016
--- /dev/null
+++ b/storage/xtradb/include/srv0srv.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.ic
+Server main program
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/srv0start.h b/storage/xtradb/include/srv0start.h
new file mode 100644
index 00000000000..8abf15da9c1
--- /dev/null
+++ b/storage/xtradb/include/srv0start.h
@@ -0,0 +1,134 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0start_h
+#define srv0start_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+ char* str); /*!< in/out: null-terminated character string */
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+ char* str); /*!< in/out: the data file path string */
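+/* For example, an innodb_data_file_path value of
+"ibdata1:10M:autoextend" describes a single data file, ibdata1, with an
+initial size of 10 MB that may grow automatically (illustrative sample
+of the .cnf syntax). */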
+/*********************************************************************//**
+Reads log group home directories from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_log_group_home_dirs(
+/*==========================*/
+ char* str); /*!< in/out: character string */
+/*********************************************************************//**
+Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void);
+/*==========================*/
+/*********************************************************************//**
+Adds a slash or a backslash to the end of a string if it is missing
+and the string is not empty.
+@return string which has the separator if the string is not empty */
+UNIV_INTERN
+char*
+srv_add_path_separator_if_needed(
+/*=============================*/
+ char* str); /*!< in: null-terminated character string */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Starts Innobase and creates a new database if database files
+are not found and the user wants a new database to be created.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_start_or_create_for_mysql(void);
+/*====================================*/
+/****************************************************************//**
+Shuts down the Innobase database.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_shutdown_for_mysql(void);
+/*=============================*/
+/** Log sequence number at shutdown */
+extern ib_uint64_t srv_shutdown_lsn;
+/** Log sequence number immediately after startup */
+extern ib_uint64_t srv_start_lsn;
+
+#ifdef __NETWARE__
+void set_panic_flag_for_netware(void);
+#endif
+
+#ifdef HAVE_DARWIN_THREADS
+/** TRUE if the F_FULLFSYNC option is available */
+extern ibool srv_have_fullfsync;
+#endif
+
+/** TRUE if the server is being started */
+extern ibool srv_is_being_started;
+/** TRUE if the server was successfully started */
+extern ibool srv_was_started;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern ibool srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern ibool srv_start_raw_disk_in_use;
+
+
+/** Shutdown state */
+enum srv_shutdown_state {
+ SRV_SHUTDOWN_NONE = 0, /*!< Database running normally */
+ SRV_SHUTDOWN_CLEANUP, /*!< Cleaning up in
+ logs_empty_and_mark_files_at_shutdown() */
+ SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+ the buffer pool can be freed: flush
+ all file spaces and close all files */
+ SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern enum srv_shutdown_state srv_shutdown_state;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log 'spaces' have id's >= this */
+#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL
+
+#endif
diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h
new file mode 100644
index 00000000000..5f1280f5e28
--- /dev/null
+++ b/storage/xtradb/include/sync0arr.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.h
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+/** Synchronization wait array cell */
+typedef struct sync_cell_struct sync_cell_t;
+/** Synchronization wait array */
+typedef struct sync_array_struct sync_array_t;
+
+/** Parameters for sync_array_create() @{ */
+#define SYNC_ARRAY_OS_MUTEX 1 /*!< protected by os_mutex_t */
+#define SYNC_ARRAY_MUTEX 2 /*!< protected by mutex_t */
+/* @} */
+
+/*******************************************************************//**
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@return own: created wait array */
+UNIV_INTERN
+sync_array_t*
+sync_array_create(
+/*==============*/
+ ulint n_cells, /*!< in: number of cells in the array
+ to create */
+ ulint protection); /*!< in: either SYNC_ARRAY_OS_MUTEX or
+ SYNC_ARRAY_MUTEX: determines the type
+ of mutex protecting the data structure */
+/******************************************************************//**
+Frees the resources in a wait array. */
+UNIV_INTERN
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr); /*!< in, own: sync wait array */
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+UNIV_INTERN
+void
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index); /*!< out: index of the reserved cell */
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index); /*!< in: index of the reserved cell */
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index); /*!< in: index of the cell in array */
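+
+/* Usage sketch (illustrative only): how a thread typically waits on a
+latch through the array. "arr" is the wait array and "lock" the rw-lock
+being waited for; both, and the request type, are assumed to be supplied
+by the caller. sync_array_wait_event() frees the cell when the wait
+ends, so sync_array_free_cell() is only needed if the wait is abandoned
+before calling it.
+
+	ulint	index;
+
+	sync_array_reserve_cell(arr, lock, RW_LOCK_EX,
+				__FILE__, __LINE__, &index);
+	sync_array_wait_event(arr, index);
+*/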
+/**********************************************************************//**
+Note that one of the wait objects was signalled. */
+UNIV_INTERN
+void
+sync_array_object_signalled(
+/*========================*/
+ sync_array_t* arr); /*!< in: wait array */
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about once a second in the server. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void);
+/*====================================*/
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(void);
+/*=============================*/
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr); /*!< in: sync wait array */
+/**********************************************************************//**
+Prints info of the wait array. */
+UNIV_INTERN
+void
+sync_array_print_info(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr); /*!< in: wait array */
+
+
+#ifndef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/sync0arr.ic b/storage/xtradb/include/sync0arr.ic
new file mode 100644
index 00000000000..bf57f5b2dc2
--- /dev/null
+++ b/storage/xtradb/include/sync0arr.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.ic
+The wait array for synchronization primitives
+
+Inline code
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/sync0rw.h b/storage/xtradb/include/sync0rw.h
new file mode 100644
index 00000000000..4edf93f4042
--- /dev/null
+++ b/storage/xtradb/include/sync0rw.h
@@ -0,0 +1,588 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.h
+The read-write lock (for threads, not for database transactions)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "ut0lst.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+
+/* The following undef is to prevent a name conflict with a macro
+in MySQL: */
+#undef rw_lock_t
+#endif /* !UNIV_HOTBACKUP */
+
+/* Latch types; these are used also in btr0btr.h: keep the numerical values
+smaller than 30 and the order of the numerical values like below! */
+#define RW_S_LATCH 1
+#define RW_X_LATCH 2
+#define RW_NO_LATCH 3
+
+#ifndef UNIV_HOTBACKUP
+/* We decrement lock_word by this amount for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. The current value of
+0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
+#define X_LOCK_DECR 0x00100000
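+/* For example, a lock_word of X_LOCK_DECR - 2 (0x000FFFFE) indicates
+two concurrent s-lock holders and no x-lock, while a lock_word of 0
+indicates a single x-lock (illustrative reading of the lock_word
+arithmetic; each s-lock decrements the word by 1). */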
+
+typedef struct rw_lock_struct rw_lock_t;
+#ifdef UNIV_SYNC_DEBUG
+typedef struct rw_lock_debug_struct rw_lock_debug_t;
+#endif /* UNIV_SYNC_DEBUG */
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t) rw_lock_list_t;
+
+extern rw_lock_list_t rw_lock_list;
+extern mutex_t rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+extern mutex_t rw_lock_debug_mutex;
+extern os_event_t rw_lock_debug_event; /*!< If deadlock detection does
+ not get immediately the mutex it
+ may wait for this event */
+extern ibool rw_lock_debug_waiters; /*!< This is set to TRUE, if
+ there may be waiters for the event */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** number of spin waits on rw-latches,
+resulted during shared (read) locks */
+extern ib_int64_t rw_s_spin_wait_count;
+/** number of spin loop rounds on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t rw_s_spin_round_count;
+/** number of unlocks (that unlock shared locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+extern ib_int64_t rw_s_exit_count;
+/** number of OS waits on rw-latches,
+resulted during shared (read) locks */
+extern ib_int64_t rw_s_os_wait_count;
+/** number of spin waits on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t	rw_x_spin_wait_count;
+/** number of spin loop rounds on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t	rw_x_spin_round_count;
+/** number of OS waits on rw-latches,
+resulted during exclusive (write) locks */
+extern ib_int64_t rw_x_os_wait_count;
+/** number of unlocks (that unlock exclusive locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+extern ib_int64_t rw_x_exit_count;
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define rw_lock_create(L, level) \
+ rw_lock_create_func((L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+# define rw_lock_create(L, level) \
+ rw_lock_create_func((L), #L, __FILE__, __LINE__)
+# endif /* UNIV_SYNC_DEBUG */
+#else /* UNIV_DEBUG */
+# define rw_lock_create(L, level) \
+ rw_lock_create_func((L), #L, NULL, 0)
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free(
+/*=========*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock(M) rw_lock_s_lock_func(\
+ (M), 0, __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock_gen(M, P) rw_lock_s_lock_func(\
+ (M), (P), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\
+ (M), 0, (F), (L))
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass __attribute__((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread. If the rw-lock is locked in exclusive mode, or
+there is an exclusive lock request waiting, the function spins a preset
+time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before
+suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+#ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(P, L)
+#else
+# define rw_lock_s_unlock_gen(L, P) rw_lock_s_unlock_func(L)
+#endif
+/*******************************************************************//**
+Releases a shared mode lock. */
+#define rw_lock_s_unlock(L) rw_lock_s_unlock_gen(L, 0)
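+
+/* Minimal usage sketch for the s-latch macros above (illustrative only:
+"my_latch" is a placeholder, and SYNC_NO_ORDER_CHECK, defined in
+sync0sync.h, is just a convenient latching level for the example):
+
+	rw_lock_t	my_latch;
+
+	rw_lock_create(&my_latch, SYNC_NO_ORDER_CHECK);
+
+	rw_lock_s_lock(&my_latch);
+	... read the structure protected by my_latch ...
+	rw_lock_s_unlock(&my_latch);
+
+	rw_lock_free(&my_latch);
+*/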
+
+/**************************************************************//**
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock(M) rw_lock_x_lock_func(\
+ (M), 0, __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_gen(M, P) rw_lock_x_lock_func(\
+ (M), (P), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_nowait(M) rw_lock_x_lock_func_nowait(\
+ (M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+
+#ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(P, L)
+#else
+# define rw_lock_x_unlock_gen(L, P) rw_lock_x_unlock_func(L)
+#endif
+/*******************************************************************//**
+Releases an exclusive mode lock. */
+#define rw_lock_x_unlock(L) rw_lock_x_unlock_gen(L, 0)
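+
+/* Sketch of exclusive latching with the macros above, reusing the
+placeholder latch from the s-latch example (illustrative only):
+
+	rw_lock_x_lock(&my_latch);
+	... modify the structure protected by my_latch ...
+	rw_lock_x_unlock(&my_latch);
+
+	if (rw_lock_x_lock_nowait(&my_latch)) {
+		... x-latch acquired without waiting ...
+		rw_lock_x_unlock(&my_latch);
+	}
+*/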
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line); /*!< in: line where lock requested */
+/******************************************************************//**
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line); /*!< in: line where lock requested */
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want the current thread
+to be able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock); /*!< in: lock which was x-locked in the
+ buffer read */
+/******************************************************************//**
+Releases a shared mode lock when we know there are no waiters and no one
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Releases an exclusive mode lock when we know there are no waiters, and
+no one else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+ rw_lock_t* lock); /*!< in/out: rw-lock */
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return 1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Returns the number of readers.
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock); /*!< in: rw-lock */
+/******************************************************************//**
+Decrements lock_word the specified amount if it is greater than 0.
+This is used by both s_lock and x_lock operations.
+@return TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount); /*!< in: amount to decrement */
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount); /*!< in: amount to increment */
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+ rw_lock_t* lock, /*!< in/out: lock to work on */
+ ibool recursive); /*!< in: TRUE if recursion
+ allowed */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode. */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type); /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+ rw_lock_t* lock); /*!< in: rw-lock */
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file); /*!< in: file where to print */
+/***************************************************************//**
+Returns the number of currently locked rw-locks.
+Works only in the debug version.
+@return number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void);
+/*==================*/
+
+/*#####################################################################*/
+
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void);
+/*==========================*/
+/******************************************************************//**
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void);
+/*==========================*/
+/*********************************************************************//**
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+ rw_lock_debug_t* info); /*!< in: debug struct */
+#endif /* UNIV_SYNC_DEBUG */
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! */
+
+/** The structure used in the spin lock implementation of a read-write
+lock. Several threads may have a shared lock simultaneously in this
+lock, but only one writer may have an exclusive lock, in which case no
+shared locks are allowed. To prevent starving of a writer blocked by
+readers, a writer may queue for x-lock by decrementing lock_word: no
+new readers will be let in while the thread waits for readers to
+exit. */
+struct rw_lock_struct {
+ volatile lint lock_word;
+ /*!< Holds the state of the lock. */
+ volatile ulint waiters;/*!< 1: there are waiters */
+ volatile ibool recursive;/*!< Default value FALSE which means the lock
+ is non-recursive. The value is typically set
+ to TRUE making normal rw_locks recursive. In
+ case of asynchronous IO, when a non-zero
+ value of 'pass' is passed then we keep the
+ lock non-recursive.
+ This flag also tells us about the state of
+ writer_thread field. If this flag is set
+ then writer_thread MUST contain the thread
+ id of the current x-holder or wait-x thread.
+ This flag must be reset in x_unlock
+ functions before incrementing the lock_word */
+ volatile os_thread_id_t writer_thread;
+ /*!< Thread id of writer thread. Is only
+ guaranteed to have sane and non-stale
+ value iff recursive flag is set. */
+ os_event_t event; /*!< Used by sync0arr.c for thread queueing */
+ os_event_t wait_ex_event;
+ /*!< Event for next-writer to wait on. A thread
+ must decrement lock_word before waiting. */
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_t mutex; /*!< The mutex protecting rw_lock_struct */
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ UT_LIST_NODE_T(rw_lock_t) list;
+ /*!< All allocated rw locks are put into a
+ list */
+#ifdef UNIV_SYNC_DEBUG
+ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+ /*!< In the debug version: pointer to the debug
+ info list of the lock */
+ ulint level; /*!< Level in the global latching order. */
+#endif /* UNIV_SYNC_DEBUG */
+ ulint count_os_wait; /*!< Count of os_waits. May not be accurate */
+ //const char* cfile_name;/*!< File name where lock created */
+ const char* lock_name;/*!< lock name */
+ /* last s-lock file/line is not guaranteed to be correct */
+ const char* last_s_file_name;/*!< File name where last s-locked */
+ const char* last_x_file_name;/*!< File name where last x-locked */
+ ibool writer_is_wait_ex;
+ /*!< This is TRUE if the writer field is
+ RW_LOCK_WAIT_EX; this field is located far
+ from the memory update hotspot fields which
+ are at the start of this struct, thus we can
+ peek this field without causing much memory
+ bus traffic */
+ //unsigned cline:14; /*!< Line where created */
+ unsigned last_s_line:14; /*!< Line number where last time s-locked */
+ unsigned last_x_line:14; /*!< Line number where last time x-locked */
+#ifdef UNIV_DEBUG
+ ulint magic_n; /*!< RW_LOCK_MAGIC_N */
+/** Value of rw_lock_struct::magic_n */
+#define RW_LOCK_MAGIC_N 22643
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_SYNC_DEBUG
+/** The structure for storing debug info of an rw-lock */
+struct rw_lock_debug_struct {
+
+ os_thread_id_t thread_id; /*!< The thread id of the thread which
+ locked the rw-lock */
+ ulint pass; /*!< Pass value given in the lock operation */
+ ulint lock_type; /*!< Type of the lock: RW_LOCK_EX,
+ RW_LOCK_SHARED, RW_LOCK_WAIT_EX */
+ const char* file_name;/*!< File name where the lock was obtained */
+ ulint line; /*!< Line where the rw-lock was locked */
+ UT_LIST_NODE_T(rw_lock_debug_t) list;
+ /*!< Debug structs are linked in a two-way
+ list */
+};
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifndef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic
new file mode 100644
index 00000000000..7116f1b7c9b
--- /dev/null
+++ b/storage/xtradb/include/sync0rw.ic
@@ -0,0 +1,624 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.ic
+The read-write lock (for threads)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line); /*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type); /*!< in: lock type */
+#endif /* UNIV_SYNC_DEBUG */
+
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return 1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ return(lock->waiters);
+}
+
+/********************************************************************//**
+Sets lock->waiters to 1. It is not an error if lock->waiters is already
+1. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_set_waiter_flag(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_compare_and_swap_ulint(&lock->waiters, 0, 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lock->waiters = 1;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/********************************************************************//**
+Resets lock->waiters to 0. It is not an error if lock->waiters is already
+0. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_reset_waiter_flag(
+/*======================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_compare_and_swap_ulint(&lock->waiters, 1, 0);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lock->waiters = 0;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_word = lock->lock_word;
+ if (lock_word > 0) {
+ /* return NOT_LOCKED in s-lock state, like the writer
+ member of the old lock implementation. */
+ return(RW_LOCK_NOT_LOCKED);
+ } else if (((-lock_word) % X_LOCK_DECR) == 0) {
+ return(RW_LOCK_EX);
+ } else {
+ ut_ad(lock_word > -X_LOCK_DECR);
+ return(RW_LOCK_WAIT_EX);
+ }
+}
+
+/******************************************************************//**
+Returns the number of readers.
+@return number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_word = lock->lock_word;
+ if (lock_word > 0) {
+ /* s-locked, no x-waiters */
+ return(X_LOCK_DECR - lock_word);
+ } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) {
+ /* s-locked, with x-waiters */
+ return((ulint)(-lock_word));
+ }
+ return(0);
+}
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+UNIV_INLINE
+mutex_t*
+rw_lock_get_mutex(
+/*==============*/
+ rw_lock_t* lock)
+{
+ return(&(lock->mutex));
+}
+#endif
+
+/******************************************************************//**
+Returns the value of writer_count for the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return value of writer_count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+ const rw_lock_t* lock) /*!< in: rw-lock */
+{
+ lint lock_copy = lock->lock_word;
+ /* If there is a reader, lock_word is not divisible by X_LOCK_DECR */
+ if (lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) {
+ return(0);
+ }
+ return(((-lock_copy) / X_LOCK_DECR) + 1);
+}
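+
+/* Worked example (illustrative): with X_LOCK_DECR == 0x00100000,
+lock_word == 0 gives an x-lock count of 1 and lock_word == -X_LOCK_DECR
+gives 2, i.e. one recursive re-entry by the writer thread; if lock_word
+is positive or not a multiple of X_LOCK_DECR, the latch is not held in
+x-mode by a single writer and the function returns 0. */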
+
+/******************************************************************//**
+Two different implementations for decrementing the lock_word of a rw_lock:
+one for systems supporting atomic operations, one for others. This does
+not support recursive x-locks: they should be handled by the caller and
+need not be atomic since they are performed by the current lock holder.
+Returns true if the decrement was made, false if not.
+@return TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount) /*!< in: amount to decrement */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ lint local_lock_word = lock->lock_word;
+ while (local_lock_word > 0) {
+ if (os_compare_and_swap_lint(&lock->lock_word,
+ local_lock_word,
+ local_lock_word - amount)) {
+ return(TRUE);
+ }
+ local_lock_word = lock->lock_word;
+ }
+ return(FALSE);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ ibool success = FALSE;
+ mutex_enter(&(lock->mutex));
+ if (lock->lock_word > 0) {
+ lock->lock_word -= amount;
+ success = TRUE;
+ }
+ mutex_exit(&(lock->mutex));
+ return(success);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ ulint amount) /*!< in: amount of increment */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ return(os_atomic_increment_lint(&lock->lock_word, amount));
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+ lint local_lock_word;
+
+ mutex_enter(&(lock->mutex));
+
+ lock->lock_word += amount;
+ local_lock_word = lock->lock_word;
+
+ mutex_exit(&(lock->mutex));
+
+ return(local_lock_word);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+ rw_lock_t* lock, /*!< in/out: lock to work on */
+ ibool recursive) /*!< in: TRUE if recursion
+ allowed */
+{
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ os_thread_id_t local_thread;
+ ibool success;
+
+ /* Prevent Valgrind warnings about writer_thread being
+ uninitialized. It does not matter if writer_thread is
+ uninitialized, because we are comparing writer_thread against
+ itself, and the operation should always succeed. */
+ UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread);
+
+ local_thread = lock->writer_thread;
+ success = os_compare_and_swap_thread_id(
+ &lock->writer_thread, local_thread, curr_thread);
+ ut_a(success);
+ lock->recursive = recursive;
+
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ mutex_enter(&lock->mutex);
+ lock->writer_thread = curr_thread;
+ lock->recursive = recursive;
+ mutex_exit(&lock->mutex);
+
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass __attribute__((unused)),
+ /*!< in: pass value; != 0, if the lock will be
+ passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */
+ if (!rw_lock_lock_word_decr(lock, 1)) {
+ /* Locking did not succeed */
+ return(FALSE);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
+#endif
+ /* These debugging values are not set safely: they may be incorrect
+ or even refer to a line that is invalid for the file name. */
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+
+ return(TRUE); /* locking succeeded */
+}
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line) /*!< in: line where lock requested */
+{
+ ut_ad(lock->lock_word == X_LOCK_DECR);
+
+ /* Indicate there is a new reader by decrementing lock_word */
+ lock->lock_word--;
+
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line);
+#endif
+}
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+ rw_lock_t* lock, /*!< in/out: rw-lock */
+ const char* file_name, /*!< in: file name where requested */
+ ulint line) /*!< in: line where lock requested */
+{
+ ut_ad(rw_lock_validate(lock));
+ ut_ad(lock->lock_word == X_LOCK_DECR);
+
+ lock->lock_word -= X_LOCK_DECR;
+ lock->writer_thread = os_thread_get_curr_id();
+ lock->recursive = TRUE;
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line;
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in shared mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for
+the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ /* NOTE: As we do not know the thread ids for threads which have
+ s-locked a latch, and s-lockers will be served only after waiting
+ x-lock requests have been fulfilled, then if this thread already
+ owns an s-lock here, it may end up in a deadlock with another thread
+ which requests an x-lock here. Therefore, we will forbid recursive
+ s-locking of a latch: the following assert will warn the programmer
+ of the possibility of this kind of a deadlock. If we want to implement
+ safe recursive s-locking, we should keep in a list the thread ids of
+ the threads which have s-locked a latch. This would use some CPU
+ time. */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */
+ if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+ return;
+ }
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+ ibool success;
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ success = os_compare_and_swap_lint(&lock->lock_word, X_LOCK_DECR, 0);
+#else
+
+ success = FALSE;
+ mutex_enter(&(lock->mutex));
+ if (lock->lock_word == X_LOCK_DECR) {
+ lock->lock_word = 0;
+ success = TRUE;
+ }
+ mutex_exit(&(lock->mutex));
+
+#endif
+ if (success) {
+ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+
+ } else if (lock->recursive
+ && os_thread_eq(lock->writer_thread, curr_thread)) {
+ /* Relock: this lock_word modification is safe since no other
+ threads can modify (lock, unlock, or reserve) lock_word while
+ there is an exclusive writer and this is the writer thread. */
+ lock->lock_word -= X_LOCK_DECR;
+
+ ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0);
+
+ } else {
+ /* Failure */
+ return(FALSE);
+ }
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line;
+
+ ut_ad(rw_lock_validate(lock));
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad((lock->lock_word % X_LOCK_DECR) != 0);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+#endif
+
+ /* Increment lock_word to indicate 1 less reader */
+ if (rw_lock_lock_word_incr(lock, 1) == 0) {
+
+ /* wait_ex waiter exists. It may not be asleep, but we signal
+ anyway. We do not wake other waiters, because they can't
+ exist without wait_ex waiter and wait_ex waiter goes first.*/
+ os_event_set(lock->wait_ex_event);
+ sync_array_object_signalled(sync_primary_wait_array);
+
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases a shared mode lock when we know there are no waiters and no one
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad(lock->lock_word < X_LOCK_DECR);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+#endif
+
+ /* Decrease reader count by incrementing lock_word */
+ lock->lock_word++;
+
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock may have
+ been passed to another thread to unlock */
+#endif
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+ /* lock->recursive flag also indicates if lock->writer_thread is
+ valid or stale. If we are the last of the recursive callers
+ then we must unset lock->recursive flag to indicate that the
+ lock->writer_thread is now stale.
+ Note that since we still hold the x-lock we can safely read the
+ lock_word. */
+ if (lock->lock_word == 0) {
+ /* Last caller in a possible recursive chain. */
+ lock->recursive = FALSE;
+ UNIV_MEM_INVALID(&lock->writer_thread,
+ sizeof lock->writer_thread);
+ }
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+#endif
+
+ if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) {
+ /* Lock is now free. May have to signal read/write waiters.
+ We do not need to signal wait_ex waiters, since they cannot
+ exist when there is a writer. */
+ if (lock->waiters) {
+ rw_lock_reset_waiter_flag(lock);
+ os_event_set(lock->event);
+ sync_array_object_signalled(sync_primary_wait_array);
+ }
+ }
+
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock when we know there are no waiters, and
+no one else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+ rw_lock_t* lock) /*!< in/out: rw-lock */
+{
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+ ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+#endif
+
+ if (lock->lock_word == 0) {
+ lock->recursive = FALSE;
+ UNIV_MEM_INVALID(&lock->writer_thread,
+ sizeof lock->writer_thread);
+ }
+
+ lock->lock_word += X_LOCK_DECR;
+
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+ rw_x_exit_count++;
+#endif
+}
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
new file mode 100644
index 00000000000..a500cf1da45
--- /dev/null
+++ b/storage/xtradb/include/sync0sync.h
@@ -0,0 +1,596 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.h
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0sync_h
+#define sync0sync_h
+
+#include "univ.i"
+#include "sync0types.h"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+#include "os0sync.h"
+#include "sync0arr.h"
+
+#if defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
+extern my_bool timed_mutexes;
+#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+typedef LONG lock_word_t; /*!< On Windows, InterlockedExchange operates
+ on LONG variable */
+#else
+typedef byte lock_word_t;
+#endif
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void);
+/*===========*/
+/******************************************************************//**
+Frees the resources in synchronization data structures. */
+UNIV_INTERN
+void
+sync_close(void);
+/*===========*/
+/******************************************************************//**
+Creates, or rather, initializes a mutex object to a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+# define mutex_create(M, level) \
+ mutex_create_func((M), #M, (level), __FILE__, __LINE__)
+# else
+# define mutex_create(M, level) \
+ mutex_create_func((M), #M, __FILE__, __LINE__)
+# endif
+#else
+# define mutex_create(M, level) \
+ mutex_create_func((M), #M, NULL, 0)
+#endif
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+ mutex_t* mutex, /*!< in: pointer to memory */
+ const char* cmutex_name, /*!< in: mutex name */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline); /*!< in: file line where created */
+
+#undef mutex_free /* Fix for MacOS X */
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free(
+/*=======*/
+ mutex_t* mutex); /*!< in: mutex */
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#define mutex_enter(M) mutex_enter_func((M), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+/* NOTE! currently same as mutex_enter! */
+
+#define mutex_enter_fast(M) mutex_enter_func((M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a mutex for the current thread. If the mutex is reserved
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line); /*!< in: line where locked */
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#define mutex_enter_nowait(M) \
+ mutex_enter_nowait_func((M), __FILE__, __LINE__)
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return 0 if succeed, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+/******************************************************************//**
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+ mutex_t* mutex); /*!< in: pointer to mutex */
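+
+/* Minimal usage sketch for the mutex API above (illustrative only;
+"my_mutex" is a placeholder and SYNC_NO_ORDER_CHECK merely a convenient
+latching level for the example):
+
+	mutex_t	my_mutex;
+
+	mutex_create(&my_mutex, SYNC_NO_ORDER_CHECK);
+
+	mutex_enter(&my_mutex);
+	... critical section ...
+	mutex_exit(&my_mutex);
+
+	if (mutex_enter_nowait(&my_mutex) == 0) {
+		... acquired without waiting ...
+		mutex_exit(&my_mutex);
+	}
+
+	mutex_free(&my_mutex);
+*/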
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked.
+Works only in the debug version.
+@return TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void);
+/*================*/
+#endif /* UNIV_SYNC_DEBUG */
+/*#####################################################################
+FUNCTION PROTOTYPES FOR DEBUGGING */
+/*******************************************************************//**
+Prints wait info of the sync system. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+ FILE* file); /*!< in: file where to print */
+/*******************************************************************//**
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+ FILE* file); /*!< in: file where to print */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex has been initialized.
+@return TRUE */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+ const mutex_t* mutex); /*!< in: mutex */
+/******************************************************************//**
+Checks that the current thread owns the mutex. Works only
+in the debug version.
+@return TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+ const mutex_t* mutex) /*!< in: mutex */
+ __attribute__((warn_unused_result));
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+ void* latch, /*!< in: pointer to a mutex or an rw-lock */
+ ulint level); /*!< in: level in the latching order; if
+ SYNC_LEVEL_VARYING, nothing is done */
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array; it is no error if the latch is
+not found, as we presently are not able to determine the level for
+every latch reservation the program does */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+ void* latch); /*!< in: pointer to a mutex or an rw-lock */
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return TRUE if empty */
+UNIV_INTERN
+ibool
+sync_thread_levels_empty(void);
+/*==========================*/
+/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level); /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+/******************************************************************//**
+Checks if the level array for the current thread is empty.
+@return a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
+ ibool dict_mutex_allowed); /*!< in: TRUE if dictionary mutex is
+ allowed to be owned by the thread,
+ also purge_is_running mutex is
+ allowed */
+#define sync_thread_levels_empty_gen(d) (!sync_thread_levels_nonempty_gen(d))
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+ mutex_t* mutex, /*!< in: mutex */
+ const char** file_name, /*!< out: file where requested */
+ ulint* line, /*!< out: line where requested */
+ os_thread_id_t* thread_id); /*!< out: id of the thread which owns
+ the mutex */
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void);
+/*==================*/
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the value
+of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+ const mutex_t* mutex); /*!< in: mutex */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the waiters
+field in a mutex.
+@return value to set */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ const mutex_t* mutex); /*!< in: mutex */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*
+ LATCHING ORDER WITHIN THE DATABASE
+ ==================================
+
+The mutex or latch in the central memory object, for instance, a rollback
+segment object, must be acquired before acquiring the latch or latches to
+the corresponding file data structure. In the latching order below, these
+file page object latches are placed immediately below the corresponding
+central memory object latch or mutex.
+
+Synchronization object Notes
+---------------------- -----
+
+Dictionary mutex If we have a pointer to a dictionary
+| object, e.g., a table, it can be
+| accessed without reserving the
+| dictionary mutex. We must have a
+| reservation, a memoryfix, to the
+| appropriate table object in this case,
+| and the table must be explicitly
+| released later.
+V
+Dictionary header
+|
+V
+Secondary index tree latch The tree latch protects also all
+| the B-tree non-leaf pages. These
+V can be read with the page only
+Secondary index non-leaf bufferfixed to save CPU time,
+| no s-latch is needed on the page.
+| Modification of a page requires an
+| x-latch on the page, however. If a
+| thread owns an x-latch to the tree,
+| it is allowed to latch non-leaf pages
+| even after it has acquired the fsp
+| latch.
+V
+Secondary index leaf The latch on the secondary index leaf
+| can be kept while accessing the
+| clustered index, to save CPU time.
+V
+Clustered index tree latch To increase concurrency, the tree
+| latch is usually released when the
+| leaf page latch has been acquired.
+V
+Clustered index non-leaf
+|
+V
+Clustered index leaf
+|
+V
+Transaction system header
+|
+V
+Transaction undo mutex The undo log entry must be written
+| before any index page is modified.
+| Transaction undo mutex is for the undo
+| logs the analogue of the tree latch
+| for a B-tree. If a thread has the
+| trx undo mutex reserved, it is allowed
+| to latch the undo log pages in any
+| order, and also after it has acquired
+| the fsp latch.
+V
+Rollback segment mutex The rollback segment mutex must be
+| reserved, if, e.g., a new page must
+| be added to an undo log. The rollback
+| segment and the undo logs in its
+| history list can be seen as an
+| analogue of a B-tree, and the latches
+| reserved similarly, using a version of
+| lock-coupling. If an undo log must be
+| extended by a page when inserting an
+| undo log record, this corresponds to
+| a pessimistic insert in a B-tree.
+V
+Rollback segment header
+|
+V
+Purge system latch
+|
+V
+Undo log pages If a thread owns the trx undo mutex,
+| or for a log in the history list, the
+| rseg mutex, it is allowed to latch
+| undo log pages in any order, and even
+| after it has acquired the fsp latch.
+| If a thread does not have the
+| appropriate mutex, it is allowed to
+| latch only a single undo log page in
+| a mini-transaction.
+V
+File space management latch If a mini-transaction must allocate
+| several file pages, it can do that,
+| because it keeps the x-latch to the
+| file space management in its memo.
+V
+File system pages
+|
+V
+Kernel mutex If a kernel operation needs a file
+| page allocation, it must reserve the
+| fsp x-latch before acquiring the kernel
+| mutex.
+V
+Search system mutex
+|
+V
+Buffer pool mutex
+|
+V
+Log mutex
+|
+V
+Any other latch
+|
+V
+Memory pool mutex */
+
+/* Latching order levels */
+
+/* User transaction locks are higher than any of the latch levels below:
+no latches are allowed when a thread goes to wait for a normal table
+or row lock! */
+#define SYNC_USER_TRX_LOCK 9999
+#define SYNC_NO_ORDER_CHECK 3000 /* this can be used to suppress
+ latching order checking */
+#define SYNC_LEVEL_VARYING 2000 /* Level is varying. Only used with
+ buffer pool page locks, which do not
+ have a fixed level, but instead have
+ their level set after the page is
+ locked; see e.g.
+ ibuf_bitmap_get_map_page(). */
+#define SYNC_TRX_I_S_RWLOCK 1910 /* Used for
+ trx_i_s_cache_t::rw_lock */
+#define SYNC_TRX_I_S_LAST_READ 1900 /* Used for
+ trx_i_s_cache_t::last_read_mutex */
+#define SYNC_FILE_FORMAT_TAG 1200 /* Used to serialize access to the
+ file format tag */
+#define SYNC_DICT_OPERATION 1001 /* table create, drop, etc. reserve
+					this in X-mode; implicit or background
+ operations purge, rollback, foreign
+ key checks reserve this in S-mode */
+#define SYNC_DICT 1000
+#define SYNC_DICT_AUTOINC_MUTEX 999
+#define SYNC_DICT_HEADER 995
+#define SYNC_IBUF_HEADER 914
+#define SYNC_IBUF_PESS_INSERT_MUTEX 912
+#define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below
+ SYNC_FSP_PAGE: we assign a value this
+					high only to make the program pass
+ the debug checks */
+/*-------------------------------*/
+#define SYNC_INDEX_TREE 900
+#define SYNC_TREE_NODE_NEW 892
+#define SYNC_TREE_NODE_FROM_HASH 891
+#define SYNC_TREE_NODE 890
+#define SYNC_PURGE_SYS 810
+#define SYNC_PURGE_LATCH 800
+#define SYNC_TRX_UNDO 700
+#define SYNC_RSEG 600
+#define SYNC_RSEG_HEADER_NEW 591
+#define SYNC_RSEG_HEADER 590
+#define SYNC_TRX_UNDO_PAGE 570
+#define SYNC_EXTERN_STORAGE 500
+#define SYNC_FSP 400
+#define SYNC_FSP_PAGE 395
+/*------------------------------------- Insert buffer headers */
+/*------------------------------------- ibuf_mutex */
+/*------------------------------------- Insert buffer tree */
+#define SYNC_IBUF_BITMAP_MUTEX 351
+#define SYNC_IBUF_BITMAP 350
+/*------------------------------------- MySQL query cache mutex */
+/*------------------------------------- MySQL binlog mutex */
+/*-------------------------------*/
+#define SYNC_KERNEL 300
+#define SYNC_REC_LOCK 299
+#define SYNC_TRX_LOCK_HEAP 298
+#define SYNC_TRX_SYS_HEADER 290
+#define SYNC_LOG 170
+#define SYNC_RECV 168
+#define SYNC_WORK_QUEUE 162
+#define SYNC_SEARCH_SYS_CONF 161 /* for assigning btr_search_enabled */
+#define SYNC_SEARCH_SYS 160 /* NOTE that if we have a memory
+ heap that can be extended to the
+ buffer pool, its logical level is
+ SYNC_SEARCH_SYS, as memory allocation
+ can call routines there! Otherwise
+ the level is SYNC_MEM_HASH. */
+#define SYNC_BUF_LRU_LIST 157
+#define SYNC_BUF_PAGE_HASH 156
+#define SYNC_BUF_BLOCK 155
+#define SYNC_BUF_FREE_LIST 153
+#define SYNC_BUF_ZIP_FREE 152
+#define SYNC_BUF_ZIP_HASH 151
+#define SYNC_BUF_POOL 150
+#define SYNC_BUF_FLUSH_LIST 149
+#define SYNC_DOUBLEWRITE 140
+#define SYNC_ANY_LATCH 135
+#define SYNC_THR_LOCAL 133
+#define SYNC_MEM_HASH 131
+#define SYNC_MEM_POOL 130
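+
+/* The rule these levels encode: roughly, a thread may acquire a latch only
+if its level is lower than the level of every latch it already holds
+(SYNC_NO_ORDER_CHECK and SYNC_LEVEL_VARYING suppress the check, and the
+real code documents further exceptions). The sketch below only illustrates
+that rule; the per-thread array and the function name are hypothetical,
+the real bookkeeping is done by sync_thread_add_level() in sync0sync.c,
+and ut_a() is assumed from ut0dbg.h. */
+
+#define EXAMPLE_MAX_HELD	64
+
+typedef struct example_held_struct {
+	ulint	levels[EXAMPLE_MAX_HELD];/* levels of latches currently held */
+	ulint	n_held;			/* number of latches currently held */
+} example_held_t;
+
+static void
+example_check_and_record_level(example_held_t* held, ulint level)
+{
+	ulint	i;
+
+	if (level != SYNC_NO_ORDER_CHECK && level != SYNC_LEVEL_VARYING) {
+		for (i = 0; i < held->n_held; i++) {
+			/* every latch already held must sit higher
+			in the latching order */
+			ut_a(held->levels[i] > level);
+		}
+	}
+
+	ut_a(held->n_held < EXAMPLE_MAX_HELD);
+	held->levels[held->n_held++] = level;
+}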
+
+/* Codes used to designate lock operations */
+#define RW_LOCK_NOT_LOCKED 350
+#define RW_LOCK_EX 351
+#define RW_LOCK_EXCLUSIVE 351
+#define RW_LOCK_SHARED 352
+#define RW_LOCK_WAIT_EX 353
+#define SYNC_MUTEX 354
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! The structure used in the spin lock
+implementation of a mutual exclusion semaphore. */
+
+/** InnoDB mutex */
+struct mutex_struct {
+ os_event_t event; /*!< Used by sync0arr.c for the wait queue */
+ volatile lock_word_t lock_word; /*!< lock_word is the target
+ of the atomic test-and-set instruction when
+ atomic operations are enabled. */
+
+#if !defined(HAVE_ATOMIC_BUILTINS)
+ os_fast_mutex_t
+ os_fast_mutex; /*!< We use this OS mutex in place of lock_word
+ when atomic operations are not enabled */
+#endif
+ volatile ulint waiters; /*!< This ulint is set to 1 if there are (or
+ may be) threads waiting in the global wait
+ array for this mutex to be released.
+ Otherwise, this is 0. */
+ UT_LIST_NODE_T(mutex_t) list; /*!< All allocated mutexes are put into
+ a list. Pointers to the next and prev. */
+#ifdef UNIV_SYNC_DEBUG
+ const char* file_name; /*!< File where the mutex was locked */
+ ulint line; /*!< Line where the mutex was locked */
+ ulint level; /*!< Level in the global latching order */
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_DEBUG
+ const char* cfile_name;/*!< File name where mutex created */
+ ulint cline; /*!< Line where created */
+ os_thread_id_t thread_id; /*!< The thread id of the thread
+ which locked the mutex. */
+ ulint magic_n; /*!< MUTEX_MAGIC_N */
+/** Value of mutex_struct::magic_n */
+# define MUTEX_MAGIC_N (ulint)979585
+#endif /* UNIV_DEBUG */
+ ulong count_os_wait; /*!< count of os_wait */
+#ifdef UNIV_DEBUG
+ ulong count_using; /*!< count of times mutex used */
+ ulong count_spin_loop; /*!< count of spin loops */
+ ulong count_spin_rounds;/*!< count of spin rounds */
+	ulong		count_os_yield;	/*!< count of os_yield calls */
+ ulonglong lspent_time; /*!< mutex os_wait timer msec */
+ ulonglong lmax_spent_time;/*!< mutex os_wait timer msec */
+ ulint mutex_type; /*!< 0=usual mutex, 1=rw_lock mutex */
+#endif /* UNIV_DEBUG */
+ const char* cmutex_name; /*!< mutex name */
+};
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks. */
+extern sync_array_t* sync_primary_wait_array;/* Appears here for
+ debugging purposes only! */
+
+/** Constant determining how long spin wait is continued before suspending
+the thread. A value of 600 rounds on a 1995 100 MHz Pentium seems to correspond
+to 20 microseconds. */
+
+#define SYNC_SPIN_ROUNDS srv_n_spin_wait_rounds
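+
+/* A simplified outline of the policy this constant bounds, using the
+lock-word accessors defined in sync0sync.ic: spin while the mutex looks
+taken, retry the test-and-set, and give up the time slice once the spin
+budget is spent. The real logic is mutex_spin_wait() in sync0sync.c, which
+additionally registers the thread in the wait array and blocks on the mutex
+event instead of only yielding. UT_RELAX_CPU() is assumed from ut0ut.h,
+os_thread_yield() from os0thread.h; the function name is illustrative. */
+
+static void
+example_spin_then_yield(mutex_t* mutex)
+{
+	ulint	rounds = 0;
+
+	for (;;) {
+		/* spin while the mutex looks taken, up to the limit */
+		while (mutex_get_lock_word(mutex) != 0
+		       && rounds < SYNC_SPIN_ROUNDS) {
+			UT_RELAX_CPU();
+			rounds++;
+		}
+
+		if (!mutex_test_and_set(mutex)) {
+
+			return;	/* the previous lock_word was 0:
+				the mutex is now ours */
+		}
+
+		if (rounds >= SYNC_SPIN_ROUNDS) {
+			/* spinning is no longer cheap: give up the
+			time slice before trying again */
+			os_thread_yield();
+			rounds = 0;
+		}
+	}
+}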
+
+/** The number of mutex_exit calls. Intended for performance monitoring. */
+extern ib_int64_t mutex_exit_count;
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+extern ibool sync_order_checks_on;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** This variable is set to TRUE when sync_init is called */
+extern ibool sync_initialized;
+
+/** Global list of database mutexes (not OS mutexes) created. */
+typedef UT_LIST_BASE_NODE_T(mutex_t) ut_list_base_node_t;
+/** Global list of database mutexes (not OS mutexes) created. */
+extern ut_list_base_node_t mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+extern mutex_t mutex_list_mutex;
+
+
+#ifndef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/sync0sync.ic b/storage/xtradb/include/sync0sync.ic
new file mode 100644
index 00000000000..b05020b5660
--- /dev/null
+++ b/storage/xtradb/include/sync0sync.ic
@@ -0,0 +1,222 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.ic
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+ mutex_t* mutex, /*!< in: mutex */
+ ulint n); /*!< in: value to set */
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line); /*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+ mutex_t* mutex, /*!< in: mutex */
+ const char* file_name, /*!< in: file where requested */
+ ulint line); /*!< in: line where requested */
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+ mutex_t* mutex); /*!< in: mutex */
+
+/******************************************************************//**
+Performs an atomic test-and-set instruction to the lock_word field of a
+mutex.
+@return the previous value of lock_word: 0 or 1 */
+UNIV_INLINE
+byte
+mutex_test_and_set(
+/*===============*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ return(os_atomic_test_and_set_byte(&mutex->lock_word, 1));
+#else
+ ibool ret;
+
+ ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex));
+
+ if (ret == 0) {
+ /* We check that os_fast_mutex_trylock does not leak
+ and allow race conditions */
+ ut_a(mutex->lock_word == 0);
+
+ mutex->lock_word = 1;
+ }
+
+ return((byte)ret);
+#endif
+}
+
+/******************************************************************//**
+Performs a reset instruction to the lock_word field of a mutex. This
+instruction also serializes memory operations to the program order. */
+UNIV_INLINE
+void
+mutex_reset_lock_word(
+/*==================*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ /* In theory __sync_lock_release should be used to release the lock.
+ Unfortunately, it does not work properly alone. The workaround is
+ that more conservative __sync_lock_test_and_set is used instead. */
+ os_atomic_test_and_set_byte(&mutex->lock_word, 0);
+#else
+ mutex->lock_word = 0;
+
+ os_fast_mutex_unlock(&(mutex->os_fast_mutex));
+#endif
+}
+
+/******************************************************************//**
+Gets the value of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+ const mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex);
+
+ return(mutex->lock_word);
+}
+
+/******************************************************************//**
+Gets the waiters field in a mutex.
+@return current value of the waiters field */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+ const mutex_t* mutex) /*!< in: mutex */
+{
+ const volatile ulint* ptr; /*!< declared volatile to ensure that
+ the value is read from memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->waiters);
+
+ return(*ptr); /* Here we assume that the read of a single
+ word from memory is atomic */
+}
+
+/******************************************************************//**
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+ mutex_t* mutex) /*!< in: pointer to mutex */
+{
+ ut_ad(mutex_own(mutex));
+
+ ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_reset_level(mutex);
+#endif
+ mutex_reset_lock_word(mutex);
+
+	/* A problem: we assume that mutex_reset_lock_word
+	is a memory barrier, that is, when we read the waiters
+	field next, the read must be serialized in memory
+	after the reset. A speculative processor might
+	perform the read first, which could leave a waiting
+	thread hanging indefinitely.
+
+	Our current solution is to call
+	sync_arr_wake_threads_if_sema_free() once per second
+	to wake up possible hanging threads if
+	they were missed in mutex_signal_object(). */
+
+ if (mutex_get_waiters(mutex) != 0) {
+
+ mutex_signal_object(mutex);
+ }
+
+#ifdef UNIV_SYNC_PERF_STAT
+ mutex_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Locks a mutex for the current thread. If the mutex is reserved, the function
+spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
+before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where locked */
+ ulint line) /*!< in: line where locked */
+{
+ ut_ad(mutex_validate(mutex));
+ ut_ad(!mutex_own(mutex));
+
+ /* Note that we do not peek at the value of lock_word before trying
+ the atomic test_and_set; we could peek, and possibly save time. */
+
+ ut_d(mutex->count_using++);
+
+ if (!mutex_test_and_set(mutex)) {
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+ return; /* Succeeded! */
+ }
+
+ mutex_spin_wait(mutex, file_name, line);
+}
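+
+/* A minimal usage sketch: the usual pairing is the mutex_enter() macro
+(declared earlier in sync0sync.h, it passes __FILE__ and __LINE__ to
+mutex_enter_func()) followed by mutex_exit(). The counter, the mutex and
+the function name below are illustrative; the mutex is assumed to have
+been set up elsewhere with mutex_create() during startup. */
+
+static mutex_t	example_mutex;		/* created during startup */
+static ulint	example_counter;	/* protected by example_mutex */
+
+static void
+example_increment_counter(void)
+{
+	mutex_enter(&example_mutex);
+
+	example_counter++;
+
+	mutex_exit(&example_mutex);
+}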
diff --git a/storage/xtradb/include/sync0types.h b/storage/xtradb/include/sync0types.h
new file mode 100644
index 00000000000..1911bbac7fd
--- /dev/null
+++ b/storage/xtradb/include/sync0types.h
@@ -0,0 +1,34 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0types.h
+Global types for sync
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0types_h
+#define sync0types_h
+
+/** Rename mutex_t to avoid name space collision on some systems */
+#define mutex_t ib_mutex_t
+/** InnoDB mutex */
+typedef struct mutex_struct mutex_t;
+
+#endif
diff --git a/storage/xtradb/include/thr0loc.h b/storage/xtradb/include/thr0loc.h
new file mode 100644
index 00000000000..293d1ebd57f
--- /dev/null
+++ b/storage/xtradb/include/thr0loc.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/thr0loc.h
+The thread local storage
+
+Created 10/5/1995 Heikki Tuuri
+*******************************************************/
+
+/* This module implements storage private to each thread,
+a capability useful in some situations like storing the
+OS handle to the current thread, or its priority. */
+
+#ifndef thr0loc_h
+#define thr0loc_h
+
+#include "univ.i"
+#include "os0thread.h"
+
+/****************************************************************//**
+Initializes the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_init(void);
+/*================*/
+/****************************************************************//**
+Close the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_close(void);
+/*=================*/
+/*******************************************************************//**
+Creates a local storage struct for the calling new thread. */
+UNIV_INTERN
+void
+thr_local_create(void);
+/*==================*/
+/*******************************************************************//**
+Frees the local storage struct for the specified thread. */
+UNIV_INTERN
+void
+thr_local_free(
+/*===========*/
+ os_thread_id_t id); /*!< in: thread id */
+/*******************************************************************//**
+Gets the slot number in the thread table of a thread.
+@return slot number */
+UNIV_INTERN
+ulint
+thr_local_get_slot_no(
+/*==================*/
+ os_thread_id_t id); /*!< in: thread id of the thread */
+/*******************************************************************//**
+Sets in the local storage the slot number in the thread table of a thread. */
+UNIV_INTERN
+void
+thr_local_set_slot_no(
+/*==================*/
+ os_thread_id_t id, /*!< in: thread id of the thread */
+ ulint slot_no);/*!< in: slot number */
+/*******************************************************************//**
+Returns pointer to the 'in_ibuf' field within the current thread local
+storage.
+@return pointer to the in_ibuf field */
+UNIV_INTERN
+ibool*
+thr_local_get_in_ibuf_field(void);
+/*=============================*/
+
+/*************************************************************************
+Returns local hash table information. */
+
+ulint
+thr_local_hash_cells(void);
+/*=======================*/
+
+ulint
+thr_local_hash_nodes(void);
+/*=======================*/
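+
+/* A sketch of the intended call pattern: each thread creates its local
+storage when it starts, records its slot in the server thread table, and
+frees the storage before it exits. The slot value and the function name
+are illustrative only; ut_ad() is assumed from ut0dbg.h. */
+
+static void
+example_thread_local_usage(void)
+{
+	os_thread_id_t	id = os_thread_get_curr_id();
+
+	thr_local_create();		/* allocate storage for this thread */
+
+	thr_local_set_slot_no(id, 7);	/* remember our slot number */
+	ut_ad(thr_local_get_slot_no(id) == 7);
+
+	/* ... the thread does its work ... */
+
+	thr_local_free(id);		/* release the storage on exit */
+}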
+
+#ifndef UNIV_NONINL
+#include "thr0loc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/thr0loc.ic b/storage/xtradb/include/thr0loc.ic
new file mode 100644
index 00000000000..ce44e512320
--- /dev/null
+++ b/storage/xtradb/include/thr0loc.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/thr0loc.ic
+Thread local storage
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/trx0i_s.h b/storage/xtradb/include/trx0i_s.h
new file mode 100644
index 00000000000..7bd4e1b88c8
--- /dev/null
+++ b/storage/xtradb/include/trx0i_s.h
@@ -0,0 +1,247 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "ut0ut.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT 16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN 8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN 1024
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+typedef struct i_s_locks_row_struct i_s_locks_row_t;
+/** A row of INFORMATION_SCHEMA.innodb_trx */
+typedef struct i_s_trx_row_struct i_s_trx_row_t;
+/** A row of INFORMATION_SCHEMA.innodb_lock_waits */
+typedef struct i_s_lock_waits_row_struct i_s_lock_waits_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+typedef struct i_s_hash_chain_struct i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_struct {
+ i_s_locks_row_t* value; /*!< row of
+ INFORMATION_SCHEMA.innodb_locks*/
+ i_s_hash_chain_t* next; /*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_struct {
+ ullint lock_trx_id; /*!< transaction identifier */
+ const char* lock_mode; /*!< lock mode from
+ lock_get_mode_str() */
+ const char* lock_type; /*!< lock type from
+ lock_get_type_str() */
+ const char* lock_table; /*!< table name from
+ lock_get_table_name() */
+ const char* lock_index; /*!< index name from
+ lock_rec_get_index_name() */
+ /** Information for record locks. All these are
+ ULINT_UNDEFINED for table locks. */
+ /* @{ */
+ ulint lock_space; /*!< tablespace identifier */
+	ulint		lock_page;	/*!< page number within the tablespace */
+ ulint lock_rec; /*!< heap number of the record
+ on the page */
+ const char* lock_data; /*!< (some) content of the record */
+ /* @} */
+
+ /** The following are auxiliary and not included in the table */
+ /* @{ */
+ ullint lock_table_id;
+ /*!< table identifier from
+ lock_get_table_id */
+ i_s_hash_chain_t hash_chain; /*!< hash table chain node for
+ trx_i_s_cache_t::locks_hash */
+ /* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_struct {
+ ullint trx_id; /*!< transaction identifier */
+ const char* trx_state; /*!< transaction state from
+ trx_get_que_state_str() */
+ ib_time_t trx_started; /*!< trx_struct::start_time */
+ const i_s_locks_row_t* requested_lock_row;
+ /*!< pointer to a row
+ in innodb_locks if trx
+ is waiting, or NULL */
+ ib_time_t trx_wait_started;
+ /*!< trx_struct::wait_started */
+ ullint trx_weight; /*!< TRX_WEIGHT() */
+ ulint trx_mysql_thread_id;
+ /*!< thd_get_thread_id() */
+ const char* trx_query; /*!< MySQL statement being
+ executed in the transaction */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_struct {
+ const i_s_locks_row_t* requested_lock_row; /*!< requested lock */
+ const i_s_locks_row_t* blocking_lock_row; /*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+typedef struct trx_i_s_cache_struct trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+ I_S_INNODB_TRX, /*!< INFORMATION_SCHEMA.innodb_trx */
+ I_S_INNODB_LOCKS, /*!< INFORMATION_SCHEMA.innodb_locks */
+ I_S_INNODB_LOCK_WAITS /*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t* trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table); /*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n); /*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return 0 if new data was fetched, 1 if not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache); /*!< in/out: cache */
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache); /*!< in: cache */
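+
+/* A consumer-side sketch, loosely modeled on handler/i_s.cc: refresh the
+cache under the write lock, then walk the cached innodb_trx rows under the
+read lock. Only the locking and row-access calls above are real; the
+function name and the row handling are placeholders. */
+
+static void
+example_walk_innodb_trx_rows(void)
+{
+	ulint	i;
+	ulint	n_rows;
+
+	/* bring the cache up to date if it is stale */
+	trx_i_s_cache_start_write(trx_i_s_cache);
+	trx_i_s_possibly_fetch_data_into_cache(trx_i_s_cache);
+	trx_i_s_cache_end_write(trx_i_s_cache);
+
+	trx_i_s_cache_start_read(trx_i_s_cache);
+
+	if (trx_i_s_cache_is_truncated(trx_i_s_cache)) {
+		/* more than TRX_I_S_MEM_LIMIT bytes were needed;
+		the rows below are only a prefix of the full data */
+	}
+
+	n_rows = trx_i_s_cache_get_rows_used(trx_i_s_cache, I_S_INNODB_TRX);
+
+	for (i = 0; i < n_rows; i++) {
+		i_s_trx_row_t*	row = (i_s_trx_row_t*)
+			trx_i_s_cache_get_nth_row(
+				trx_i_s_cache, I_S_INNODB_TRX, i);
+
+		/* ... copy the fields of *row into the result set ... */
+		(void) row;
+	}
+
+	trx_i_s_cache_end_read(trx_i_s_cache);
+}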
+
+/** The maximum length of a lock id produced by
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN (TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from a i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size);/*!< in: size of the lock id
+ buffer */
+
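+/* A caller-side sketch of the sizing rule above: a buffer of
+TRX_I_S_LOCK_ID_MAX_LEN + 1 bytes is always large enough, so the call
+cannot abort. The row is assumed to come from the innodb_locks part of
+the cache; the function name is illustrative. */
+
+static void
+example_print_lock_id(const i_s_locks_row_t* row)
+{
+	char	lock_id[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+
+	trx_i_s_create_lock_id(row, lock_id, sizeof(lock_id));
+
+	fprintf(stderr, "lock id: %s\n", lock_id);
+}
+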
+#endif /* trx0i_s_h */
diff --git a/storage/xtradb/include/trx0purge.h b/storage/xtradb/include/trx0purge.h
new file mode 100644
index 00000000000..ae5bc6f90be
--- /dev/null
+++ b/storage/xtradb/include/trx0purge.h
@@ -0,0 +1,213 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0purge_h
+#define trx0purge_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "que0types.h"
+#include "page0page.h"
+#include "usr0sess.h"
+#include "fil0fil.h"
+
+/** The global data structure coordinating a purge */
+extern trx_purge_t* purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+extern trx_undo_rec_t trx_purge_dummy_rec;
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ fil_addr_t node_addr); /*!< in: file address of the history
+ list node of the log */
+/*****************************************************************//**
+Checks if trx_id is >= purge_view: then it is guaranteed that its update
+undo log still exists in the system.
+@return TRUE if it is sure that it is preserved; even if the function
+returns FALSE, it is possible that the undo log still exists in the
+system */
+UNIV_INTERN
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+ trx_id_t trx_id);/*!< in: transaction id */
+/********************************************************************//**
+Creates the global purge system control structure and inits the history
+mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(void);
+/*======================*/
+/********************************************************************//**
+Frees the global purge system control structure. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void);
+/*======================*/
+/************************************************************************
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /*!< in: transaction */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ trx_undo_inf_t** cell, /*!< out: storage cell for the record in the
+ purge array */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/*******************************************************************//**
+Releases a reserved purge undo record. */
+UNIV_INTERN
+void
+trx_purge_rec_release(
+/*==================*/
+ trx_undo_inf_t* cell); /*!< in: storage cell */
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(void);
+/*===========*/
+/**********************************************************************
+This function runs a purge worker batch */
+UNIV_INTERN
+void
+trx_purge_worker(
+/*=============*/
+ ulint worker_id);
+/**********************************************************************
+This function waits on the worker batch event. */
+UNIV_INTERN
+void
+trx_purge_worker_wait(void);
+/*========================*/
+/**********************************************************************
+This function wakes the waiting worker batch */
+UNIV_INTERN
+void
+trx_purge_worker_wake(void);
+/*========================*/
+/******************************************************************//**
+Prints information of the purge system to stderr. */
+UNIV_INTERN
+void
+trx_purge_sys_print(void);
+/*======================*/
+
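+/* A simplified outline of how the purge driver (srv_master_thread in
+srv0srv.c) is expected to use trx_purge(): call it repeatedly, each call
+purging one batch, until a batch handles no undo log pages. The real
+driver adds sleeping and load-based throttling; the function name below
+is illustrative. */
+
+static void
+example_purge_until_empty(void)
+{
+	ulint	n_pages;
+
+	do {
+		/* one batch; returns the number of undo log pages handled */
+		n_pages = trx_purge();
+	} while (n_pages > 0);
+}
+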
+/** The control structure used in the purge operation */
+struct trx_purge_struct{
+ ulint state; /*!< Purge system state */
+ sess_t* sess; /*!< System session running the purge
+ query */
+ trx_t* trx; /*!< System transaction running the purge
+ query: this trx is not in the trx list
+ of the trx system and it never ends */
+ que_t* query; /*!< The query graph which will do the
+ parallelized purge operation */
+	ulint		n_worker;	/*!< number of purge worker threads */
+	os_event_t	worker_event;	/*!< event the purge workers wait on */
+	sess_t**	sess_arr;	/*!< sessions of the worker threads */
+	trx_t**		trx_arr;	/*!< transactions of the worker threads */
+	que_t**		query_arr;	/*!< query graphs of the worker threads */
+ rw_lock_t latch; /*!< The latch protecting the purge view.
+ A purge operation must acquire an
+ x-latch here for the instant at which
+ it changes the purge view: an undo
+ log operation can prevent this by
+ obtaining an s-latch here. */
+ read_view_t* view; /*!< The purge will not remove undo logs
+ which are >= this view (purge view) */
+ mutex_t mutex; /*!< Mutex protecting the fields below */
+ ulint n_pages_handled;/*!< Approximate number of undo log
+ pages processed in purge */
+ ulint handle_limit; /*!< Target of how many pages to get
+ processed in the current purge */
+ /*------------------------------*/
+ /* The following two fields form the 'purge pointer' which advances
+ during a purge, and which is used in history list truncation */
+
+ trx_id_t purge_trx_no; /*!< Purge has advanced past all
+ transactions whose number is less
+ than this */
+ undo_no_t purge_undo_no; /*!< Purge has advanced past all records
+ whose undo number is less than this */
+ /*-----------------------------*/
+ ibool next_stored; /*!< TRUE if the info of the next record
+ to purge is stored below: if yes, then
+ the transaction number and the undo
+ number of the record are stored in
+ purge_trx_no and purge_undo_no above */
+ trx_rseg_t* rseg; /*!< Rollback segment for the next undo
+ record to purge */
+ ulint page_no; /*!< Page number for the next undo
+ record to purge, page number of the
+ log header, if dummy record */
+ ulint offset; /*!< Page offset for the next undo
+ record to purge, 0 if the dummy
+ record */
+ ulint hdr_page_no; /*!< Header page of the undo log where
+ the next record to purge belongs */
+ ulint hdr_offset; /*!< Header byte offset on the page */
+ /*-----------------------------*/
+ trx_undo_arr_t* arr; /*!< Array of transaction numbers and
+ undo numbers of the undo records
+ currently under processing in purge */
+ mem_heap_t* heap; /*!< Temporary storage used during a
+ purge: can be emptied after purge
+ completes */
+};
+
+#define TRX_PURGE_ON 1 /* purge operation is running */
+#define TRX_STOP_PURGE 2 /* purge operation is stopped, or
+ it should be stopped */
+#ifndef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0purge.ic b/storage/xtradb/include/trx0purge.ic
new file mode 100644
index 00000000000..de09e393654
--- /dev/null
+++ b/storage/xtradb/include/trx0purge.ic
@@ -0,0 +1,43 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.ic
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+ fil_addr_t node_addr) /*!< in: file address of the history
+ list node of the log */
+{
+ node_addr.boffset -= TRX_UNDO_HISTORY_NODE;
+
+ return(node_addr);
+}
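+
+/* The subtraction works because the history list node is embedded at byte
+offset TRX_UNDO_HISTORY_NODE inside the undo log header, so stepping back
+by that offset turns the node's file address into the header's file
+address. Below is a round-trip illustration; the function name is
+illustrative and ut_ad() is assumed from ut0dbg.h. */
+
+static void
+example_hist_node_round_trip(
+	fil_addr_t	hdr_addr)	/* file address of an undo log header */
+{
+	fil_addr_t	node_addr = hdr_addr;
+	fil_addr_t	back;
+
+	/* the address of the embedded history list node */
+	node_addr.boffset += TRX_UNDO_HISTORY_NODE;
+
+	back = trx_purge_get_log_from_hist(node_addr);
+
+	ut_ad(back.page == hdr_addr.page);
+	ut_ad(back.boffset == hdr_addr.boffset);
+}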
+
diff --git a/storage/xtradb/include/trx0rec.h b/storage/xtradb/include/trx0rec.h
new file mode 100644
index 00000000000..a6e56e963c6
--- /dev/null
+++ b/storage/xtradb/include/trx0rec.h
@@ -0,0 +1,338 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+#include "data0data.h"
+#include "rem0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "que0types.h"
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap); /*!< in: heap where copied */
+/**********************************************************************//**
+Reads the undo log record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec); /*!< in: undo log record */
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+ undo_no_t undo_no) /*!< in: undo no read from node */
+ __attribute__((const));
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no) \
+ ((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ ibool* updated_extern, /*!< out: TRUE if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ dulint* table_id); /*!< out: table id */
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** ref, /*!< out, own: row reference */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index); /*!< in: clustered index */
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ ulint* info_bits); /*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ ulint info_bits,/*!< in: info bits from this undo record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd); /*!< out, own: update vector */
+/*******************************************************************//**
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering in any index of the table.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap); /*!< in: memory heap from which the memory
+ needed is allocated */
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ roll_ptr_t* roll_ptr); /*!< out: rollback pointer to the
+ inserted undo log record,
+ ut_dulint_zero if BTR_NO_UNDO_LOG
+ flag was specified */
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/******************************************************************//**
+Copies an undo record to heap.
+
+NOTE: the caller must have latches on the clustered index page and
+purge_view.
+
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
+truncated and we cannot fetch the old version */
+UNIV_INTERN
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ trx_id_t trx_id, /*!< in: id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction */
+ trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/*******************************************************************//**
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked.
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
+earlier than purge_view, which means that it may have been removed,
+DB_ERROR if corrupted record */
+UNIV_INTERN
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec,/*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr,/*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers);/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page); /*!< in: page or NULL */
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+
+#ifndef UNIV_HOTBACKUP
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+#define TRX_UNDO_INSERT_REC 11 /* fresh insert into clustered index */
+#define TRX_UNDO_UPD_EXIST_REC 12 /* update of a non-delete-marked
+ record */
+#define TRX_UNDO_UPD_DEL_REC 13 /* update of a delete marked record to
+ a not delete marked record; also the
+ fields of the record can change */
+#define TRX_UNDO_DEL_MARK_REC 14 /* delete marking of a record; fields
+ do not change */
+#define TRX_UNDO_CMPL_INFO_MULT 16 /* compilation info is multiplied by
+ this and ORed to the type above */
+#define TRX_UNDO_UPD_EXTERN 128 /* This bit can be ORed to type_cmpl
+ to denote that we updated external
+ storage fields: used by purge to
+ free the external storage */
+
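+/* The byte at offset 2 of an undo log record packs all of the above: the
+record type occupies the low four bits (every type value is < 16), the
+compiler info is multiplied by TRX_UNDO_CMPL_INFO_MULT and added on top,
+and TRX_UNDO_UPD_EXTERN may be ORed in as the high bit. The sketch below
+is a hypothetical round trip of that packing; the values and the function
+name are illustrative, and ut_ad() is assumed from ut0dbg.h. */
+
+static void
+example_type_cmpl_round_trip(void)
+{
+	ulint	type	  = TRX_UNDO_UPD_EXIST_REC;
+	ulint	cmpl_info = 3;	/* must be < 8 to stay below the high bit */
+	ulint	type_cmpl;
+
+	type_cmpl = (type + cmpl_info * TRX_UNDO_CMPL_INFO_MULT)
+		| TRX_UNDO_UPD_EXTERN;
+
+	/* taking the byte apart again */
+	ut_ad(type_cmpl & TRX_UNDO_UPD_EXTERN);
+
+	type_cmpl -= TRX_UNDO_UPD_EXTERN;
+
+	ut_ad((type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1)) == type);
+	ut_ad(type_cmpl / TRX_UNDO_CMPL_INFO_MULT == cmpl_info);
+}
+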
+/* Operation type flags used in trx_undo_report_row_operation */
+#define TRX_UNDO_INSERT_OP 1
+#define TRX_UNDO_MODIFY_OP 2
+
+#ifndef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif /* trx0rec_h */
diff --git a/storage/xtradb/include/trx0rec.ic b/storage/xtradb/include/trx0rec.ic
new file mode 100644
index 00000000000..e7e41d6d9f6
--- /dev/null
+++ b/storage/xtradb/include/trx0rec.ic
@@ -0,0 +1,112 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.ic
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads from an undo log record the record type.
+@return record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1));
+}
+
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT);
+}
+
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ if (mach_read_from_1(undo_rec + 2) & TRX_UNDO_UPD_EXTERN) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/**********************************************************************//**
+Reads the undo log record number.
+@return undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+ const trx_undo_rec_t* undo_rec) /*!< in: undo log record */
+{
+ const byte* ptr;
+
+ ptr = undo_rec + 3;
+
+ return(mach_dulint_read_much_compressed(ptr));
+}
+
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+ undo_no_t undo_no) /*!< in: undo no read from node */
+{
+ return (3 + mach_dulint_get_much_compressed_size(undo_no));
+}
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+ const trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ mem_heap_t* heap) /*!< in: heap where copied */
+{
+ ulint len;
+
+ len = mach_read_from_2(undo_rec)
+ - ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
+ return(mem_heap_dup(heap, undo_rec, len));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0roll.h b/storage/xtradb/include/trx0roll.h
new file mode 100644
index 00000000000..1dee5655c8c
--- /dev/null
+++ b/storage/xtradb/include/trx0roll.h
@@ -0,0 +1,352 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "univ.i"
+#include "trx0trx.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+#define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL)
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ const trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx); /*!< in: transaction */
+/*******************************************************************//**
+Creates an undo number array. */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void);
+/*=====================*/
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr); /*!< in: undo number array */
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ ulint n); /*!< in: position */
+/***********************************************************************//**
+Tries to truncate the undo logs. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx); /*!< in/out: transaction */
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record into the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t limit, /*!< in: least undo number we need */
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ mem_heap_t* heap); /*!< in: memory heap where copied */
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread obtains the undo log record by some means other
+than the pop function above.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no);/*!< in: undo number of the record */
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no);/*!< in: undo number */
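+
+/* The functions above form a small protocol during rollback: a record is
+popped (which also reserves its undo number), processed, and then released.
+A minimal illustrative sketch of that loop follows; it is not the actual
+row-undo code, and apply_undo_rec() is a hypothetical placeholder for the
+work done on each record. */
+
+UNIV_INLINE
+void
+roll_back_to_limit_sketch(
+	trx_t*		trx,	/*!< in/out: transaction being rolled back */
+	undo_no_t	limit)	/*!< in: least undo number to undo */
+{
+	mem_heap_t*	heap	= mem_heap_create(512);
+	roll_ptr_t	roll_ptr;
+	trx_undo_rec_t*	rec;
+
+	while ((rec = trx_roll_pop_top_rec_of_trx(trx, limit,
+						  &roll_ptr, heap)) != NULL) {
+		undo_no_t	undo_no = trx_undo_rec_get_undo_no(rec);
+
+		apply_undo_rec(rec, roll_ptr);	/* hypothetical helper */
+
+		/* the pop reserved the undo number; release it once the
+		record has been processed */
+		trx_undo_rec_release(trx, undo_no);
+	}
+
+	mem_heap_free(heap);
+}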
+/*********************************************************************//**
+Starts a rollback operation. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ trx_sig_t* sig, /*!< in: signal starting the rollback */
+ que_thr_t** next_thr);/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+ ibool all); /*!< in: FALSE=roll back dictionary transactions;
+ TRUE=roll back all non-PREPARED transactions */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+ void* arg __attribute__((unused)));
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+/****************************************************************//**
+Finishes a transaction rollback. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+ que_t* graph, /*!< in: undo graph which can now be freed */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr);/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if this parameter is
+ NULL, it is ignored */
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx); /*!< in: trx handle */
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr); /*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx); /*!< in: transaction handle */
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx); /*!< in: transaction handle */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_savept_t* savept);/*!< in: pointer to savepoint undo number, if
+ partial rollback requested, or NULL for
+ complete rollback */
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a newly
+inserted row holds a lock where the lock information is carried by the trx
+id stored in the row, these locks are naturally released in the rollback.
+Savepoints which were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos);/*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a transaction
+commit or rollback.
+@return always DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t binlog_cache_pos); /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name); /*!< in: savepoint name */
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep); /*!< in: savepoint to free */
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep; if savep == NULL, then
+frees all savepoints of the transaction. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep); /*!< in: free all savepoints > this one;
+ if this is NULL, free all savepoints
+ of trx */
+
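+/* A minimal illustrative sketch of how the savepoint functions above are
+meant to be combined for SAVEPOINT / ROLLBACK TO SAVEPOINT / RELEASE
+SAVEPOINT; the binlog cache position of 0 and the missing error handling are
+simplifications, not the actual MySQL handler code. */
+
+UNIV_INLINE
+void
+savepoint_usage_sketch(
+	trx_t*	trx)	/*!< in/out: transaction handle */
+{
+	ib_int64_t	binlog_pos;
+	ulint		err;
+
+	/* SAVEPOINT sp1 */
+	err = trx_savepoint_for_mysql(trx, "sp1", 0);
+	ut_a(err == DB_SUCCESS);
+
+	/* ... updates performed inside the transaction ... */
+
+	/* ROLLBACK TO SAVEPOINT sp1: undoes the updates, but keeps the
+	explicit locks taken since the savepoint */
+	err = trx_rollback_to_savepoint_for_mysql(trx, "sp1", &binlog_pos);
+	ut_a(err == DB_SUCCESS);
+
+	/* RELEASE SAVEPOINT sp1: only forgets the savepoint */
+	err = trx_release_savepoint_for_mysql(trx, "sp1");
+	ut_a(err == DB_SUCCESS);
+}
+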
+/** A cell of trx_undo_arr_struct; used during a rollback and a purge */
+struct trx_undo_inf_struct{
+ trx_id_t trx_no; /*!< transaction number: not defined during
+ a rollback */
+ undo_no_t undo_no;/*!< undo number of an undo record */
+ ibool in_use; /*!< TRUE if the cell is in use */
+};
+
+/** During a rollback and a purge, undo numbers of undo records currently being
+processed are stored in this array */
+
+struct trx_undo_arr_struct{
+ ulint n_cells; /*!< number of cells in the array */
+ ulint n_used; /*!< number of cells currently in use */
+ trx_undo_inf_t* infos; /*!< the array of undo infos */
+ mem_heap_t* heap; /*!< memory heap from which allocated */
+};
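+
+/* Illustrative sketch of how a cell of the array is claimed while an undo
+number is being processed: find a free cell, store the undo number and mark
+the cell used.  This is a simplified outline of what trx_undo_rec_reserve()
+has to do, not the actual implementation. */
+
+UNIV_INLINE
+ibool
+undo_arr_claim_cell_sketch(
+	trx_undo_arr_t*	arr,	/*!< in/out: undo number array */
+	undo_no_t	undo_no)/*!< in: undo number to store */
+{
+	ulint	i;
+
+	for (i = 0; i < arr->n_cells; i++) {
+		trx_undo_inf_t*	cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (!cell->in_use) {
+			cell->undo_no = undo_no;
+			cell->in_use = TRUE;
+			arr->n_used++;
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);	/* all cells busy */
+}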
+
+/** Rollback node states */
+enum roll_node_state {
+ ROLL_NODE_SEND = 1, /*!< about to send a rollback signal to
+ the transaction */
+ ROLL_NODE_WAIT /*!< rollback signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Rollback command node in a query graph */
+struct roll_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_ROLLBACK */
+ enum roll_node_state state; /*!< node execution state */
+ ibool partial;/*!< TRUE if we want a partial
+ rollback */
+ trx_savept_t savept; /*!< savepoint to which to
+ roll back, in the case of a
+ partial rollback */
+};
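+
+/* Outline (illustrative, not the actual trx_rollback_step() body) of how the
+two states above are used when the node is executed: in ROLL_NODE_SEND the
+node sends a total or partial rollback signal to the transaction, using
+node->savept when node->partial == TRUE, and switches to ROLL_NODE_WAIT;
+the step function is run again once the rollback has completed, at which
+point the node lets the query graph continue. */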
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command */
+struct trx_named_savept_struct{
+ char* name; /*!< savepoint name */
+ trx_savept_t savept; /*!< the undo number corresponding to
+ the savepoint */
+ ib_int64_t mysql_binlog_cache_pos;
+ /*!< the MySQL binlog cache position
+ corresponding to this savepoint, not
+ defined if the MySQL binlogging is not
+ enabled */
+ UT_LIST_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< the list of savepoints of a
+ transaction */
+};
+
+#ifndef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0roll.ic b/storage/xtradb/include/trx0roll.ic
new file mode 100644
index 00000000000..3460832b18c
--- /dev/null
+++ b/storage/xtradb/include/trx0roll.ic
@@ -0,0 +1,40 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.ic
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ ulint n) /*!< in: position */
+{
+ ut_ad(arr);
+ ut_ad(n < arr->n_cells);
+
+ return(arr->infos + n);
+}
diff --git a/storage/xtradb/include/trx0rseg.h b/storage/xtradb/include/trx0rseg.h
new file mode 100644
index 00000000000..303188f09f2
--- /dev/null
+++ b/storage/xtradb/include/trx0rseg.h
@@ -0,0 +1,223 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rseg_h
+#define trx0rseg_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "trx0sys.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ ulint page_no,/*!< in: page number of the undo log segment */
+ mtr_t* mtr); /*!< in: mtr */
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return rollback segment */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ ulint id); /*!< in: rollback segment id */
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /*!< in: max size in pages */
+ ulint* slot_no, /*!< out: rseg id == slot number in trx sys */
+ mtr_t* mtr); /*!< in: mtr */
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg list and array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ mtr_t* mtr); /*!< in: mtr */
+/****************************************************************//**
+Creates a new rollback segment in the database.
+@return the created segment object, NULL if fail */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint max_size, /*!< in: max size in pages */
+ ulint* id, /*!< out: rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************************
+Frees an instance of the rollback segment in memory. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+ trx_rseg_t* rseg); /* in, own: instance to free */
+
+
+/* The real maximum is usually 4076 (with the default 16kB page size), but 4 slots are reserved as a safety margin. */
+#define TRX_RSEG_N_EXTRA_SLOTS (((UNIV_PAGE_SIZE - (FIL_PAGE_DATA + FIL_PAGE_DATA_END + TRX_RSEG_UNDO_SLOTS)) / TRX_RSEG_SLOT_SIZE) - 4)
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS (srv_extra_undoslots ? TRX_RSEG_N_EXTRA_SLOTS : (UNIV_PAGE_SIZE / 16))
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS (TRX_RSEG_N_SLOTS / 2)
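+
+/* Worked example for the default 16kB page size, assuming the standard
+header sizes (FIL_PAGE_DATA = 38, FIL_PAGE_DATA_END = 8,
+TRX_RSEG_UNDO_SLOTS = 8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE = 34):
+
+	TRX_RSEG_N_EXTRA_SLOTS	= (16384 - (38 + 8 + 34)) / 4 - 4
+				= 16304 / 4 - 4 = 4076 - 4 = 4072
+
+whereas the default TRX_RSEG_N_SLOTS is 16384 / 16 = 1024; the resulting
+TRX_RSEG_MAX_N_TRXS is therefore 2036 with srv_extra_undoslots enabled and
+512 without. */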
+
+/* The rollback segment memory object */
+struct trx_rseg_struct{
+ /*--------------------------------------------------------*/
+ ulint id; /*!< rollback segment id == the index of
+ its slot in the trx system file copy */
+ mutex_t mutex; /*!< mutex protecting the fields in this
+ struct except id; NOTE that the latching
+ order must always be kernel mutex ->
+ rseg mutex */
+ ulint space; /*!< space where the rollback segment
+ header is placed */
+ ulint zip_size;/* compressed page size of space
+ in bytes, or 0 for uncompressed spaces */
+ ulint page_no;/* page number of the rollback segment
+ header */
+ ulint max_size;/* maximum allowed size in pages */
+ ulint curr_size;/* current size in pages */
+ /*--------------------------------------------------------*/
+ /* Fields for update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list;
+ /* List of update undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached;
+ /* List of update undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ /* Fields for insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list;
+ /* List of insert undo logs */
+ UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached;
+ /* List of insert undo log segments
+ cached for fast reuse */
+ /*--------------------------------------------------------*/
+ ulint last_page_no; /*!< Page number of the last not yet
+ purged log header in the history list;
+ FIL_NULL if all list purged */
+ ulint last_offset; /*!< Byte offset of the last not yet
+ purged log header */
+ trx_id_t last_trx_no; /*!< Transaction number of the last not
+ yet purged log */
+ ibool last_del_marks; /*!< TRUE if the last not yet purged log
+ needs purging */
+ /*--------------------------------------------------------*/
+ UT_LIST_NODE_T(trx_rseg_t) rseg_list;
+ /* the list of the rollback segment
+ memory objects */
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
+ an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE 4
+
+/* The offset of the rollback segment header on its page */
+#define TRX_RSEG FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+#define TRX_RSEG_MAX_SIZE 0 /* Maximum allowed size for rollback
+ segment in pages */
+#define TRX_RSEG_HISTORY_SIZE 4 /* Number of file pages occupied
+ by the logs in the history list */
+#define TRX_RSEG_HISTORY 8 /* The update undo logs for committed
+ transactions */
+#define TRX_RSEG_FSEG_HEADER (8 + FLST_BASE_NODE_SIZE)
+ /* Header for the file segment where
+ this page is placed */
+#define TRX_RSEG_UNDO_SLOTS (8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+ /* Undo log segment slots */
+/*-------------------------------------------------------------*/
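+
+/* Worked example of this layout, again assuming the standard constants
+(FSEG_PAGE_DATA = FIL_PAGE_DATA = 38, FLST_BASE_NODE_SIZE = 16,
+FSEG_HEADER_SIZE = 10): the header starts at byte 38 of the page, so
+TRX_RSEG_MAX_SIZE sits at byte 38, TRX_RSEG_HISTORY_SIZE at 42,
+TRX_RSEG_HISTORY at 46, TRX_RSEG_FSEG_HEADER at 62, and the first undo log
+slot (TRX_RSEG_UNDO_SLOTS) at byte 72; slot n is then found at
+byte 72 + n * TRX_RSEG_SLOT_SIZE. */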
+
+#ifndef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0rseg.ic b/storage/xtradb/include/trx0rseg.ic
new file mode 100644
index 00000000000..daffa92fc7d
--- /dev/null
+++ b/storage/xtradb/include/trx0rseg.ic
@@ -0,0 +1,145 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.ic
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+#include "mtr0log.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_rsegf_t* header;
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER);
+
+ header = TRX_RSEG + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_rsegf_t* header;
+
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ header = TRX_RSEG + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to get slot %lu of rseg\n",
+ (ulong) n);
+ ut_error;
+ }
+
+ return(mtr_read_ulint(rsegf + TRX_RSEG_UNDO_SLOTS
+ + n * TRX_RSEG_SLOT_SIZE, MLOG_4BYTES, mtr));
+}
+
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ ulint n, /*!< in: index of slot */
+ ulint page_no,/*!< in: page number of the undo log segment */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to set slot %lu of rseg\n",
+ (ulong) n);
+ ut_error;
+ }
+
+ mlog_write_ulint(rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE,
+ page_no, MLOG_4BYTES, mtr);
+}
+
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+ trx_rsegf_t* rsegf, /*!< in: rollback segment header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+ ulint page_no;
+
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+
+ page_no = trx_rsegf_get_nth_undo(rsegf, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
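+
+/* Illustrative sketch combining the two helpers above: allocate a free undo
+slot in a rollback segment header and point it at a freshly created undo log
+segment header page.  undo_page_no stands in for the page number returned by
+the undo segment creation code; error handling is omitted. */
+
+UNIV_INLINE
+ulint
+rseg_assign_undo_slot_sketch(
+	trx_rsegf_t*	rsegf,		/*!< in: rollback segment header */
+	ulint		undo_page_no,	/*!< in: page number of the new
+					undo log segment header */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint	slot_no = trx_rsegf_undo_find_free(rsegf, mtr);
+
+	if (slot_no != ULINT_UNDEFINED) {
+
+		trx_rsegf_set_nth_undo(rsegf, slot_no, undo_page_no, mtr);
+	}
+
+	return(slot_no);	/* ULINT_UNDEFINED if the rseg is full */
+}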
diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h
new file mode 100644
index 00000000000..9ef9485b611
--- /dev/null
+++ b/storage/xtradb/include/trx0sys.h
@@ -0,0 +1,664 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0sys_h
+#define trx0sys_h
+
+#include "univ.i"
+
+#include "trx0types.h"
+#include "fsp0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+extern char trx_sys_mysql_master_log_name[];
+/** Master binlog file position. We have successfully got the updates
+up to this position. -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+extern ib_int64_t trx_sys_mysql_master_log_pos;
+/* @} */
+
+extern char trx_sys_mysql_relay_log_name[];
+extern ib_int64_t trx_sys_mysql_relay_log_pos;
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+extern char trx_sys_mysql_bin_log_name[];
+/** Binlog file position, or -1 if unknown */
+extern ib_int64_t trx_sys_mysql_bin_log_pos;
+/* @} */
+
+/** The transaction system */
+extern trx_sys_t* trx_sys;
+
+/** Doublewrite system */
+extern trx_doublewrite_t* trx_doublewrite;
+/** The following is set to TRUE when we are upgrading from pre-4.1
+format data files to the multiple tablespaces format data files */
+extern ibool trx_doublewrite_must_reset_space_ids;
+/** Set to TRUE when the doublewrite buffer is being created */
+extern ibool trx_doublewrite_buf_is_being_created;
+/** The following is TRUE when we are using the database in the
+post-4.1 format, i.e., we have successfully upgraded, or have created
+a new database installation */
+extern ibool trx_sys_multiple_tablespace_format;
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void);
+/*================================*/
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages); /*!< in: TRUE=restore pages */
+/****************************************************************//**
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void);
+/*===============================================*/
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+ ulint page_no); /*!< in: page number */
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ ulint space, /*!< in: space */
+ ulint page_no);/*!< in: page number */
+/***************************************************************//**
+Checks if a space is one of the system tablespaces.
+@return TRUE if system tablespace */
+UNIV_INLINE
+ibool
+trx_sys_sys_space(
+/*==============*/
+ ulint space); /*!< in: space */
+/***************************************************************//**
+Checks if a space is the doublewrite tablespace.
+@return TRUE if doublewrite tablespace */
+UNIV_INLINE
+ibool
+trx_sys_doublewrite_space(
+/*======================*/
+ ulint space); /*!< in: space */
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void);
+/*==========================*/
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void);
+/*================*/
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+ ulint space);
+/*********************************************************************
+Creates extra rollback segments when a new database is created (create_new_db). */
+UNIV_INTERN
+void
+trx_sys_create_extra_rseg(
+/*======================*/
+ ulint num); /* in: number of extra user rollback segments */
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ mtr_t* mtr); /*!< in: mtr */
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n); /*!< in: index of slot */
+/***************************************************************//**
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n, /*!< in: index of slot */
+ trx_rseg_t* rseg); /*!< in: pointer to rseg object, NULL if slot
+ not in use */
+/**********************************************************************//**
+Gets a pointer to the transaction system file copy and x-locks its page.
+@return pointer to system file copy, page x-locked */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+file copy.
+@return page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint space, /*!< in: space id */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint page_no, /*!< in: page number, FIL_NULL if
+ the slot is reset to unused */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Allocates a new transaction id.
+@return new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void);
+/*========================*/
+/*****************************************************************//**
+Allocates a new transaction number.
+@return new, allocated trx number */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_no(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Writes a trx id to an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /*!< in: pointer to memory where written */
+ trx_id_t id); /*!< in: id */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+ const byte* ptr); /*!< in: pointer to memory from where to read */
+/****************************************************************//**
+Looks for the trx handle with the given id in trx_list.
+@return the trx handle or NULL if not found */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ trx_id_t trx_id);/*!< in: trx id to search for */
+/****************************************************************//**
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.)
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_list_get_min_trx_id(void);
+/*=========================*/
+/****************************************************************//**
+Checks if a transaction with the given id is active.
+@return TRUE if active */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ trx_id_t trx_id);/*!< in: trx id of the transaction */
+/****************************************************************//**
+Checks that trx is in the trx list.
+@return TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ trx_t* in_trx);/*!< in: trx */
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave this also updates the latest master binlog position up to
+which replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ trx_sysf_t* sys_header,
+ const char* file_name_in,/*!< in: MySQL log file name */
+ ib_int64_t offset, /*!< in: position in that log file */
+ ulint field, /*!< in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr); /*!< in: mtr */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void);
+/*===================================*/
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void);
+/*====================================*/
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void);
+/*==========================*/
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void);
+/*===========================*/
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void);
+/*==============================*/
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void);
+/*===============*/
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id); /*!< in: id of the file format */
+/*****************************************************************//**
+Sets the file format id unconditionally unless it is already the
+same value.
+@return TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name); /*!< out: max file format name or
+ NULL if not needed. */
+/*****************************************************************//**
+Get the name representation of the current maximum file format.
+@return pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void);
+/*=============================*/
+/*****************************************************************//**
+Check for the max file format tag stored on disk.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+ ulint max_format_id); /*!< in: the max format id to check */
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+ const char** name, /*!< out: max file format name */
+ ulint format_id); /*!< in: file format identifier */
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page); /*!< in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+ const char *pathname, /*!< in: pathname of the first system
+ table space file */
+ ulint *format_id); /*!< out: file format of the system table
+ space */
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+ const char *pathname, /*!< in: pathname of a per-table
+ datafile */
+ ulint *format_id); /*!< out: file format of the per-table
+ data file */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id); /*!< in: id of the file format */
+
+#endif /* !UNIV_HOTBACKUP */
+/* The automatically created system rollback segment has this id */
+#define TRX_SYS_SYSTEM_RSEG_ID 0
+
+/* Space id and page no where the trx system file copy resides */
+#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+#define TRX_DOUBLEWRITE_SPACE 1 /* the doublewrite buffer tablespace if used */
+#define TRX_SYS_SPACE_MAX 9 /* reserved max space id for system tablespaces */
+#include "fsp0fsp.h"
+#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
+
+/* The offset of the transaction system header on the page */
+#define TRX_SYS FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+#define TRX_SYS_TRX_ID_STORE 0 /*!< the maximum trx id or trx
+ number modulo
+ TRX_SYS_TRX_ID_WRITE_MARGIN
+ written to a file page by any
+ transaction; the assignment of
+ transaction ids continues from
+ this number rounded up by
+ TRX_SYS_TRX_ID_WRITE_MARGIN
+ plus
+ TRX_SYS_TRX_ID_WRITE_MARGIN
+ when the database is
+ started */
+#define TRX_SYS_FSEG_HEADER 8 /*!< segment header for the
+ tablespace segment the trx
+ system is created into */
+#define TRX_SYS_RSEGS (8 + FSEG_HEADER_SIZE)
+ /*!< the start of the array of
+ rollback segment specification
+ slots */
+/*------------------------------------------------------------- @} */
+
+/** Maximum number of rollback segments: the number of segment
+specification slots in the transaction system array; rollback segment
+id must fit in one byte, therefore 256; each slot is currently 8 bytes
+in size */
+#define TRX_SYS_N_RSEGS 256
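+
+/* Worked example: with FSEG_PAGE_DATA = 38 and FSEG_HEADER_SIZE = 10 the
+transaction system header starts at byte 38 of the page, so
+TRX_SYS_TRX_ID_STORE is at byte 38, TRX_SYS_FSEG_HEADER at byte 46 and the
+slot array (TRX_SYS_RSEGS) at byte 56; with 256 slots of 8 bytes each the
+array occupies bytes 56..2103 of the page. */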
+
+/** Maximum length of MySQL binlog file name, in bytes.
+@see trx_sys_mysql_master_log_name
+@see trx_sys_mysql_bin_log_name */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
+#define TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN 480 /* must stay below the (500 - 12) byte limit */
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
+
+//#if UNIV_PAGE_SIZE < 4096
+//# error "UNIV_PAGE_SIZE < 4096"
+//#endif
+/** The offset of the MySQL replication info in the trx system header;
+this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
+#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000)
+#define TRX_SYS_MYSQL_RELAY_LOG_INFO (UNIV_PAGE_SIZE - 1500)
+
+/** The offset of the MySQL binlog offset info in the trx system header */
+#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000)
+#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /*!< magic number which is
+ TRX_SYS_MYSQL_LOG_MAGIC_N
+ if we have valid data in the
+ MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /*!< high 4 bytes of the offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /*!< low 4 bytes of the offset
+ within that file */
+#define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */
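+
+/* Illustrative sketch (not the server code) of how the fields above combine
+into a 64-bit binlog position: check the magic number, then join the high
+and low 4-byte halves.  sys_header is assumed to point at the TRX_SYS offset
+of the trx system header page, as returned by trx_sysf_get(). */
+
+UNIV_INLINE
+ib_int64_t
+read_binlog_offset_sketch(
+	const byte*	sys_header)	/*!< in: trx system header */
+{
+	const byte*	info = sys_header + TRX_SYS_MYSQL_LOG_INFO;
+
+	if (mach_read_from_4(info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		return(-1);	/* no valid binlog info stored */
+	}
+
+	return(((ib_int64_t) mach_read_from_4(
+			info + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) << 32)
+	       + (ib_int64_t) mach_read_from_4(
+			info + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+}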
+
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG 0 /*!< fseg header of the fseg
+ containing the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC FSEG_HEADER_SIZE
+ /*!< 4-byte magic number which
+ shows if we already have
+ created the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1 (4 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the first
+ sequence of 64
+ (= FSP_EXTENT_SIZE) consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2 (8 + FSEG_HEADER_SIZE)
+ /*!< page number of the
+ first page in the second
+ sequence of 64 consecutive
+ pages in the doublewrite
+ buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT 12 /*!< we repeat
+ TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
+ TRX_SYS_DOUBLEWRITE_BLOCK2
+ so that if the trx sys
+ header is half-written
+ to disk, we still may
+ be able to recover the
+ information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+#define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386
+
+/** Size of the doublewrite block in pages */
+#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE
+/* @} */
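+
+/* Illustrative sketch of the membership test implied by the block layout
+above: a page belongs to the doublewrite buffer iff it lies inside one of the
+two blocks of TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages.  The real
+trx_doublewrite_page_inside() works from the block page numbers cached in the
+trx_doublewrite struct below and also handles the case where the buffer does
+not exist yet. */
+
+UNIV_INLINE
+ibool
+doublewrite_page_inside_sketch(
+	ulint	page_no,	/*!< in: page number to test */
+	ulint	block1,		/*!< in: first page of the first block */
+	ulint	block2)		/*!< in: first page of the second block */
+{
+	if (page_no >= block1
+	    && page_no < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+
+		return(TRUE);
+	}
+
+	if (page_no >= block2
+	    && page_no < block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}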
+
+#ifndef UNIV_HOTBACKUP
+/** File format tag */
+/* @{ */
+/** The offset of the file format tag on the trx system header page
+(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */
+#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
+
+/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid. The file format
+identifier is added to this constant. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
+/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
+/* @} */
+
+/** Doublewrite control struct */
+struct trx_doublewrite_struct{
+ mutex_t mutex; /*!< mutex protecting the first_free field and
+ write_buf */
+ ulint block1; /*!< the page number of the first
+ doublewrite block (64 pages) */
+ ulint block2; /*!< page number of the second block */
+ ulint first_free; /*!< first free position in write_buf measured
+ in units of UNIV_PAGE_SIZE */
+ byte* write_buf; /*!< write buffer used in writing to the
+ doublewrite buffer, aligned to an
+ address divisible by UNIV_PAGE_SIZE
+ (which is required by Windows aio) */
+ byte* write_buf_unaligned;
+ /*!< pointer to write_buf, but unaligned */
+ buf_page_t**
+ buf_block_arr; /*!< array to store pointers to the buffer
+ blocks which have been cached to write_buf */
+};
+
+/** The transaction system central memory data structure; protected by the
+kernel mutex */
+struct trx_sys_struct{
+ trx_id_t max_trx_id; /*!< The smallest number not yet
+ assigned as a transaction id or
+ transaction number */
+ UT_LIST_BASE_NODE_T(trx_t) trx_list;
+ /*!< List of active and committed in
+ memory transactions, sorted on trx id,
+ biggest first */
+ UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
+ /*!< List of transactions created
+ for MySQL */
+ UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
+ /*!< List of rollback segment
+ objects */
+ trx_rseg_t* latest_rseg; /*!< Latest rollback segment in the
+ round-robin assignment of rollback
+ segments to transactions */
+ trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
+ /*!< Pointer array to rollback
+ segments; NULL if slot not in use */
+ ulint rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
+ list (update undo logs for committed
+ transactions), protected by
+ rseg->mutex */
+ UT_LIST_BASE_NODE_T(read_view_t) view_list;
+ /*!< List of read views sorted
+ on trx no, biggest first */
+};
+
+/** When a trx id which is zero modulo this number (which must be a power of
+two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system
+page is updated */
+#define TRX_SYS_TRX_ID_WRITE_MARGIN 256
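+
+/* Reading the TRX_SYS_TRX_ID_STORE comment above with concrete numbers:
+only every 256th assigned id forces an update of the field, so after a crash
+the stored value may lag the real maximum by up to 256.  If the page stores
+5000, id assignment resumes from 5000 rounded up to a multiple of 256 (5120)
+plus one more margin, i.e. 5376, which is guaranteed to be larger than any
+id handed out before the crash. */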
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic
new file mode 100644
index 00000000000..c7b09d4aec2
--- /dev/null
+++ b/storage/xtradb/include/trx0sys.ic
@@ -0,0 +1,421 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.ic
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "mtr0log.h"
+
+/* The typedef for rseg slot in the file copy */
+typedef byte trx_sysf_rseg_t;
+
+/* Rollback segment specification slot offsets */
+/*-------------------------------------------------------------*/
+#define TRX_SYS_RSEG_SPACE 0 /* space where the segment
+ header is placed; starting with
+ MySQL/InnoDB 5.1.7, this is
+ UNIV_UNDEFINED if the slot is unused */
+#define TRX_SYS_RSEG_PAGE_NO 4 /* page number where the segment
+ header is placed; this is FIL_NULL
+ if the slot is unused */
+/*-------------------------------------------------------------*/
+/* Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE 8
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void);
+/*==========================*/
+
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ if ((space == TRX_SYS_SPACE) && (page_no == TRX_SYS_PAGE_NO)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***************************************************************//**
+Checks if a space is one of the system tablespaces.
+@return TRUE if system tablespace */
+UNIV_INLINE
+ibool
+trx_sys_sys_space(
+/*==============*/
+ ulint space) /*!< in: space */
+{
+ if (srv_doublewrite_file) {
+ /* several spaces are reserved */
+ return((ibool)(space <= TRX_SYS_SPACE_MAX));
+ } else {
+ return((ibool)(space == TRX_SYS_SPACE));
+ }
+}
+
+/***************************************************************//**
+Checks if a space is the doublewrite tablespace.
+@return TRUE if doublewrite tablespace */
+UNIV_INLINE
+ibool
+trx_sys_doublewrite_space(
+/*======================*/
+ ulint space) /*!< in: space */
+{
+ if (srv_doublewrite_file) {
+ /* doublewrite buffer is separated */
+ return((ibool)(space == TRX_DOUBLEWRITE_SPACE));
+ } else {
+ return((ibool)(space == TRX_SYS_SPACE));
+ }
+}
+
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n) /*!< in: index of slot */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ return(sys->rseg_array[n]);
+}
+
+/***************************************************************//**
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+ trx_sys_t* sys, /*!< in: trx system */
+ ulint n, /*!< in: index of slot */
+ trx_rseg_t* rseg) /*!< in: pointer to rseg object, NULL if slot
+ not in use */
+{
+ ut_ad(n < TRX_SYS_N_RSEGS);
+
+ sys->rseg_array[n] = rseg;
+}
+
+/**********************************************************************//**
+Gets a pointer to the transaction system header and x-latches its page.
+@return pointer to system header, page x-latched. */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block;
+ trx_sysf_t* header;
+
+ ut_ad(mtr);
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ header = TRX_SYS + buf_block_get_frame(block);
+
+ return(header);
+}
+
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+header.
+@return page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ ulint i, /*!< in: slot index == rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(sys_header);
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ return(mtr_read_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys file copy */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint space, /*!< in: space id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE,
+ space,
+ MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+header. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+ trx_sysf_t* sys_header, /*!< in: trx sys header */
+ ulint i, /*!< in: slot index == rseg id */
+ ulint page_no, /*!< in: page number, FIL_NULL if the
+ slot is reset to unused */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ut_ad(mutex_own(&(kernel_mutex)));
+ ut_ad(sys_header);
+ ut_ad(i < TRX_SYS_N_RSEGS);
+
+ mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+ + i * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_PAGE_NO,
+ page_no,
+ MLOG_4BYTES, mtr);
+}
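+
+/* Usage sketch for the slot accessors above: a caller that scans the
+rollback segment slots for an unused one (page number == FIL_NULL) might
+look roughly like this, assuming it holds the kernel mutex:
+
+	mtr_t		mtr;
+	trx_sysf_t*	sys_header;
+	ulint		i;
+
+	mtr_start(&mtr);
+	sys_header = trx_sysf_get(&mtr);
+
+	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+		if (trx_sysf_rseg_get_page_no(sys_header, i, &mtr)
+		    == FIL_NULL) {
+			break;
+		}
+	}
+
+	mtr_commit(&mtr);
+*/
+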
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a trx id to an index page. In case the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+ byte* ptr, /*!< in: pointer to memory where written */
+ trx_id_t id) /*!< in: id */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ mach_write_to_6(ptr, id);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page. In case the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+ const byte* ptr) /*!< in: pointer to memory from where to read */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ return(mach_read_from_6(ptr));
+}
+
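+/* Round-trip sketch through the 6-byte on-page format checked by the
+#if above (id stands for any trx_id_t value):
+
+	byte	buf[DATA_TRX_ID_LEN];
+
+	trx_write_trx_id(buf, id);
+	ut_a(0 == ut_dulint_cmp(trx_read_trx_id(buf), id));
+*/
+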
+/****************************************************************//**
+Looks for the trx handle with the given id in trx_list.
+@return the trx handle or NULL if not found */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+ trx_id_t trx_id) /*!< in: trx id to search for */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx != NULL) {
+ if (0 == ut_dulint_cmp(trx_id, trx->id)) {
+
+ return(trx);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ return(NULL);
+}
+
+/****************************************************************//**
+Returns the minimum trx id in the trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.)
+@return the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_list_get_min_trx_id(void)
+/*=========================*/
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_LAST(trx_sys->trx_list);
+
+ if (trx == NULL) {
+
+ return(trx_sys->max_trx_id);
+ }
+
+ return(trx->id);
+}
+
+/****************************************************************//**
+Checks if a transaction with the given id is active.
+@return TRUE if active */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+ trx_id_t trx_id) /*!< in: trx id of the transaction */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ if (ut_dulint_cmp(trx_id, trx_list_get_min_trx_id()) < 0) {
+
+ return(FALSE);
+ }
+
+ if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) {
+
+ /* There must be corruption: we return TRUE because this
+ function is only called by lock_clust_rec_some_has_impl()
+ and row_vers_impl_x_locked_off_kernel() and they have
+ diagnostic prints in this case */
+
+ return(TRUE);
+ }
+
+ trx = trx_get_on_id(trx_id);
+ if (trx && (trx->conc_state == TRX_ACTIVE
+ || trx->conc_state == TRX_PREPARED)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
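+/* The id-based lookups above require the caller to hold the kernel
+mutex; a minimal sketch:
+
+	ibool	active;
+
+	mutex_enter(&kernel_mutex);
+	active = trx_is_active(trx_id);
+	mutex_exit(&kernel_mutex);
+*/
+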
+/*****************************************************************//**
+Allocates a new transaction id.
+@return new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void)
+/*========================*/
+{
+ trx_id_t id;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+	/* VERY important: after the database is started, the max_trx_id
+	value is divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, so the following
+	if-clause will evaluate to TRUE when this function is called for the
+	first time, and the value for the trx id will be written to the
+	disk-based header! Thus trx id values will not overlap when the
+	database is repeatedly started! */
+
+ if (ut_dulint_get_low(trx_sys->max_trx_id)
+ % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) {
+
+ trx_sys_flush_max_trx_id();
+ }
+
+ id = trx_sys->max_trx_id;
+
+ UT_DULINT_INC(trx_sys->max_trx_id);
+
+ return(id);
+}
+
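+/* Sketch: ids returned by trx_sys_get_new_trx_id() are strictly
+increasing, and the caller must hold the kernel mutex:
+
+	trx_id_t	a;
+	trx_id_t	b;
+
+	mutex_enter(&kernel_mutex);
+	a = trx_sys_get_new_trx_id();
+	b = trx_sys_get_new_trx_id();
+	ut_a(ut_dulint_cmp(a, b) < 0);
+	mutex_exit(&kernel_mutex);
+*/
+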
+/*****************************************************************//**
+Allocates a new transaction number.
+@return new, allocated trx number */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_no(void)
+/*========================*/
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ return(trx_sys_get_new_trx_id());
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h
new file mode 100644
index 00000000000..4c0ce392bcd
--- /dev/null
+++ b/storage/xtradb/include/trx0trx.h
@@ -0,0 +1,849 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "dict0types.h"
+#ifndef UNIV_HOTBACKUP
+#include "lock0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "read0types.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+
+/** Dummy session used currently in MySQL interface */
+extern sess_t* trx_dummy_sess;
+
+/** Number of transactions currently allocated for MySQL: protected by
+the kernel mutex */
+extern ulint trx_n_mysql_transactions;
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INTERN
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+ trx_t* trx); /*!< in: transaction */
+/******************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg); /*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file); /*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx); /*!< in: trx object */
+/****************************************************************//**
+Creates and initializes a transaction object.
+@return own: the transaction */
+UNIV_INTERN
+trx_t*
+trx_create(
+/*=======*/
+ sess_t* sess) /*!< in: session */
+ __attribute__((nonnull));
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void);
+/*========================*/
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void);
+/*=============================*/
+/********************************************************************//**
+Frees a transaction object. */
+UNIV_INTERN
+void
+trx_free(
+/*=====*/
+ trx_t* trx); /*!< in, own: trx object */
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx); /*!< in, own: trx object */
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx); /*!< in, own: trx object */
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void);
+/*============================*/
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE if success, FALSE if the rollback segment could not
+support this many transactions */
+UNIV_INTERN
+ibool
+trx_start(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE */
+UNIV_INTERN
+ibool
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+ trx_t* trx); /*!< in: transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. Assumes we have reserved
+the kernel mutex! */
+UNIV_INLINE
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx); /*!< in: transaction */
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx); /*!< in: transaction */
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx); /*!< in: trx handle */
+/**********************************************************************//**
+Does the transaction prepare for MySQL.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/**********************************************************************//**
+This function is used to find the number of prepared transactions and
+their transaction objects for recovery.
+@return number of prepared transactions */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ ulint len); /*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state.
+@return trx or NULL */
+UNIV_INTERN
+trx_t *
+trx_get_trx_by_xid(
+/*===============*/
+ XID* xid); /*!< in: X/Open XA transaction identification */
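+
+/* Recovery sketch (illustrative only; xid_list and len are supplied by
+the caller): the prepared transactions are first collected and each one
+is then resolved by its XID:
+
+	int	n;
+	int	i;
+
+	n = trx_recover_for_mysql(xid_list, len);
+
+	for (i = 0; i < n; i++) {
+		trx_t*	trx = trx_get_trx_by_xid(&xid_list[i]);
+
+		... commit or roll back trx ...
+	}
+*/
+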
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx); /*!< in: trx handle */
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx); /*!< in: trx handle */
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a newly started transaction.
+@return consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ trx_t* trx); /*!< in: active transaction */
+/***********************************************************//**
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. */
+UNIV_INTERN
+void
+trx_end_lock_wait(
+/*==============*/
+ trx_t* trx); /*!< in: transaction */
+/****************************************************************//**
+Sends a signal to a trx object. */
+UNIV_INTERN
+void
+trx_sig_send(
+/*=========*/
+ trx_t* trx, /*!< in: trx handle */
+ ulint type, /*!< in: signal type */
+ ulint sender, /*!< in: TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver_thr, /*!< in: query thread which wants the
+ reply, or NULL; if type is
+ TRX_SIG_END_WAIT, this must be NULL */
+ trx_savept_t* savept, /*!< in: possible rollback savepoint, or
+ NULL */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+/****************************************************************//**
+Send the reply message when a signal in the queue of the trx has
+been handled. */
+UNIV_INTERN
+void
+trx_sig_reply(
+/*==========*/
+ trx_sig_t* sig, /*!< in: signal */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/****************************************************************//**
+Removes the signal object from a trx signal queue. */
+UNIV_INTERN
+void
+trx_sig_remove(
+/*===========*/
+ trx_t* trx, /*!< in: trx handle */
+ trx_sig_t* sig); /*!< in, own: signal */
+/****************************************************************//**
+Starts handling of a trx signal. */
+UNIV_INTERN
+void
+trx_sig_start_handle(
+/*=================*/
+ trx_t* trx, /*!< in: trx handle */
+ que_thr_t** next_thr); /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+/****************************************************************//**
+Ends signal handling. If the session is in the error state, and
+trx->graph_before_signal_handling != NULL, returns control to the error
+handling routine of the graph (currently only returns control to the
+graph root which then sends an error message to the client). */
+UNIV_INTERN
+void
+trx_end_signal_handling(
+/*====================*/
+ trx_t* trx); /*!< in: trx */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+commit_node_create(
+/*===============*/
+ mem_heap_t* heap); /*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr); /*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction to the given file. The caller must own the
+kernel mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ ulint max_query_len); /*!< in: max query length to print, or 0 to
+ use the default max length */
+
+/** Type of data dictionary operation */
+typedef enum trx_dict_op {
+ /** The transaction is not modifying the data dictionary. */
+ TRX_DICT_OP_NONE = 0,
+ /** The transaction is creating a table or an index, or
+ dropping a table. The table must be dropped in crash
+ recovery. This and TRX_DICT_OP_NONE are the only possible
+ operation modes in crash recovery. */
+ TRX_DICT_OP_TABLE = 1,
+ /** The transaction is creating or dropping an index in an
+ existing table. In crash recovery, the data dictionary
+ must be locked, but the table must not be dropped. */
+ TRX_DICT_OP_INDEX = 2
+} trx_dict_op_t;
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+ __attribute__((pure));
+/**********************************************************************//**
+Flag a transaction as a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op op); /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return TRUE if interrupted */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+ trx_t* trx); /*!< in: transaction */
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return TRUE if strict */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+ trx_t* trx); /*!< in: transaction */
+#else /* !UNIV_HOTBACKUP */
+#define trx_is_interrupted(trx) FALSE
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t transaction
+@return transaction weight */
+#define TRX_WEIGHT(t) \
+ ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks))
+
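+/* Sketch of how the weight is typically consulted when choosing a
+deadlock victim (trx_a and trx_b stand for the two transactions
+involved; the lighter one is preferred as the victim, and
+trx_weight_cmp() below additionally treats transactions that modified
+non-transactional tables as heavier):
+
+	trx_t*	victim;
+
+	if (trx_weight_cmp(trx_a, trx_b) < 0) {
+		victim = trx_a;
+	} else {
+		victim = trx_b;
+	}
+*/
+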
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return <0, 0 or >0; similar to strcmp(3) */
+UNIV_INTERN
+int
+trx_weight_cmp(
+/*===========*/
+ const trx_t* a, /*!< in: the first transaction to be compared */
+ const trx_t* b); /*!< in: the second transaction to be compared */
+
+/*******************************************************************//**
+Retrieves the transaction's id, represented as an unsigned long long.
+@return transaction's id */
+UNIV_INLINE
+ullint
+trx_get_id(
+/*=======*/
+ const trx_t* trx); /*!< in: transaction */
+
+/* Maximum length of a string that can be returned by
+trx_get_que_state_str(). */
+#define TRX_QUE_STATE_STR_MAX_LEN 12 /* "ROLLING BACK" */
+
+/*******************************************************************//**
+Retrieves the transaction's que state as a human-readable string. The string
+should not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx); /*!< in: transaction */
+
+/* Signal to a transaction */
+struct trx_sig_struct{
+ unsigned type:3; /*!< signal type */
+ unsigned sender:1; /*!< TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver; /*!< non-NULL if the sender of the signal
+ wants reply after the operation induced
+ by the signal is completed */
+ trx_savept_t savept; /*!< possible rollback savepoint */
+ UT_LIST_NODE_T(trx_sig_t)
+ signals; /*!< queue of pending signals to the
+ transaction */
+ UT_LIST_NODE_T(trx_sig_t)
+ reply_signals; /*!< list of signals for which the sender
+			transaction is waiting for a reply */
+};
+
+#define TRX_MAGIC_N 91118598
+
+/* The transaction handle; every session has a trx object which is freed only
+when the session is freed; in addition there may be session-less transactions
+rolling back after a database recovery */
+
+struct trx_struct{
+ ulint magic_n;
+
+ /* These fields are not protected by any mutex. */
+ const char* op_info; /*!< English text describing the
+ current operation, or an empty
+ string */
+ ulint conc_state; /*!< state of the trx from the point
+ of view of concurrency control:
+ TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
+ ... */
+ ulint isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
+ ulint check_foreigns; /* normally TRUE, but if the user
+					wants to suppress foreign key checks
+					(in table imports, for example), we
+					set this to FALSE */
+ ulint check_unique_secondary;
+ /* normally TRUE, but if the user
+ wants to speed up inserts by
+ suppressing unique key checks
+ for secondary indexes when we decide
+ if we can use the insert buffer for
+					them, we set this to FALSE */
+ ulint support_xa; /*!< normally we do the XA two-phase
+ commit steps, but by setting this to
+ FALSE, one can save CPU time and about
+ 150 bytes in the undo log size as then
+ we skip XA steps */
+ ulint flush_log_at_trx_commit_session;
+ ulint flush_log_later;/* In 2PC, we hold the
+ prepare_commit mutex across
+ both phases. In that case, we
+ defer flush of the logs to disk
+ until after we release the
+ mutex. */
+ ulint must_flush_log_later;/* this flag is set to TRUE in
+ trx_commit_off_kernel() if
+ flush_log_later was TRUE, and there
+ were modifications by the transaction;
+ in that case we must flush the log
+ in trx_commit_complete_for_mysql() */
+ ulint duplicates; /*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+ ulint active_trans; /*!< 1 - if a transaction in MySQL
+ is active. 2 - if prepare_commit_mutex
+ was taken */
+ ulint has_search_latch;
+ /* TRUE if this trx has latched the
+ search system latch in S-mode */
+	ulint		deadlock_mark;	/*!< a mark field used in the
+					deadlock checking algorithm */
+ trx_dict_op_t dict_operation; /**< @see enum trx_dict_op */
+
+ /* Fields protected by the srv_conc_mutex. */
+ ulint declared_to_be_inside_innodb;
+ /* this is TRUE if we have declared
+ this transaction in
+ srv_conc_enter_innodb to be inside the
+ InnoDB engine */
+
+	/* Fields protected by dict_operation_lock, which is the very
+	latch these fields are used to track. */
+ ulint dict_operation_lock_mode;
+ /*!< 0, RW_S_LATCH, or RW_X_LATCH:
+ the latch mode trx currently holds
+ on dict_operation_lock */
+
+ /* All the next fields are protected by the kernel mutex, except the
+ undo logs which are protected by undo_mutex */
+ ulint is_purge; /*!< 0=user transaction, 1=purge */
+ ulint is_recovered; /*!< 0=normal transaction,
+ 1=recovered, must be rolled back */
+ ulint que_state; /*!< valid when conc_state
+ == TRX_ACTIVE: TRX_QUE_RUNNING,
+ TRX_QUE_LOCK_WAIT, ... */
+ ulint handling_signals;/* this is TRUE as long as the trx
+ is handling signals */
+ time_t start_time; /*!< time the trx object was created
+ or the state last time became
+ TRX_ACTIVE */
+ trx_id_t id; /*!< transaction id */
+ XID xid; /*!< X/Open XA transaction
+ identification to identify a
+ transaction branch */
+ trx_id_t no; /*!< transaction serialization number ==
+ max trx id when the transaction is
+ moved to COMMITTED_IN_MEMORY state */
+ ib_uint64_t commit_lsn; /*!< lsn at the time of the commit */
+ trx_id_t table_id; /*!< Table to drop iff dict_operation
+ is TRUE, or ut_dulint_zero. */
+ /*------------------------------*/
+ void* mysql_thd; /*!< MySQL thread handle corresponding
+ to this trx, or NULL */
+ const char* mysql_log_file_name;
+ /* if MySQL binlog is used, this field
+ contains a pointer to the latest file
+ name; this is NULL if binlog is not
+ used */
+ ib_int64_t mysql_log_offset;/* if MySQL binlog is used, this field
+ contains the end offset of the binlog
+ entry */
+ const char* mysql_master_log_file_name;
+ /* if the database server is a MySQL
+ replication slave, we have here the
+ master binlog name up to which
+ replication has processed; otherwise
+ this is a pointer to a null
+ character */
+ ib_int64_t mysql_master_log_pos;
+ /* if the database server is a MySQL
+ replication slave, this is the
+ position in the log file up to which
+ replication has processed */
+ const char* mysql_relay_log_file_name;
+ ib_int64_t mysql_relay_log_pos;
+
+ os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated
+ with this transaction object */
+ ulint mysql_process_no;/* since in Linux, 'top' reports
+ process id's and not thread id's, we
+ store the process number too */
+ /*------------------------------*/
+ ulint n_mysql_tables_in_use; /* number of Innobase tables
+ used in the processing of the current
+ SQL statement in MySQL */
+ ulint mysql_n_tables_locked;
+ /* how many tables the current SQL
+ statement uses, except those
+ in consistent read */
+ ulint search_latch_timeout;
+ /* If we notice that someone is
+ waiting for our S-lock on the search
+ latch to be released, we wait in
+ row0sel.c for BTR_SEA_TIMEOUT new
+ searches until we try to keep
+ the search latch again over
+ calls from MySQL; this is intended
+ to reduce contention on the search
+ latch */
+ /*------------------------------*/
+ ulint n_tickets_to_enter_innodb;
+ /* this can be > 0 only when
+ declared_to_... is TRUE; when we come
+ to srv_conc_innodb_enter, if the value
+ here is > 0, we decrement this by 1 */
+ /*------------------------------*/
+ UT_LIST_NODE_T(trx_t)
+ trx_list; /*!< list of transactions */
+ UT_LIST_NODE_T(trx_t)
+ mysql_trx_list; /*!< list of transactions created for
+ MySQL */
+ /*------------------------------*/
+ ulint error_state; /*!< 0 if no error, otherwise error
+					number; NOTE that ONLY the thread
+ doing the transaction is allowed to
+ set this field: this is NOT protected
+ by the kernel mutex */
+ const dict_index_t*error_info; /*!< if the error number indicates a
+ duplicate key error, a pointer to
+ the problematic index is stored here */
+	ulint		error_key_num;	/*!< if index creation fails due to a
+					duplicate key error, the MySQL key
+					number of that index is stored here */
+ sess_t* sess; /*!< session of the trx, NULL if none */
+ que_t* graph; /*!< query currently run in the session,
+ or NULL if none; NOTE that the query
+ belongs to the session, and it can
+ survive over a transaction commit, if
+ it is a stored procedure with a COMMIT
+ WORK statement, for instance */
+ ulint n_active_thrs; /*!< number of active query threads */
+ que_t* graph_before_signal_handling;
+ /* value of graph when signal handling
+ for this trx started: this is used to
+ return control to the original query
+ graph for error processing */
+ trx_sig_t sig; /*!< one signal object can be allocated
+ in this space, avoiding mem_alloc */
+ UT_LIST_BASE_NODE_T(trx_sig_t)
+ signals; /*!< queue of processed or pending
+ signals to the trx */
+ UT_LIST_BASE_NODE_T(trx_sig_t)
+ reply_signals; /*!< list of signals sent by the query
+ threads of this trx for which a thread
+ is waiting for a reply; if this trx is
+ killed, the reply requests in the list
+ must be canceled */
+ /*------------------------------*/
+ lock_t* wait_lock; /*!< if trx execution state is
+ TRX_QUE_LOCK_WAIT, this points to
+ the lock request, otherwise this is
+ NULL */
+ ibool was_chosen_as_deadlock_victim;
+ /* when the transaction decides to wait
+ for a lock, it sets this to FALSE;
+ if another transaction chooses this
+ transaction as a victim in deadlock
+ resolution, it sets this to TRUE */
+ time_t wait_started; /*!< lock wait started at this time */
+ UT_LIST_BASE_NODE_T(que_thr_t)
+ wait_thrs; /*!< query threads belonging to this
+ trx that are in the QUE_THR_LOCK_WAIT
+ state */
+ /*------------------------------*/
+ mem_heap_t* lock_heap; /*!< memory heap for the locks of the
+ transaction */
+ UT_LIST_BASE_NODE_T(lock_t)
+ trx_locks; /*!< locks reserved by the transaction */
+ /*------------------------------*/
+ mem_heap_t* global_read_view_heap;
+ /* memory heap for the global read
+ view */
+ read_view_t* global_read_view;
+					/* consistent read view associated
+					with the transaction, or NULL */
+	read_view_t*	read_view;	/*!< consistent read view used in the
+					transaction, or NULL; if defined, this
+					read view can be the normal read view
+					associated with the transaction (i.e.
+					the same as global_read_view) or a
+					read view associated with a cursor */
+ /*------------------------------*/
+ UT_LIST_BASE_NODE_T(trx_named_savept_t)
+ trx_savepoints; /*!< savepoints set with SAVEPOINT ...,
+ oldest first */
+ /*------------------------------*/
+ mutex_t undo_mutex; /*!< mutex protecting the fields in this
+ section (down to undo_no_arr), EXCEPT
+ last_sql_stat_start, which can be
+ accessed only when we know that there
+ cannot be any activity in the undo
+ logs! */
+ undo_no_t undo_no; /*!< next undo log record number to
+ assign; since the undo log is
+ private for a transaction, this
+ is a simple ascending sequence
+ with no gaps; thus it represents
+ the number of modified/inserted
+ rows in a transaction */
+ trx_savept_t last_sql_stat_start;
+ /* undo_no when the last sql statement
+ was started: in case of an error, trx
+ is rolled back down to this undo
+ number; see note at undo_mutex! */
+ trx_rseg_t* rseg; /*!< rollback segment assigned to the
+ transaction, or NULL if not assigned
+ yet */
+ trx_undo_t* insert_undo; /*!< pointer to the insert undo log, or
+ NULL if no inserts performed yet */
+ trx_undo_t* update_undo; /*!< pointer to the update undo log, or
+ NULL if no update performed yet */
+ undo_no_t roll_limit; /*!< least undo number to undo during
+ a rollback */
+ ulint pages_undone; /*!< number of undo log pages undone
+ since the last undo log truncation */
+ trx_undo_arr_t* undo_no_arr; /*!< array of undo numbers of undo log
+ records which are currently processed
+ by a rollback operation */
+ /*------------------------------*/
+ ulint n_autoinc_rows; /*!< no. of AUTO-INC rows required for
+ an SQL statement. This is useful for
+ multi-row INSERTs */
+ ib_vector_t* autoinc_locks; /* AUTOINC locks held by this
+ transaction. Note that these are
+ also in the lock list trx_locks. This
+ vector needs to be freed explicitly
+					when the trx_t instance is destroyed */
+ /*------------------------------*/
+ char detailed_error[256]; /*!< detailed error message for last
+ error, or empty. */
+ /*------------------------------*/
+ ulint io_reads;
+ ib_uint64_t io_read;
+ ulint io_reads_wait_timer;
+ ib_uint64_t lock_que_wait_ustarted;
+ ulint lock_que_wait_timer;
+ ulint innodb_que_wait_timer;
+ ulint distinct_page_access;
+#define DPAH_SIZE 8192
+ byte* distinct_page_access_hash;
+ ibool take_stats;
+};
+
+#define TRX_MAX_N_THREADS 32 /* maximum number of
+ concurrent threads running a
+ single operation of a
+ transaction, e.g., a parallel
+ query */
+/* Transaction concurrency states (trx->conc_state) */
+#define TRX_NOT_STARTED 0
+#define TRX_ACTIVE 1
+#define TRX_COMMITTED_IN_MEMORY 2
+#define TRX_PREPARED 3 /* Support for 2PC/XA */
+
+/* Transaction execution states when trx->conc_state == TRX_ACTIVE */
+#define TRX_QUE_RUNNING 0 /* transaction is running */
+#define TRX_QUE_LOCK_WAIT 1 /* transaction is waiting for a lock */
+#define TRX_QUE_ROLLING_BACK 2 /* transaction is rolling back */
+#define TRX_QUE_COMMITTING 3 /* transaction is committing */
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED 0 /* dirty read: non-locking
+ SELECTs are performed so that
+ we do not look at a possible
+ earlier version of a record;
+ thus they are not 'consistent'
+ reads under this isolation
+ level; otherwise like level
+ 2 */
+
+#define TRX_ISO_READ_COMMITTED 1 /* somewhat Oracle-like
+ isolation, except that in
+ range UPDATE and DELETE we
+ must block phantom rows
+ with next-key locks;
+ SELECT ... FOR UPDATE and ...
+ LOCK IN SHARE MODE only lock
+ the index records, NOT the
+ gaps before them, and thus
+ allow free inserting;
+ each consistent read reads its
+ own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ 2 /* this is the default;
+ all consistent reads in the
+ same trx read the same
+ snapshot;
+ full next-key locking used
+ in locking reads to block
+ insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE 3 /* all plain SELECTs are
+ converted to LOCK IN SHARE
+ MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE 1 /* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE 2 /* duplicate rows are to be replaced */
+
+
+/* Types of a trx signal */
+#define TRX_SIG_NO_SIGNAL 0
+#define TRX_SIG_TOTAL_ROLLBACK 1
+#define TRX_SIG_ROLLBACK_TO_SAVEPT 2
+#define TRX_SIG_COMMIT 3
+#define TRX_SIG_ERROR_OCCURRED 4
+#define TRX_SIG_BREAK_EXECUTION 5
+
+/* Sender types of a signal */
+#define TRX_SIG_SELF 0 /* sent by the session itself, or
+ by an error occurring within this
+ session */
+#define TRX_SIG_OTHER_SESS 1 /* sent by another session (which
+ must hold rights to this) */
+
+/** Commit node states */
+enum commit_node_state {
+ COMMIT_NODE_SEND = 1, /*!< about to send a commit signal to
+ the transaction */
+ COMMIT_NODE_WAIT /*!< commit signal sent to the transaction,
+ waiting for completion */
+};
+
+/** Commit command node in a query graph */
+struct commit_node_struct{
+ que_common_t common; /*!< node type: QUE_NODE_COMMIT */
+ enum commit_node_state
+ state; /*!< node execution state */
+};
+
+
+
+#ifndef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/trx0trx.ic b/storage/xtradb/include/trx0trx.ic
new file mode 100644
index 00000000000..7332eeece85
--- /dev/null
+++ b/storage/xtradb/include/trx0trx.ic
@@ -0,0 +1,164 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.ic
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start(trx, ULINT_UNDEFINED);
+ }
+}
+
+/*************************************************************//**
+Starts the transaction if it is not yet started. Assumes we have reserved
+the kernel mutex! */
+UNIV_INLINE
+void
+trx_start_if_not_started_low(
+/*=========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start_low(trx, ULINT_UNDEFINED);
+ }
+}
+
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+ const trx_t* trx) /*!< in: trx object */
+{
+ return(trx->error_info);
+}
+
+/*******************************************************************//**
+Retrieves the transaction's id, represented as an unsigned long long.
+@return transaction's id */
+UNIV_INLINE
+ullint
+trx_get_id(
+/*=======*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return((ullint)ut_conv_dulint_to_longlong(trx->id));
+}
+
+/*******************************************************************//**
+Retrieves the transaction's que state as a human-readable string. The string
+should not be free()'d or modified.
+@return string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ /* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
+ switch (trx->que_state) {
+ case TRX_QUE_RUNNING:
+ return("RUNNING");
+ case TRX_QUE_LOCK_WAIT:
+ return("LOCK WAIT");
+ case TRX_QUE_ROLLING_BACK:
+ return("ROLLING BACK");
+ case TRX_QUE_COMMITTING:
+ return("COMMITTING");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op
+trx_get_dict_operation(
+/*===================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ enum trx_dict_op op = (enum trx_dict_op) trx->dict_operation;
+
+#ifdef UNIV_DEBUG
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ return(op);
+ }
+ ut_error;
+#endif /* UNIV_DEBUG */
+ return((enum trx_dict_op) UNIV_EXPECT(op, TRX_DICT_OP_NONE));
+}
+/**********************************************************************//**
+Flag a transaction as a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+ trx_t* trx, /*!< in/out: transaction */
+ enum trx_dict_op op) /*!< in: operation, not
+ TRX_DICT_OP_NONE */
+{
+#ifdef UNIV_DEBUG
+ enum trx_dict_op old_op = trx_get_dict_operation(trx);
+
+ switch (op) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ break;
+ case TRX_DICT_OP_TABLE:
+ switch (old_op) {
+ case TRX_DICT_OP_NONE:
+ case TRX_DICT_OP_INDEX:
+ case TRX_DICT_OP_TABLE:
+ goto ok;
+ }
+ ut_error;
+ break;
+ case TRX_DICT_OP_INDEX:
+ ut_ad(old_op == TRX_DICT_OP_NONE);
+ break;
+ }
+ok:
+#endif /* UNIV_DEBUG */
+
+ trx->dict_operation = op;
+}
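+
+/* Sketch: a DDL code path that creates a table would flag the
+transaction before touching the data dictionary, so that crash recovery
+knows the table must be dropped if the transaction did not commit:
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+	ut_ad(trx_get_dict_operation(trx) == TRX_DICT_OP_TABLE);
+*/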
diff --git a/storage/xtradb/include/trx0types.h b/storage/xtradb/include/trx0types.h
new file mode 100644
index 00000000000..40a7256cbfd
--- /dev/null
+++ b/storage/xtradb/include/trx0types.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0types_h
+#define trx0types_h
+
+#include "ut0byte.h"
+
+/** prepare trx_t::id for being printed via printf(3) */
+#define TRX_ID_PREP_PRINTF(id) (ullint) ut_conv_dulint_to_longlong(id)
+
+/** printf(3) format used for printing TRX_ID_PRINTF_PREP() */
+#define TRX_ID_FMT "%llX"
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+#define TRX_ID_MAX_LEN 17
+
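+/* Sketch: printing a trx_t::id with the two macros above:
+
+	fprintf(stderr, "Transaction " TRX_ID_FMT "\n",
+		TRX_ID_PREP_PRINTF(trx->id));
+*/
+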
+/** Memory objects */
+/* @{ */
+/** Transaction */
+typedef struct trx_struct trx_t;
+/** Transaction system */
+typedef struct trx_sys_struct trx_sys_t;
+/** Doublewrite information */
+typedef struct trx_doublewrite_struct trx_doublewrite_t;
+/** Signal */
+typedef struct trx_sig_struct trx_sig_t;
+/** Rollback segment */
+typedef struct trx_rseg_struct trx_rseg_t;
+/** Transaction undo log */
+typedef struct trx_undo_struct trx_undo_t;
+/** Array of undo numbers of undo records being rolled back or purged */
+typedef struct trx_undo_arr_struct trx_undo_arr_t;
+/** A cell of trx_undo_arr_t */
+typedef struct trx_undo_inf_struct trx_undo_inf_t;
+/** The control structure used in the purge operation */
+typedef struct trx_purge_struct trx_purge_t;
+/** Rollback command node in a query graph */
+typedef struct roll_node_struct roll_node_t;
+/** Commit command node in a query graph */
+typedef struct commit_node_struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+typedef struct trx_named_savept_struct trx_named_savept_t;
+/* @} */
+
+/** Rollback contexts */
+enum trx_rb_ctx {
+ RB_NONE = 0, /*!< no rollback */
+ RB_NORMAL, /*!< normal rollback */
+ RB_RECOVERY_PURGE_REC,
+ /*!< rolling back an incomplete transaction,
+ in crash recovery, rolling back an
+ INSERT that was performed by updating a
+ delete-marked record; if the delete-marked record
+ no longer exists in an active read view, it will
+ be purged */
+ RB_RECOVERY /*!< rolling back an incomplete transaction,
+ in crash recovery */
+};
+
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef dulint trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef dulint roll_ptr_t;
+/** Undo number */
+typedef dulint undo_no_t;
+
+/** Transaction savepoint */
+typedef struct trx_savept_struct trx_savept_t;
+/** Transaction savepoint */
+struct trx_savept_struct{
+ undo_no_t least_undo_no; /*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Transaction system header */
+typedef byte trx_sysf_t;
+/** Rollback segment header */
+typedef byte trx_rsegf_t;
+/** Undo segment header */
+typedef byte trx_usegf_t;
+/** Undo log header */
+typedef byte trx_ulogf_t;
+/** Undo log page header */
+typedef byte trx_upagef_t;
+
+/** Undo log record */
+typedef byte trx_undo_rec_t;
+/* @} */
+
+#endif
diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h
new file mode 100644
index 00000000000..a084f2394b5
--- /dev/null
+++ b/storage/xtradb/include/trx0undo.h
@@ -0,0 +1,551 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "page0types.h"
+#include "trx0xa.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ ibool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ ulint page_no, /*!< in: page number */
+ ulint offset); /*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ ibool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ ulint* page_no, /*!< out: page number */
+ ulint* offset); /*!< out: offset of the undo
+ entry within page */
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr); /*!< in: roll pointer */
+#endif /* !UNIV_HOTBACKUP */
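+
+/* Round-trip sketch for the three helpers above, with illustrative
+component values:
+
+	roll_ptr_t	roll_ptr;
+	ibool		is_insert;
+	ulint		rseg_id;
+	ulint		page_no;
+	ulint		offset;
+
+	roll_ptr = trx_undo_build_roll_ptr(TRUE, 1, 5, 100);
+	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id,
+				 &page_no, &offset);
+	ut_a(is_insert && rseg_id == 1 && page_no == 5 && offset == 100);
+	ut_a(trx_undo_roll_ptr_is_insert(roll_ptr));
+*/
+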
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /*!< in: pointer to memory where
+ written */
+ roll_ptr_t roll_ptr); /*!< in: roll ptr */
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_...
+@return roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+ const byte* ptr); /*!< in: pointer to memory from where to read */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset); /*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header offset on page */
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr); /*!< in: mtr */
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr); /*!< in: mtr */
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return page number if success, else FIL_NULL */
+UNIV_INTERN
+ulint
+trx_undo_add_page(
+/*==============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory object */
+ mtr_t* mtr); /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end(
+/*==================*/
+ trx_t* trx, /*!< in: transaction whose undo log it is */
+ trx_undo_t* undo, /*!< in: undo log */
+ undo_no_t limit); /*!< in: all undo records with undo number
+ >= this value should be truncated */
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint space, /*!< in: space id of the log */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset on the page */
+ undo_no_t limit); /*!< in: all undo pages with
+ undo numbers < this value
+ should be truncated; NOTE that
+ the function only frees whole
+ pages; the header page is not
+ freed, but emptied, if all the
+ records there are < limit */
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy.
+This function is only called when the database is started or a new
+rollback segment created.
+@return the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+ trx_rseg_t* rseg); /*!< in: rollback segment memory object */
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if the undo log was assigned successfully; possible error codes
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ulint
+trx_undo_assign_undo(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint type); /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr); /*!< in: mtr */
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /*!< in: trx owning the update undo log */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr); /*!< in: mtr */
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx); /*!< in: transaction handle */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr); /*!< in: mtr or NULL */
+/************************************************************************
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+ trx_undo_t* undo); /* in: the undo object to be freed */
+
+/* Types of an undo log segment */
+#define TRX_UNDO_INSERT 1 /* contains undo entries for inserts */
+#define TRX_UNDO_UPDATE 2 /* contains undo entries for updates
+ and delete markings: in short,
+ modifies (the name 'UPDATE' is a
+ historical relic) */
+/* States of an undo log segment */
+#define TRX_UNDO_ACTIVE 1 /* contains an undo log of an active
+ transaction */
+#define TRX_UNDO_CACHED 2 /* cached for quick reuse */
+#define TRX_UNDO_TO_FREE 3 /* insert undo segment can be freed */
+#define TRX_UNDO_TO_PURGE 4 /* update undo segment will not be
+ reused: it can be freed in purge when
+ all undo data in it is removed */
+#define TRX_UNDO_PREPARED 5 /* contains an undo log of a
+ prepared transaction */
+
+#ifndef UNIV_HOTBACKUP
+/** Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_struct{
+ /*-----------------------------*/
+ ulint id; /*!< undo log slot number within the
+ rollback segment */
+ ulint type; /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint state; /*!< state of the corresponding undo log
+ segment */
+ ibool del_marks; /*!< relevant only in an update undo log:
+ this is TRUE if the transaction may
+ have delete marked records, because of
+ a delete of a row or an update of an
+ indexed field; purge is then
+ necessary; also TRUE if the transaction
+ has updated an externally stored
+ field */
+ trx_id_t trx_id; /*!< id of the trx assigned to the undo
+ log */
+ XID xid; /*!< X/Open XA transaction
+ identification */
+ ibool dict_operation; /*!< TRUE if a dict operation trx */
+ dulint table_id; /*!< if a dict operation, then the table
+ id */
+ trx_rseg_t* rseg; /*!< rseg where the undo log belongs */
+ /*-----------------------------*/
+ ulint space; /*!< space id where the undo log
+ is placed */
+ ulint zip_size; /*!< compressed page size of space
+ in bytes, or 0 for uncompressed */
+ ulint hdr_page_no; /*!< page number of the header page in
+ the undo log */
+ ulint hdr_offset; /*!< header offset of the undo log on the
+ page */
+ ulint last_page_no; /*!< page number of the last page in the
+ undo log; this may differ from
+ top_page_no during a rollback */
+ ulint size; /*!< current size in pages */
+ /*-----------------------------*/
+ ulint empty; /*!< TRUE if the stack of undo log
+ records is currently empty */
+ ulint top_page_no; /*!< page number where the latest undo
+ log record was catenated; during
+ rollback the page from which the latest
+ undo record was chosen */
+ ulint top_offset; /*!< offset of the latest undo record,
+ i.e., the topmost element in the undo
+ log if we think of it as a stack */
+ undo_no_t top_undo_no; /*!< undo number of the latest record */
+ buf_block_t* guess_block; /*!< guess for the buffer block where
+ the top page might reside */
+ /*-----------------------------*/
+ UT_LIST_NODE_T(trx_undo_t) undo_list;
+ /*!< undo log objects in the rollback
+ segment are chained into lists */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define TRX_UNDO_PAGE_HDR FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define TRX_UNDO_PAGE_TYPE 0 /*!< TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+#define TRX_UNDO_PAGE_START 2 /*!< Byte offset where the undo log
+ records for the LATEST transaction
+ start on this page (remember that
+ in an update undo log, the first page
+ can contain several undo logs) */
+#define TRX_UNDO_PAGE_FREE 4 /*!< On each page of the undo log this
+ field contains the byte offset of the
+ first free byte on the page */
+#define TRX_UNDO_PAGE_NODE 6 /*!< The file list node in the chain
+ of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE (6 + FLST_NODE_SIZE)
+ /*!< Size of the transaction undo
+ log page header, in bytes */
+/* @} */
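/* Editor's illustrative sketch (not part of the original patch): a hedged
example of how the page header offsets above are typically read, assuming
the mach_read_from_2() helper from mach0data.h. */
UNIV_INLINE
ibool
example_undo_page_is_insert(
/*========================*/
	const page_t*	undo_page)	/*!< in: undo log page */
{
	/* TRX_UNDO_PAGE_TYPE is a 2-byte field at the start of the
	undo log page header */
	return(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
				+ TRX_UNDO_PAGE_TYPE)
	       == TRX_UNDO_INSERT);
}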
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT (3 * UNIV_PAGE_SIZE / 4)
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define TRX_UNDO_SEG_HDR (TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_STATE 0 /*!< TRX_UNDO_ACTIVE, ... */
+#define TRX_UNDO_LAST_LOG 2 /*!< Offset of the last undo log header
+ on the segment header page, 0 if
+ none */
+#define TRX_UNDO_FSEG_HEADER 4 /*!< Header for the file segment which
+ the undo log segment occupies */
+#define TRX_UNDO_PAGE_LIST (4 + FSEG_HEADER_SIZE)
+ /*!< Base node for the list of pages in
+ the undo log segment; defined only on
+ the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE (4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_TRX_ID 0 /*!< Transaction id */
+#define TRX_UNDO_TRX_NO 8 /*!< Transaction number of the
+ transaction; defined only if the log
+ is in a history list */
+#define TRX_UNDO_DEL_MARKS 16 /*!< Defined only in an update undo
+ log: TRUE if the transaction may have
+ done delete markings of records, and
+ thus purge is necessary */
+#define TRX_UNDO_LOG_START 18 /*!< Offset of the first undo log record
+ of this log on the header page; purge
+ may remove undo log records from the
+ log start, and therefore this is not
+ necessarily the same as the end offset
+ of this log header */
+#define TRX_UNDO_XID_EXISTS 20 /*!< TRUE if undo log header includes
+ X/Open XA transaction identification
+ XID */
+#define TRX_UNDO_DICT_TRANS 21 /*!< TRUE if the transaction is a table
+ create, index create, or drop
+ transaction: in recovery
+ the transaction cannot be rolled back
+ in the usual way: a 'rollback' rather
+ means dropping the created or dropped
+ table, if it still exists */
+#define TRX_UNDO_TABLE_ID 22 /*!< Id of the table if the preceding
+ field is TRUE */
+#define TRX_UNDO_NEXT_LOG 30 /*!< Offset of the next undo log header
+ on this page, 0 if none */
+#define TRX_UNDO_PREV_LOG 32 /*!< Offset of the previous undo log
+ header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE 34 /*!< If the log is put to the history
+ list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/* Note: the writing of the undo log old header is coded by a log record
+MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the
+header is logged separately. In this sense, the XID is not really a member
+of the undo log header. TODO: do not append the XID to the log header if XA
+is not needed by the user. The XID wastes about 150 bytes of space in every
+undo log. In the history list we may have millions of undo logs, which means
+quite a large overhead. */
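/* Editor's illustrative sketch (not part of the original patch): counting
the undo log headers chained on the first page of an update undo segment,
assuming the offsets defined above and mach_read_from_2() from mach0data.h. */
UNIV_INLINE
ulint
example_count_undo_log_headers(
/*===========================*/
	const page_t*	undo_page)	/*!< in: undo log segment header page */
{
	ulint	n	= 0;
	ulint	offset	= mach_read_from_2(undo_page + TRX_UNDO_SEG_HDR
					   + TRX_UNDO_LAST_LOG);

	/* walk backwards from the last header via TRX_UNDO_PREV_LOG;
	a zero offset terminates the chain */
	while (offset != 0) {
		n++;
		offset = mach_read_from_2(undo_page + offset
					  + TRX_UNDO_PREV_LOG);
	}

	return(n);
}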
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define TRX_UNDO_XA_FORMAT (TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define TRX_UNDO_XA_TRID_LEN (TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define TRX_UNDO_XA_BQUAL_LEN (TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define TRX_UNDO_XA_XID (TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+ /*!< Total size of the undo log header
+ with the XA XID */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0undo.ic b/storage/xtradb/include/trx0undo.ic
new file mode 100644
index 00000000000..2d289b34ef1
--- /dev/null
+++ b/storage/xtradb/include/trx0undo.ic
@@ -0,0 +1,351 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.ic
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+ ibool is_insert, /*!< in: TRUE if insert undo log */
+ ulint rseg_id, /*!< in: rollback segment id */
+ ulint page_no, /*!< in: page number */
+ ulint offset) /*!< in: offset of the undo entry within page */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ ut_ad(rseg_id < 128);
+
+ return(ut_dulint_create(is_insert * 128 * 256 * 256
+ + rseg_id * 256 * 256
+ + (page_no / 256) / 256,
+ (page_no % (256 * 256)) * 256 * 256
+ + offset));
+}
+
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer */
+ ibool* is_insert, /*!< out: TRUE if insert undo log */
+ ulint* rseg_id, /*!< out: rollback segment id */
+ ulint* page_no, /*!< out: page number */
+ ulint* offset) /*!< out: offset of the undo
+ entry within page */
+{
+ ulint low;
+ ulint high;
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ high = ut_dulint_get_high(roll_ptr);
+ low = ut_dulint_get_low(roll_ptr);
+
+ *offset = low % (256 * 256);
+
+ *is_insert = high / (256 * 256 * 128); /* TRUE == 1 */
+ *rseg_id = (high / (256 * 256)) % 128;
+
+ *page_no = (high % (256 * 256)) * 256 * 256
+ + (low / 256) / 256;
+}
+
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+ roll_ptr_t roll_ptr) /*!< in: roll pointer */
+{
+ ulint high;
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ high = ut_dulint_get_high(roll_ptr);
+
+ return(high / (256 * 256 * 128));
+}
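/* Editor's illustrative sketch (not part of the original patch): a round
trip through the encode/decode pair above; the dulint packs is_insert,
rseg_id and the high bits of page_no into the high word, and the low bits
of page_no plus the byte offset into the low word. */
UNIV_INLINE
void
example_roll_ptr_round_trip(void)
/*=============================*/
{
	roll_ptr_t	roll_ptr;
	ibool		is_insert;
	ulint		rseg_id;
	ulint		page_no;
	ulint		offset;

	roll_ptr = trx_undo_build_roll_ptr(TRUE, 3, 65540, 120);

	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id,
				 &page_no, &offset);

	ut_ad(is_insert == TRUE);
	ut_ad(rseg_id == 3);
	ut_ad(page_no == 65540);
	ut_ad(offset == 120);
}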
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+ byte* ptr, /*!< in: pointer to memory where
+ written */
+ roll_ptr_t roll_ptr) /*!< in: roll ptr */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ mach_write_to_7(ptr, roll_ptr);
+}
+
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_...
+@return roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+ const byte* ptr) /*!< in: pointer to memory from where to read */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ return(mach_read_from_7(ptr));
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block = buf_page_get(space, zip_size, page_no,
+ RW_X_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+ ulint space, /*!< in: space where placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ buf_block_t* block = buf_page_get(space, zip_size, page_no,
+ RW_S_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ return(buf_block_get_frame(block));
+}
+
+/******************************************************************//**
+Returns the start offset of the undo log records of the specified undo
+log on the page.
+@return start offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_start(
+/*====================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ start = mach_read_from_2(offset + undo_page
+ + TRX_UNDO_LOG_START);
+ } else {
+ start = TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE;
+ }
+
+ return(start);
+}
+
+/******************************************************************//**
+Returns the end offset of the undo log records of the specified undo
+log on the page.
+@return end offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_end(
+/*==================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+
+ end = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (end == 0) {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+ } else {
+ end = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ }
+
+ return(end);
+}
+
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint start;
+
+ undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+
+ if (start + undo_page == rec) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(rec - 2));
+}
+
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+ trx_undo_rec_t* rec, /*!< in: undo log record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ page_t* undo_page;
+ ulint end;
+ ulint next;
+
+ undo_page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ next = mach_read_from_2(rec);
+
+ if (next == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + next);
+}
+
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + mach_read_from_2(undo_page + end - 2));
+}
+
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+ page_t* undo_page,/*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header offset on page */
+{
+ ulint start;
+ ulint end;
+
+ start = trx_undo_page_get_start(undo_page, page_no, offset);
+ end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+ if (start == end) {
+
+ return(NULL);
+ }
+
+ return(undo_page + start);
+}
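/* Editor's illustrative sketch (not part of the original patch): iterating
over the undo records of one undo log on a single page with the accessors
defined above; the first/next pair returns NULL once the end offset of the
log is reached. */
UNIV_INLINE
ulint
example_count_undo_recs_on_page(
/*============================*/
	page_t*	undo_page,/*!< in: undo log page */
	ulint	page_no,/*!< in: undo log header page number */
	ulint	offset)	/*!< in: undo log header offset on page */
{
	trx_undo_rec_t*	rec;
	ulint		n = 0;

	rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);

	while (rec != NULL) {
		n++;
		rec = trx_undo_page_get_next_rec(rec, page_no, offset);
	}

	return(n);
}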
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0xa.h b/storage/xtradb/include/trx0xa.h
new file mode 100644
index 00000000000..e0dd8a1af5b
--- /dev/null
+++ b/storage/xtradb/include/trx0xa.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef XA_H
+#define XA_H
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier */
+#define XIDDATASIZE 128 /*!< maximum size of a transaction
+ identifier, in bytes */
+#define MAXGTRIDSIZE 64 /*!< maximum size in bytes of gtrid */
+#define MAXBQUALSIZE 64 /*!< maximum size in bytes of bqual */
+
+/** X/Open XA distributed transaction identifier */
+struct xid_t {
+ long formatID; /*!< format identifier; -1
+ means that the XID is null */
+ long gtrid_length; /*!< value from 1 through 64 */
+ long bqual_length; /*!< value from 1 through 64 */
+ char data[XIDDATASIZE]; /*!< distributed transaction
+ identifier */
+};
+/** X/Open XA distributed transaction identifier */
+typedef struct xid_t XID;
+#endif
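/* Editor's illustrative sketch (not part of the original patch): resetting
an XID to the "null" state described above, i.e. formatID == -1 marks the
identifier as unused. Assumes <string.h> for memset(). */
static void
example_xid_reset(XID* xid)
{
	xid->formatID = -1;		/* null XID */
	xid->gtrid_length = 0;
	xid->bqual_length = 0;
	memset(xid->data, 0, XIDDATASIZE);
}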
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define XA_OK 0 /*!< normal execution */
+#define XAER_ASYNC -2 /*!< asynchronous operation already
+ outstanding */
+#define XAER_RMERR -3 /*!< a resource manager error
+ occurred in the transaction
+ branch */
+#define XAER_NOTA -4 /*!< the XID is not valid */
+#define XAER_INVAL -5 /*!< invalid arguments were given */
+#define XAER_PROTO -6 /*!< routine invoked in an improper
+ context */
+#define XAER_RMFAIL -7 /*!< resource manager unavailable */
+#define XAER_DUPID -8 /*!< the XID already exists */
+#define XAER_OUTSIDE -9 /*!< resource manager doing
+ work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
new file mode 100644
index 00000000000..8691e3cf337
--- /dev/null
+++ b/storage/xtradb/include/univ.i
@@ -0,0 +1,501 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#ifdef UNIV_HOTBACKUP
+#include "hb_univ.i"
+#endif /* UNIV_HOTBACKUP */
+
+#define INNODB_VERSION_MAJOR 1
+#define INNODB_VERSION_MINOR 0
+#define INNODB_VERSION_BUGFIX 12
+#define PERCONA_INNODB_VERSION 12.1
+
+/* The following is the InnoDB version as shown in
+SELECT plugin_version FROM information_schema.plugins;
+calculated in make_version_string() in sql/sql_show.cc like this:
+"version >> 8" . "version & 0xff"
+because the version is shown with only one dot, we skip the last
+component, i.e. we show M.N.P as M.N */
+#define INNODB_VERSION_SHORT \
+ (INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR)
+
+/* auxiliary macros to help creating the version as string */
+#define __INNODB_VERSION(a, b, c, d) (#a "." #b "." #c "-" #d)
+#define _INNODB_VERSION(a, b, c, d) __INNODB_VERSION(a, b, c, d)
+
+
+#define INNODB_VERSION_STR \
+ _INNODB_VERSION(INNODB_VERSION_MAJOR, \
+ INNODB_VERSION_MINOR, \
+ INNODB_VERSION_BUGFIX, \
+ PERCONA_INNODB_VERSION)
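/* Editor's note (illustrative, not part of the original patch): with the
values above, the two-level macro expands its arguments before stringizing
them, so INNODB_VERSION_STR becomes "1" "." "0" "." "12" "-" "12.1", i.e.
the string literal "1.0.12-12.1", and INNODB_VERSION_SHORT evaluates to
(1 << 8) | 0 == 256. */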
+
+#define REFMAN "http://dev.mysql.com/doc/refman/5.1/en/"
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+/* In the dynamic plugin, redefine some externally visible symbols
+in order not to conflict with the symbols of a builtin InnoDB. */
+
+/* Rename all C++ classes that contain virtual functions, because we
+have not figured out how to apply the visibility=hidden attribute to
+the virtual method table (vtable) in GCC 3. */
+# define ha_innobase ha_innodb
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+/* If any of the following macros is defined at this point, the code from
+the "right" plug.in was executed and we do not need to include
+ut0auxconf.h, which would either define the same macros or be empty */
+#if !defined(HAVE_IB_GCC_ATOMIC_BUILTINS) \
+ && !defined(HAVE_IB_ATOMIC_PTHREAD_T_GCC) \
+ && !defined(HAVE_IB_SOLARIS_ATOMICS) \
+ && !defined(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS) \
+ && !defined(SIZEOF_PTHREAD_T) \
+ && !defined(HAVE_IB_PAUSE_INSTRUCTION)
+# include "ut0auxconf.h"
+#endif
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__)
+# undef __WIN__
+# define __WIN__
+
+# include <windows.h>
+
+# ifdef _NT_
+# define __NT__
+# endif
+
+#else
+/* The defines used with MySQL */
+
+/* Include two header files from MySQL to make the Unix flavor used
+in compiling more Posix-compatible. These headers also define __WIN__
+if we are compiling on Windows. */
+
+#ifndef UNIV_HOTBACKUP
+# include <my_global.h>
+# include <my_pthread.h>
+#endif /* UNIV_HOTBACKUP */
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
+# include <sys/stat.h>
+# if !defined(__NETWARE__) && !defined(__WIN__)
+# include <sys/mman.h> /* mmap() for os0proc.c */
+# endif
+
+/* Include the header file generated by GNU autoconf */
+# ifndef __WIN__
+# ifndef UNIV_HOTBACKUP
+# include "config.h"
+# endif /* UNIV_HOTBACKUP */
+# endif
+
+# ifdef HAVE_SCHED_H
+# include <sched.h>
+# endif
+
+/* We only try to do explicit inlining of functions with gcc and
+Sun Studio */
+
+# if !defined(__GNUC__) && !(defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+# undef UNIV_MUST_NOT_INLINE /* Remove compiler warning */
+# define UNIV_MUST_NOT_INLINE
+# endif
+
+# ifdef HAVE_PREAD
+# define HAVE_PWRITE
+# endif
+
+#endif /* #if (defined(WIN32) || ... */
+
+/* DEBUG VERSION CONTROL
+ ===================== */
+
+/* The following flag will make InnoDB initialize
+all memory it allocates to zero. It hides Purify
+warnings about reading unallocated memory unless
+memory is read outside the allocated blocks. */
+/*
+#define UNIV_INIT_MEM_TO_ZERO
+*/
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions are not called from anywhere in
+the code, they can be called from gdb after
+innobase_start_or_create_for_mysql() has executed using the call
+command. Not tested on Windows. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+*/
+
+#if defined(HAVE_valgrind) && defined(HAVE_VALGRIND_MEMCHECK_H)
+# define UNIV_DEBUG_VALGRIND
+#endif
+#if 0
+#define UNIV_DEBUG_VALGRIND /* Enable extra
+ Valgrind instrumentation */
+#define UNIV_DEBUG_PRINT /* Enable the compilation of
+ some debug print functions */
+#define UNIV_AHI_DEBUG /* Enable adaptive hash index
+ debugging without UNIV_DEBUG */
+#define UNIV_BUF_DEBUG /* Enable buffer pool
+ debugging without UNIV_DEBUG */
+#define UNIV_DEBUG /* Enable ut_ad() assertions
+ and disable UNIV_INLINE */
+#define UNIV_DEBUG_LOCK_VALIDATE /* Enable
+ ut_ad(lock_rec_validate_page())
+ assertions. */
+#define UNIV_DEBUG_FILE_ACCESSES /* Debug .ibd file access
+ (field file_page_was_freed
+ in buf_page_t) */
+#define UNIV_LRU_DEBUG /* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG /* debug HASH_ macros */
+#define UNIV_LIST_DEBUG /* debug UT_LIST_ macros */
+#define UNIV_LOG_LSN_DEBUG /* write LSN to the redo log;
+this will break redo log file compatibility, but it may be useful when
+debugging redo log application problems. */
+#define UNIV_MEM_DEBUG /* detect memory leaks etc */
+#define UNIV_IBUF_DEBUG /* debug the insert buffer */
+#define UNIV_IBUF_COUNT_DEBUG /* debug the insert buffer;
+this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
+and the insert buffer must be empty when the database is started */
+#define UNIV_SYNC_DEBUG /* debug mutex and latch
+operations (very slow); also UNIV_DEBUG must be defined */
+#define UNIV_SEARCH_DEBUG /* debug B-tree comparisons */
+#define UNIV_SYNC_PERF_STAT /* operation counts for
+ rw-locks and mutexes */
+#define UNIV_SEARCH_PERF_STAT /* statistics for the
+ adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS /* enable diagnostic output
+ in sync0sync.c */
+#define UNIV_BTR_PRINT /* enable functions for
+ printing B-trees */
+#define UNIV_ZIP_DEBUG /* extensive consistency checks
+ for compressed pages */
+#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
+ more often */
+#endif
+
+#define UNIV_BTR_DEBUG /* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
+
+#ifdef HAVE_valgrind
+/* The following sets all newly allocated memory to zero before use:
+this can be used to eliminate unnecessary Purify warnings, but note that
+it also masks many bugs Purify could detect. For detailed Purify analysis it
+is best to remove the define below and look through the warnings one
+by one. */
+#define UNIV_SET_MEM_TO_ZERO
+#endif
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+ /* the UNIV_LOG_DEBUG option above prevents forcing the log to
+ disk at a buffer page write; it should be tested with this
+ option off; also some ibuf tests are suppressed */
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL */
+#if defined(__GNUC__) && (__GNUC__ >= 4) || defined(__INTEL_COMPILER)
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#else
+# define UNIV_INTERN
+#endif
+
+#if (!defined(UNIV_DEBUG) && !defined(UNIV_MUST_NOT_INLINE))
+/* Definition for inline version */
+
+#ifdef __WIN__
+# define UNIV_INLINE __inline
+#elif defined(__SUNPRO_CC) || defined(__SUNPRO_C)
+# define UNIV_INLINE static inline
+#else
+# define UNIV_INLINE static __inline__
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE UNIV_INTERN
+
+#endif /* UNIV_DEBUG */
+
+#ifdef _WIN32
+#define UNIV_WORD_SIZE 4
+#elif defined(_WIN64)
+#define UNIV_WORD_SIZE 8
+#else
+/* MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+#define UNIV_WORD_SIZE SIZEOF_LONG
+#endif
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT 8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT UNIV_WORD_SIZE
+
+/*
+ DATABASE VERSION CONTROL
+ ========================
+*/
+
+/* The 2-logarithm of UNIV_PAGE_SIZE: */
+/* #define UNIV_PAGE_SIZE_SHIFT 14 */
+#define UNIV_PAGE_SIZE_SHIFT_MAX 14
+#define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift
+/* The universal page size of the database */
+/* #define UNIV_PAGE_SIZE (1u << UNIV_PAGE_SIZE_SHIFT) */
+#define UNIV_PAGE_SIZE srv_page_size
+#define UNIV_PAGE_SIZE_MAX (1u << UNIV_PAGE_SIZE_SHIFT_MAX)
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM 32
+
+/* The maximum length of a table name. This is the MySQL limit and is
+defined in mysql_com.h as NAME_CHAR_LEN*SYSTEM_CHARSET_MBMAXLEN; the
+number does not include a terminating '\0'. InnoDB can probably handle
+longer names internally */
+#define MAX_TABLE_NAME_LEN 192
+
+/*
+ UNIVERSAL TYPE DEFINITIONS
+ ==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte unsigned char
+
+/* Define an unsigned integer type that is exactly 32 bits. */
+
+#if SIZEOF_INT == 4
+typedef unsigned int ib_uint32_t;
+#elif SIZEOF_LONG == 4
+typedef unsigned long ib_uint32_t;
+#else
+#error "Neither int or long is 4 bytes"
+#endif
+
+/* Another basic type we use is the unsigned long integer, which should be
+equal to the word size of the machine: 32 bits on a 32-bit platform and
+64 bits on a 64-bit platform. We also give the printf format for the type as a
+macro ULINTPF. */
+
+#ifdef _WIN64
+typedef unsigned __int64 ulint;
+#define ULINTPF "%I64u"
+typedef __int64 lint;
+#define MYSQL_SYSVAR_ULINT MYSQL_SYSVAR_ULONGLONG
+#else
+typedef unsigned long int ulint;
+#define ULINTPF "%lu"
+typedef long int lint;
+#define MYSQL_SYSVAR_ULINT MYSQL_SYSVAR_ULONG
+#endif
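/* Editor's illustrative sketch (not part of the original patch): ULINTPF is
the printf format matching ulint on the current platform, so a ulint can be
printed portably. Assumes <stdio.h>, which univ.i includes further below. */
UNIV_INLINE
void
example_print_ulint(ulint n)
{
	fprintf(stderr, "value: " ULINTPF "\n", n);
}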
+
+#ifdef __WIN__
+typedef __int64 ib_int64_t;
+typedef unsigned __int64 ib_uint64_t;
+#elif !defined(UNIV_HOTBACKUP)
+/* Note: longlong and ulonglong come from MySQL headers. */
+typedef longlong ib_int64_t;
+typedef ulonglong ib_uint64_t;
+#endif
+
+#ifndef UNIV_HOTBACKUP
+typedef unsigned long long int ullint;
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef __WIN__
+#if SIZEOF_LONG != SIZEOF_VOIDP
+#error "Error: InnoDB's ulint must be of the same size as void*"
+#endif
+#endif
+
+/* The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED ((ulint)(-1))
+
+/* The undefined 32-bit unsigned integer */
+#define ULINT32_UNDEFINED 0xFFFFFFFF
+
+/* Maximum value for a ulint */
+#define ULINT_MAX ((ulint)(-2))
+
+/* Maximum value for ib_uint64_t */
+#define IB_ULONGLONG_MAX ((ib_uint64_t) (~0ULL))
+
+/* This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool ulint
+
+#ifndef TRUE
+
+#define TRUE 1
+#define FALSE 0
+
+#endif
+
+/* The following number used as the length of a logical field means that the
+field has the SQL NULL as its value. NOTE that because we assume that the
+length of a field is a 32-bit integer when we store it, for example, to an
+undo log on disk, this number must also fit in 32 bits, even on 64-bit
+computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX)
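/* Editor's illustrative sketch (not part of the original patch): testing
whether a stored length value refers to an externally stored field, per the
convention described above. */
UNIV_INLINE
ibool
example_len_is_externally_stored(ulint len)
{
	return(len != UNIV_SQL_NULL && len > UNIV_EXTERN_STORAGE_FIELD);
}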
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) ptr, 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+/* Sun Studio includes sun_prefetch.h as of version 5.9 */
+#elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \
+ || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590)
+# include <sun_prefetch.h>
+#if __SUNPRO_C >= 0x550
+# undef UNIV_INTERN
+# define UNIV_INTERN __hidden
+#endif /* __SUNPRO_C >= 0x550 */
+/* Use sun_prefetch when compile with Sun Studio */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many(addr)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
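/* Editor's illustrative sketch (not part of the original patch): typical use
of the branch prediction and prefetch hints defined above; the hints only
influence code layout and caching, never the value of the condition. */
UNIV_INLINE
ulint
example_count_nonzero_bytes(const byte* buf, ulint len)
{
	ulint	i;
	ulint	n = 0;

	UNIV_PREFETCH_R(buf);

	for (i = 0; i < len; i++) {
		if (UNIV_UNLIKELY(buf[i] != 0)) {
			n++;
		}
	}

	return(n);
}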
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/* The return type from a thread's start function differs between Unix and
+Windows, so define a typedef for it and a macro to use at the end of such
+functions. */
+
+#ifdef __WIN__
+typedef ulint os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(0)
+#else
+typedef void* os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(NULL)
+#endif
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+#ifdef UNIV_DEBUG_VALGRIND
+# include <valgrind/memcheck.h>
+# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size)
+# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
+# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_DESC(addr, size, b) VALGRIND_CREATE_BLOCK(addr, size, b)
+# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
+# define UNIV_MEM_ASSERT_RW(addr, size) do { \
+ const void* _p = (const void*) (ulint) \
+ VALGRIND_CHECK_MEM_IS_DEFINED(addr, size); \
+ if (UNIV_LIKELY_NULL(_p)) \
+ fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n", \
+ __FILE__, __LINE__, \
+ (const void*) (addr), (unsigned) (size), (long) \
+ (((const char*) _p) - ((const char*) (addr)))); \
+ } while (0)
+# define UNIV_MEM_ASSERT_W(addr, size) do { \
+ const void* _p = (const void*) (ulint) \
+ VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size); \
+ if (UNIV_LIKELY_NULL(_p)) \
+ fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n", \
+ __FILE__, __LINE__, \
+ (const void*) (addr), (unsigned) (size), (long) \
+ (((const char*) _p) - ((const char*) (addr)))); \
+ } while (0)
+#else
+# define UNIV_MEM_VALID(addr, size) do {} while(0)
+# define UNIV_MEM_INVALID(addr, size) do {} while(0)
+# define UNIV_MEM_FREE(addr, size) do {} while(0)
+# define UNIV_MEM_ALLOC(addr, size) do {} while(0)
+# define UNIV_MEM_DESC(addr, size, b) do {} while(0)
+# define UNIV_MEM_UNDESC(b) do {} while(0)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
+#endif
+#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do { \
+ UNIV_MEM_ASSERT_W(addr, size); \
+ UNIV_MEM_FREE(addr, size); \
+} while (0)
+#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do { \
+ UNIV_MEM_ASSERT_W(addr, size); \
+ UNIV_MEM_ALLOC(addr, size); \
+} while (0)
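/* Editor's illustrative sketch (not part of the original patch): how the
Valgrind wrappers above are typically used around a manually managed buffer;
when UNIV_DEBUG_VALGRIND is not defined they all expand to empty statements.
Assumes <string.h> for memset(). */
UNIV_INLINE
void
example_mark_buffer_lifecycle(byte* buf, ulint size)
{
	UNIV_MEM_ALLOC(buf, size);	/* contents are now "undefined" */

	memset(buf, 0, size);

	UNIV_MEM_ASSERT_RW(buf, size);	/* fully defined and readable */

	UNIV_MEM_ASSERT_AND_FREE(buf, size);	/* mark inaccessible again */
}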
+
+extern ulint srv_page_size_shift;
+extern ulint srv_page_size;
+#endif
diff --git a/storage/xtradb/include/usr0sess.h b/storage/xtradb/include/usr0sess.h
new file mode 100644
index 00000000000..2c288f7d455
--- /dev/null
+++ b/storage/xtradb/include/usr0sess.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.h
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0sess_h
+#define usr0sess_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "trx0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "data0data.h"
+#include "rem0rec.h"
+
+/*********************************************************************//**
+Opens a session.
+@return own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void);
+/*============*/
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+ sess_t* sess); /* in, own: session object */
+
+/* The session handle. All fields are protected by the kernel mutex */
+struct sess_struct{
+ ulint state; /*!< state of the session */
+ trx_t* trx; /*!< transaction object permanently
+ assigned for the session: the
+ transaction instance designated by the
+ trx id changes, but the memory
+ structure is preserved */
+ UT_LIST_BASE_NODE_T(que_t)
+ graphs; /*!< query graphs belonging to this
+ session */
+};
+
+/* Session states */
+#define SESS_ACTIVE 1
+#define SESS_ERROR 2 /* session contains an error message
+ which has not yet been communicated
+ to the client */
+#ifndef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/usr0sess.ic b/storage/xtradb/include/usr0sess.ic
new file mode 100644
index 00000000000..35a75d75acc
--- /dev/null
+++ b/storage/xtradb/include/usr0sess.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.ic
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/usr0types.h b/storage/xtradb/include/usr0types.h
new file mode 100644
index 00000000000..6cc6f015613
--- /dev/null
+++ b/storage/xtradb/include/usr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0types.h
+Users and sessions global types
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0types_h
+#define usr0types_h
+
+typedef struct sess_struct sess_t;
+
+#endif
diff --git a/storage/xtradb/include/ut0auxconf.h b/storage/xtradb/include/ut0auxconf.h
new file mode 100644
index 00000000000..16bcc308392
--- /dev/null
+++ b/storage/xtradb/include/ut0auxconf.h
@@ -0,0 +1,14 @@
+/* Do not remove this file even though it is empty.
+This file is included in univ.i and will cause compilation failure
+if not present.
+Custom checks have been added in the generated
+storage/innobase/Makefile.in that is shipped with the InnoDB Plugin
+source archive. These checks eventually define some macros and put
+them in this file.
+This is a hack that was developed in order to deploy new compile-time
+checks without the need to regenerate the ./configure script that is
+distributed in the MySQL 5.1 official source archives.
+If by any chance Makefile.in and ./configure are regenerated and the
+hack from Makefile.in is thus wiped away, then the "real" checks from
+plug.in will take over.
+*/
diff --git a/storage/xtradb/include/ut0byte.h b/storage/xtradb/include/ut0byte.h
new file mode 100644
index 00000000000..f55e2888c60
--- /dev/null
+++ b/storage/xtradb/include/ut0byte.h
@@ -0,0 +1,270 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+
+#include "univ.i"
+
+/** Pair of ulint integers. */
+typedef struct dulint_struct dulint;
+/** Type definition for a 64-bit unsigned integer, which works also
+in 32-bit machines. NOTE! Access the fields only with the accessor
+functions. This definition appears here only for the compiler to
+know the size of a dulint. */
+struct dulint_struct{
+ ulint high; /*!< most significant 32 bits */
+ ulint low; /*!< least significant 32 bits */
+};
+
+/** Zero value for a dulint */
+extern const dulint ut_dulint_zero;
+
+/** Maximum value for a dulint */
+extern const dulint ut_dulint_max;
+
+/*******************************************************//**
+Creates a 64-bit dulint out of two ulints.
+@return created dulint */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low); /*!< in: low-order 32 bits */
+/*******************************************************//**
+Gets the high-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+ dulint d); /*!< in: dulint */
+/*******************************************************//**
+Gets the low-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+ dulint d); /*!< in: dulint */
+/*******************************************************//**
+Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit
+integer type.
+@return value in ib_int64_t type */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+ dulint d); /*!< in: dulint */
+/*******************************************************//**
+Tests if a dulint is zero.
+@return TRUE if zero */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+ dulint a); /*!< in: dulint */
+/*******************************************************//**
+Compares two dulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ dulint b); /*!< in: dulint */
+/*******************************************************//**
+Calculates the max of two dulints.
+@return max(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b); /*!< in: dulint */
+/*******************************************************//**
+Calculates the min of two dulints.
+@return min(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b); /*!< in: dulint */
+/*******************************************************//**
+Adds a ulint to a dulint.
+@return sum a + b */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ ulint b); /*!< in: ulint */
+/*******************************************************//**
+Subtracts a ulint from a dulint.
+@return a - b */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+ dulint a, /*!< in: dulint */
+ ulint b); /*!< in: ulint, b <= a */
+/*******************************************************//**
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G.
+@return a - b */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+ dulint a, /*!< in: dulint; NOTE a must be >= b and at most
+ 2 to power 32 - 1 greater */
+ dulint b); /*!< in: dulint */
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number which must be a
+ power of 2 */
+/********************************************************//**
+Rounds a dulint upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number which must be a
+ power of 2 */
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no); /*!< in: align by this number
+ which must be a power of 2 */
+/*******************************************************//**
+Increments a dulint variable by 1. */
+#define UT_DULINT_INC(D)\
+{\
+ if ((D).low == 0xFFFFFFFFUL) {\
+ (D).high = (D).high + 1;\
+ (D).low = 0;\
+ } else {\
+ (D).low = (D).low + 1;\
+ }\
+}
+/*******************************************************//**
+Tests if two dulints are equal. */
+#define UT_DULINT_EQ(D1, D2) (((D1).low == (D2).low)\
+ && ((D1).high == (D2).high))
+#ifdef notdefined
+/************************************************************//**
+Sort function for dulint arrays. */
+UNIV_INTERN
+void
+ut_dulint_sort(
+/*===========*/
+ dulint* arr, /*!< in/out: array to be sorted */
+ dulint* aux_arr,/*!< in/out: auxiliary array (same size as arr) */
+ ulint low, /*!< in: low bound of sort interval, inclusive */
+ ulint high); /*!< in: high bound of sort interval, noninclusive */
+#endif /* notdefined */
+
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no); /*!< in: align by this number */
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+ __attribute__((const));
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+ __attribute__((const));
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n); /*!< in: nth bit requested */
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val); /*!< in: value for the bit to set */
+
+#ifndef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0byte.ic b/storage/xtradb/include/ut0byte.ic
new file mode 100644
index 00000000000..3dd51890cb4
--- /dev/null
+++ b/storage/xtradb/include/ut0byte.ic
@@ -0,0 +1,411 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.ic
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Creates a 64-bit dulint out of two ulints.
+@return created dulint */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+ ulint high, /*!< in: high-order 32 bits */
+ ulint low) /*!< in: low-order 32 bits */
+{
+ dulint res;
+
+ ut_ad(high <= 0xFFFFFFFF);
+ ut_ad(low <= 0xFFFFFFFF);
+
+ res.high = high;
+ res.low = low;
+
+ return(res);
+}
+
+/*******************************************************//**
+Gets the high-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+ dulint d) /*!< in: dulint */
+{
+ return(d.high);
+}
+
+/*******************************************************//**
+Gets the low-order 32 bits of a dulint.
+@return 32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+ dulint d) /*!< in: dulint */
+{
+ return(d.low);
+}
+
+/*******************************************************//**
+Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit
+integer type.
+@return value in ib_int64_t type */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+ dulint d) /*!< in: dulint */
+{
+ return((ib_int64_t)d.low
+ + (((ib_int64_t)d.high) << 32));
+}
+
+/*******************************************************//**
+Tests if a dulint is zero.
+@return TRUE if zero */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+ dulint a) /*!< in: dulint */
+{
+ if ((a.low == 0) && (a.high == 0)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************//**
+Compares two dulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ dulint b) /*!< in: dulint */
+{
+ if (a.high > b.high) {
+ return(1);
+ } else if (a.high < b.high) {
+ return(-1);
+ } else if (a.low > b.low) {
+ return(1);
+ } else if (a.low < b.low) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/*******************************************************//**
+Calculates the max of two dulints.
+@return max(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b) /*!< in: dulint */
+{
+ if (ut_dulint_cmp(a, b) > 0) {
+
+ return(a);
+ }
+
+ return(b);
+}
+
+/*******************************************************//**
+Calculates the min of two dulints.
+@return min(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+ dulint a, /*!< in: dulint */
+ dulint b) /*!< in: dulint */
+{
+ if (ut_dulint_cmp(a, b) > 0) {
+
+ return(b);
+ }
+
+ return(a);
+}
+
+/*******************************************************//**
+Adds a ulint to a dulint.
+@return sum a + b */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+ dulint a, /*!< in: dulint */
+ ulint b) /*!< in: ulint */
+{
+ if (0xFFFFFFFFUL - b >= a.low) {
+ a.low += b;
+
+ return(a);
+ }
+
+ a.low = a.low - (0xFFFFFFFFUL - b) - 1;
+
+ a.high++;
+
+ return(a);
+}
+
+/*******************************************************//**
+Subtracts a ulint from a dulint.
+@return a - b */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+ dulint a, /*!< in: dulint */
+ ulint b) /*!< in: ulint, b <= a */
+{
+ if (a.low >= b) {
+ a.low -= b;
+
+ return(a);
+ }
+
+ b -= a.low + 1;
+
+ a.low = 0xFFFFFFFFUL - b;
+
+ ut_ad(a.high > 0);
+
+ a.high--;
+
+ return(a);
+}
+
+/*******************************************************//**
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G.
+@return a - b */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+ dulint a, /*!< in: dulint; NOTE a must be >= b and at most
+ 2 to power 32 - 1 greater */
+ dulint b) /*!< in: dulint */
+{
+ ulint diff;
+
+ if (a.high == b.high) {
+ ut_ad(a.low >= b.low);
+
+ return(a.low - b.low);
+ }
+
+ ut_ad(a.high == b.high + 1);
+
+ diff = (ulint)(0xFFFFFFFFUL - b.low);
+ diff += 1 + a.low;
+
+ ut_ad(diff > a.low);
+
+ return(diff);
+}
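The add, subtract and minus helpers above do 64-bit arithmetic on the two 32-bit halves by hand, carrying into (or borrowing from) the high word when the low word wraps. A minimal sketch of how they compose, assuming the declarations above are in scope (illustrative only, not part of the patch):

static void
ut_dulint_arith_example(void)
/*=========================*/
{
	dulint	a = ut_dulint_create(0, 0xFFFFFFFEUL);	/* high = 0, low = 2^32 - 2 */
	dulint	b = ut_dulint_add(a, 5);	/* low word wraps: high = 1, low = 3 */
	ulint	d = ut_dulint_minus(b, a);	/* difference must fit in 32 bits */

	ut_a(ut_dulint_get_high(b) == 1);
	ut_a(ut_dulint_get_low(b) == 3);
	ut_a(d == 5);
}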
+
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number which must be a
+ power of 2 */
+{
+ ulint low, high;
+
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+
+ low = ut_dulint_get_low(n);
+ high = ut_dulint_get_high(n);
+
+ low = low & ~(align_no - 1);
+
+ return(ut_dulint_create(high, low));
+}
+
+/********************************************************//**
+Rounds a dulint upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+ dulint n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number which must be a
+ power of 2 */
+{
+ return(ut_dulint_align_down(ut_dulint_add(n, align_no - 1), align_no));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t downward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return(n & ~((ib_uint64_t) align_no - 1));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+ ib_uint64_t n, /*!< in: number to be rounded */
+ ulint align_no) /*!< in: align by this number
+ which must be a power of 2 */
+{
+ ib_uint64_t align_1 = (ib_uint64_t) align_no - 1;
+
+ ut_ad(align_no > 0);
+ ut_ad(ut_is_2pow(align_no));
+
+ return((n + align_1) & ~align_1);
+}
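All four alignment helpers above require align_no to be a power of 2 (asserted with ut_ad), which lets the rounding be a single mask operation. A small sketch under the same assumption (illustrative only, not part of the patch):

static void
ut_align_round_example(void)
/*========================*/
{
	ib_uint64_t	n = 1000;
	dulint		d = ut_dulint_create(0, 1000);

	ut_a(ut_uint64_align_down(n, 256) == 768);	/* 3 * 256 */
	ut_a(ut_uint64_align_up(n, 256) == 1024);	/* 4 * 256 */
	ut_a(ut_dulint_get_low(ut_dulint_align_down(d, 256)) == 768);
}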
+
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint)ptr) + align_no - 1) & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return((void*)((((ulint)ptr)) & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+ const void* ptr, /*!< in: pointer */
+ ulint align_no) /*!< in: align by this number */
+{
+ ut_ad(align_no > 0);
+ ut_ad(((align_no - 1) & align_no) == 0);
+ ut_ad(ptr);
+
+ ut_ad(sizeof(void*) == sizeof(ulint));
+
+ return(((ulint)ptr) & (align_no - 1));
+}
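ut_align(), ut_align_down() and ut_align_offset() assume that a pointer fits in a ulint (asserted above) and that align_no is a power of 2. A typical use is carving an aligned block out of an over-allocated buffer, sketched below (illustrative only, not part of the patch):

static void
ut_ptr_align_example(void)
/*======================*/
{
	byte	buf[64 + 8];
	void*	aligned = ut_align(buf, 8);	/* first 8-byte boundary at or after buf */

	ut_a(ut_align_offset(aligned, 8) == 0);
	ut_a(ut_align_down((byte*) aligned + 3, 8) == aligned);
}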
+
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n) /*!< in: nth bit requested */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ return(1 & (a >> n));
+}
+
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+ ulint a, /*!< in: ulint */
+ ulint n, /*!< in: nth bit requested */
+ ibool val) /*!< in: value for the bit to set */
+{
+ ut_ad(n < 8 * sizeof(ulint));
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+ if (val) {
+ return(((ulint) 1 << n) | a);
+ } else {
+ return(~((ulint) 1 << n) & a);
+ }
+}
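A short sketch of the bit helpers above; bit 0 is the least significant bit (illustrative only, not part of the patch):

static void
ut_bit_example(void)
/*================*/
{
	ulint	flags = 0;

	flags = ut_bit_set_nth(flags, 3, TRUE);		/* flags == 8 */
	ut_a(ut_bit_get_nth(flags, 3));

	flags = ut_bit_set_nth(flags, 3, FALSE);	/* flags == 0 */
	ut_a(!ut_bit_get_nth(flags, 3));
}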
diff --git a/storage/xtradb/include/ut0dbg.h b/storage/xtradb/include/ut0dbg.h
new file mode 100644
index 00000000000..78b525c38ab
--- /dev/null
+++ b/storage/xtradb/include/ut0dbg.h
@@ -0,0 +1,175 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#include "univ.i"
+#include <stdlib.h>
+#include "os0thread.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+/** Test if an assertion fails.
+@param EXPR assertion expression
+@return nonzero if EXPR holds, zero if not */
+# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR)))
+#else
+/** This is used to eliminate compiler warnings */
+extern ulint ut_dbg_zero;
+/** Test if an assertion fails.
+@param EXPR assertion expression
+@return nonzero if EXPR holds, zero if not */
+# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero)
+#endif
+
+/*************************************************************//**
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion */
+ const char* file, /*!< in: source file containing the assertion */
+ ulint line); /*!< in: line number of the assertion */
+
+#ifdef __NETWARE__
+/** Flag for ignoring further assertion failures. This is set to TRUE
+when, on NetWare, an InnoDB assertion failure or other fatal error
+condition requires an immediate shutdown. */
+extern ibool panic_shutdown;
+/* Abort the execution. */
+void ut_dbg_panic(void);
+# define UT_DBG_PANIC ut_dbg_panic()
+/* Stop threads in ut_a(). */
+# define UT_DBG_STOP do {} while (0) /* We do not do this on NetWare */
+#else /* __NETWARE__ */
+# if defined(__WIN__) || defined(__INTEL_COMPILER)
+# undef UT_DBG_USE_ABORT
+# elif defined(__GNUC__) && (__GNUC__ > 2)
+# define UT_DBG_USE_ABORT
+# endif
+
+# ifndef UT_DBG_USE_ABORT
+/** A null pointer that will be dereferenced to trigger a memory trap */
+extern ulint* ut_dbg_null_ptr;
+# endif
+
+# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/** If this is set to TRUE by ut_dbg_assertion_failed(), all threads
+will stop at the next ut_a() or ut_ad(). */
+extern ibool ut_dbg_stop_threads;
+
+/*************************************************************//**
+Stop a thread after assertion failure. */
+UNIV_INTERN
+void
+ut_dbg_stop_thread(
+/*===============*/
+ const char* file,
+ ulint line);
+# endif
+
+# ifdef UT_DBG_USE_ABORT
+/** Abort the execution. */
+# define UT_DBG_PANIC abort()
+/** Stop threads (null operation) */
+# define UT_DBG_STOP do {} while (0)
+# else /* UT_DBG_USE_ABORT */
+/** Abort the execution. */
+# define UT_DBG_PANIC \
+ if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL
+/** Stop threads in ut_a(). */
+# define UT_DBG_STOP do \
+ if (UNIV_UNLIKELY(ut_dbg_stop_threads)) { \
+ ut_dbg_stop_thread(__FILE__, (ulint) __LINE__); \
+ } while (0)
+# endif /* UT_DBG_USE_ABORT */
+#endif /* __NETWARE__ */
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR assertion expression that should hold */
+#define ut_a(EXPR) do { \
+ if (UT_DBG_FAIL(EXPR)) { \
+ ut_dbg_assertion_failed(#EXPR, \
+ __FILE__, (ulint) __LINE__); \
+ UT_DBG_PANIC; \
+ } \
+ UT_DBG_STOP; \
+} while (0)
+
+/** Abort execution. */
+#define ut_error do { \
+ ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__); \
+ UT_DBG_PANIC; \
+} while (0)
+
+#ifdef UNIV_DEBUG
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR) ut_a(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR) do {EXPR;} while (0)
+#else
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)
+#endif
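ut_a() is compiled into every build and reports the failed expression, file and line before aborting; ut_ad() and ut_d() vanish unless UNIV_DEBUG is defined; ut_error aborts unconditionally. A usage sketch, with a hypothetical n_pages argument and assuming <stdio.h> is available (illustrative only, not part of the patch):

static void
ut_dbg_example(
/*===========*/
	ulint	n_pages)	/* hypothetical value, for illustration only */
{
	ut_a(n_pages > 0);		/* checked in release and debug builds */
	ut_ad(n_pages < 1000000);	/* debug-only sanity check */
	ut_d(fprintf(stderr, "InnoDB: %lu pages\n", (ulong) n_pages));

	switch (n_pages % 2) {
	case 0:
	case 1:
		break;
	default:
		ut_error;		/* unreachable: report file/line and abort */
	}
}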
+
+/** Silence warnings about an unused variable by doing a null assignment.
+@param A the unused variable */
+#define UT_NOT_USED(A) A = A
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** structure used for recording usage statistics */
+typedef struct speedo_struct {
+ struct rusage ru; /*!< getrusage() result */
+ struct timeval tv; /*!< gettimeofday() result */
+} speedo_t;
+
+/*******************************************************************//**
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+ speedo_t* speedo); /*!< out: speedo */
+
+/*******************************************************************//**
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+ const speedo_t* speedo); /*!< in: speedo */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+#endif
diff --git a/storage/xtradb/include/ut0list.h b/storage/xtradb/include/ut0list.h
new file mode 100644
index 00000000000..ec67f4e2a0f
--- /dev/null
+++ b/storage/xtradb/include/ut0list.h
@@ -0,0 +1,172 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list and cannot
+embed the list pointers in the data, when a data item needs to be stored
+in multiple lists, etc.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+typedef struct ib_list_struct ib_list_t;
+typedef struct ib_list_node_struct ib_list_node_t;
+typedef struct ib_list_helper_struct ib_list_helper_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+
+/****************************************************************//**
+Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for
+lists created with this function.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Free a list. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Add the data to the start of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* prev_node, /*!< in: node preceding new node (can
+ be NULL) */
+ void* data, /*!< in: data */
+ mem_heap_t* heap); /*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node); /*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list); /*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list); /*!< in: list */
+
+/* List. */
+struct ib_list_struct {
+ ib_list_node_t* first; /*!< first node */
+ ib_list_node_t* last; /*!< last node */
+ ibool is_heap_list; /*!< TRUE if this list was
+ allocated through a heap */
+};
+
+/* A list node. */
+struct ib_list_node_struct {
+ ib_list_node_t* prev; /*!< previous node */
+ ib_list_node_t* next; /*!< next node */
+ void* data; /*!< user data */
+};
+
+/* Quite often, the only additional piece of data you need is the per-item
+memory heap, so we have this generic struct available to use in those
+cases. */
+struct ib_list_helper_struct {
+ mem_heap_t* heap; /*!< memory heap */
+ void* data; /*!< user data */
+};
+
+#ifndef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+#endif
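A usage sketch for the API above: the list nodes are allocated from the same heap that owns the items, so a single mem_heap_free() releases everything. The my_item_t payload type is hypothetical (illustrative only, not part of the patch):

static void
ib_list_example(void)
/*=================*/
{
	mem_heap_t*	heap = mem_heap_create(256);
	ib_list_t*	list = ib_list_create();	/* must be freed with ib_list_free() */
	ib_list_node_t*	node;
	my_item_t*	item;

	item = mem_heap_alloc(heap, sizeof(my_item_t));	/* hypothetical item type */
	ib_list_add_last(list, item, heap);		/* node memory comes from heap */

	for (node = ib_list_get_first(list); node; node = node->next) {
		my_item_t*	it = node->data;	/* the stored pointer */
		/* ... use it ... */
	}

	ib_list_free(list);
	mem_heap_free(heap);	/* frees the list nodes together with the items */
}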
diff --git a/storage/xtradb/include/ut0list.ic b/storage/xtradb/include/ut0list.ic
new file mode 100644
index 00000000000..eb5c62796e8
--- /dev/null
+++ b/storage/xtradb/include/ut0list.ic
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.ic
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get the first node in the list.
+@return first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->first);
+}
+
+/****************************************************************//**
+Get the last node in the list.
+@return last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+ ib_list_t* list) /*!< in: list */
+{
+ return(list->last);
+}
diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h
new file mode 100644
index 00000000000..245dfc226c3
--- /dev/null
+++ b/storage/xtradb/include/ut0lst.h
@@ -0,0 +1,304 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+#include "univ.i"
+
+/* This module implements the two-way linear list that should be used
+whenever a list is needed in the database code. Note that a single struct
+may belong to two or more lists, provided that the lists are given
+different names. An example of the usage of the lists can be found in
+fil0fil.c. */
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which acts
+as the two-way list base node. The base node contains pointers
+to both ends of the list and a count of nodes in the list (excluding
+the base node from the count).
+@param TYPE the name of the list node data type */
+#define UT_LIST_BASE_NODE_T(TYPE)\
+struct {\
+ ulint count; /*!< count of nodes in list */\
+ TYPE * start; /*!< pointer to list start, NULL if empty */\
+ TYPE * end; /*!< pointer to list end, NULL if empty */\
+}\
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which
+should be embedded in the nodes of the list, the node type must be a struct.
+This struct contains the pointers to next and previous nodes in the list.
+The name of the field in the node struct should be the name given
+to the list.
+@param TYPE the list node type name */
+/* Example:
+typedef struct LRU_node_struct LRU_node_t;
+struct LRU_node_struct {
+ UT_LIST_NODE_T(LRU_node_t) LRU_list;
+ ...
+}
+The example implements an LRU list of name LRU_list. Its nodes are of type
+LRU_node_t. */
+
+#define UT_LIST_NODE_T(TYPE)\
+struct {\
+ TYPE * prev; /*!< pointer to the previous node,\
+ NULL if start of list */\
+ TYPE * next; /*!< pointer to next node, NULL if end of list */\
+}\
+
+/*******************************************************************//**
+Initializes the base node of a two-way list.
+@param BASE the list base node
+*/
+#define UT_LIST_INIT(BASE)\
+{\
+ (BASE).count = 0;\
+ (BASE).start = NULL;\
+ (BASE).end = NULL;\
+}\
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param N pointer to the node to be added to the list.
+*/
+#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ((BASE).count)++;\
+ ((N)->NAME).next = (BASE).start;\
+ ((N)->NAME).prev = NULL;\
+ if (UNIV_LIKELY((BASE).start != NULL)) {\
+ ut_ad((BASE).start != (N));\
+ (((BASE).start)->NAME).prev = (N);\
+ }\
+ (BASE).start = (N);\
+ if (UNIV_UNLIKELY((BASE).end == NULL)) {\
+ (BASE).end = (N);\
+ }\
+}\
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param N pointer to the node to be added to the list
+*/
+#define UT_LIST_ADD_LAST(NAME, BASE, N)\
+{\
+ ut_ad(N);\
+ ((BASE).count)++;\
+ ((N)->NAME).prev = (BASE).end;\
+ ((N)->NAME).next = NULL;\
+ if ((BASE).end != NULL) {\
+ ut_ad((BASE).end != (N));\
+ (((BASE).end)->NAME).next = (N);\
+ }\
+ (BASE).end = (N);\
+ if ((BASE).start == NULL) {\
+ (BASE).start = (N);\
+ }\
+}\
+
+/*******************************************************************//**
+Inserts NODE2 after NODE1 in a list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param NODE1 pointer to node after which NODE2 is inserted
+@param NODE2 pointer to node being inserted after NODE1
+*/
+#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\
+{\
+ ut_ad(NODE1);\
+ ut_ad(NODE2);\
+ ut_ad((NODE1) != (NODE2));\
+ ((BASE).count)++;\
+ ((NODE2)->NAME).prev = (NODE1);\
+ ((NODE2)->NAME).next = ((NODE1)->NAME).next;\
+ if (((NODE1)->NAME).next != NULL) {\
+ ((((NODE1)->NAME).next)->NAME).prev = (NODE2);\
+ }\
+ ((NODE1)->NAME).next = (NODE2);\
+ if ((BASE).end == (NODE1)) {\
+ (BASE).end = (NODE2);\
+ }\
+}\
+
+#ifdef UNIV_LIST_DEBUG
+/** Invalidate the pointers in a list node.
+@param NAME list name
+@param N pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(NAME, N) \
+((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+#else
+/** Invalidate the pointers in a list node.
+@param NAME list name
+@param N pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(NAME, N) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param NAME list name
+@param BASE the base node (not a pointer to it)
+@param N pointer to the node to be removed from the list
+*/
+#define UT_LIST_REMOVE(NAME, BASE, N) \
+do { \
+ ut_ad(N); \
+ ut_a((BASE).count > 0); \
+ ((BASE).count)--; \
+ if (((N)->NAME).next != NULL) { \
+ ((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev; \
+ } else { \
+ (BASE).end = ((N)->NAME).prev; \
+ } \
+ if (((N)->NAME).prev != NULL) { \
+ ((((N)->NAME).prev)->NAME).next = ((N)->NAME).next; \
+ } else { \
+ (BASE).start = ((N)->NAME).next; \
+ } \
+ UT_LIST_REMOVE_CLEAR(NAME, N); \
+} while (0)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)\
+ (((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME list name
+@param N pointer to a node
+@return the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)\
+ (((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE the base node (not a pointer to it).
+@return the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)\
+ (BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)\
+ (BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE the base node (not a pointer to it)
+@return last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)\
+ (BASE).end
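A sketch tying the macros together, following the LRU_list example from the UT_LIST_NODE_T comment above; the list pointers are embedded in the nodes, so no separate node allocation is needed (illustrative only, not part of the patch):

typedef struct LRU_node_struct LRU_node_t;
struct LRU_node_struct {
	ulint				page_no;	/* payload */
	UT_LIST_NODE_T(LRU_node_t)	LRU_list;	/* embedded list pointers */
};

static void
ut_lst_example(void)
/*================*/
{
	UT_LIST_BASE_NODE_T(LRU_node_t)	base;
	LRU_node_t			a;
	LRU_node_t			b;
	LRU_node_t*			node;

	UT_LIST_INIT(base);
	UT_LIST_ADD_LAST(LRU_list, base, &a);
	UT_LIST_ADD_FIRST(LRU_list, base, &b);	/* list order is now b, a */

	for (node = UT_LIST_GET_FIRST(base);
	     node != NULL;
	     node = UT_LIST_GET_NEXT(LRU_list, node)) {
		/* ... visit node ... */
	}

	ut_a(UT_LIST_GET_LEN(base) == 2);
	ut_a(UT_LIST_GET_LAST(base) == &a);

	UT_LIST_REMOVE(LRU_list, base, &b);
	ut_a(UT_LIST_GET_FIRST(base) == &a);
}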
+
+/********************************************************************//**
+Checks the consistency of a two-way list.
+@param NAME the name of the list
+@param TYPE node type
+@param BASE base node (not a pointer to it)
+@param ASSERTION a condition on ut_list_node_313 */
+#define UT_LIST_VALIDATE(NAME, TYPE, BASE, ASSERTION) \
+do { \
+ ulint ut_list_i_313; \
+ TYPE* ut_list_node_313; \
+ \
+ ut_list_node_313 = (BASE).start; \
+ \
+ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
+ ut_a(ut_list_node_313); \
+ ASSERTION; \
+ ut_ad((ut_list_node_313->NAME).next || !ut_list_i_313); \
+ ut_list_node_313 = (ut_list_node_313->NAME).next; \
+ } \
+ \
+ ut_a(ut_list_node_313 == NULL); \
+ \
+ ut_list_node_313 = (BASE).end; \
+ \
+ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
+ ut_a(ut_list_node_313); \
+ ASSERTION; \
+ ut_ad((ut_list_node_313->NAME).prev || !ut_list_i_313); \
+ ut_list_node_313 = (ut_list_node_313->NAME).prev; \
+ } \
+ \
+ ut_a(ut_list_node_313 == NULL); \
+} while (0)
+
+/********************************************************************//**
+Adjusts the node pointers of a list after the nodes have been moved
+in memory.
+@param NAME the name of the list
+@param TYPE node type
+@param BASE base node (not a pointer to it)
+@param FADDR pivot address the nodes were moved around
+@param FOFFSET offset added to pointers that lie above FADDR
+@param BOFFSET offset added to pointers at or below FADDR */
+#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET) \
+do { \
+ ulint ut_list_i_313; \
+ TYPE* ut_list_node_313; \
+ \
+ if ((BASE).start) \
+ (BASE).start = (void*)((byte*)((BASE).start) \
+ + (((void*)((BASE).start) > (void*)FADDR)?FOFFSET:BOFFSET));\
+ if ((BASE).end) \
+ (BASE).end = (void*)((byte*)((BASE).end) \
+ + (((void*)((BASE).end) > (void*)FADDR)?FOFFSET:BOFFSET));\
+ \
+ ut_list_node_313 = (BASE).start; \
+ \
+ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
+ ut_a(ut_list_node_313); \
+ if ((ut_list_node_313->NAME).prev) \
+ (ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\
+ + (((void*)((ut_list_node_313->NAME).prev) > (void*)FADDR)?FOFFSET:BOFFSET));\
+ if ((ut_list_node_313->NAME).next) \
+ (ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\
+ + (((void*)((ut_list_node_313->NAME).next)> (void*)FADDR)?FOFFSET:BOFFSET));\
+ ut_list_node_313 = (ut_list_node_313->NAME).next; \
+ } \
+ \
+ ut_a(ut_list_node_313 == NULL); \
+ \
+ ut_list_node_313 = (BASE).end; \
+ \
+ for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) { \
+ ut_a(ut_list_node_313); \
+ ut_list_node_313 = (ut_list_node_313->NAME).prev; \
+ } \
+ \
+ ut_a(ut_list_node_313 == NULL); \
+} while (0)
+
+#endif
+
diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h
new file mode 100644
index 00000000000..f14606be966
--- /dev/null
+++ b/storage/xtradb/include/ut0mem.h
@@ -0,0 +1,307 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+#include <string.h>
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc(). Does not count malloc()
+if srv_use_sys_malloc is set. Protected by ut_list_mutex. */
+extern ulint ut_total_allocated_memory;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+extern os_fast_mutex_t ut_list_mutex;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1 in: first memory block to compare
+* @param str2 in: second memory block to compare
+* @param n in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n);
+
+/**********************************************************************//**
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void);
+/*=============*/
+
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined and set_to_zero is TRUE.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+ ulint n, /*!< in: number of bytes to allocate */
+ ibool set_to_zero, /*!< in: TRUE if allocated memory
+ should be set to zero if
+ UNIV_SET_MEM_TO_ZERO is defined */
+ ibool assert_on_error); /*!< in: if TRUE, we crash mysqld if
+ the memory cannot be allocated */
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc(
+/*======*/
+ ulint n); /*!< in: number of bytes to allocate */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
+out, so it cannot be used if we want to return an error message instead.
+Prints a message to stderr if the allocation fails.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+ut_test_malloc(
+/*===========*/
+ ulint n); /*!< in: try to allocate this many bytes */
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is
+a nop. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+ void* ptr); /*!< in, own: memory block */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+ realloc() changes the size of the memory block pointed to
+ by ptr to size bytes. The contents will be unchanged to
+ the minimum of the old and new sizes; newly allocated mem­
+ ory will be uninitialized. If ptr is NULL, the call is
+ equivalent to malloc(size); if size is equal to zero, the
+ call is equivalent to free(ptr). Unless ptr is NULL, it
+ must have been returned by an earlier call to malloc(),
+ calloc() or realloc().
+
+RETURN VALUE
+ realloc() returns a pointer to the newly allocated memory,
+ which is suitably aligned for any kind of variable and may
+ be different from ptr, or NULL if the request fails. If
+ size was equal to 0, either NULL or a pointer suitable to
+ be passed to free() is returned. If realloc() fails the
+ original block is left untouched - it is not freed or
+ moved.
+@return own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+ void* ptr, /*!< in: pointer to old block or NULL */
+ ulint size); /*!< in: desired size */
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void);
+/*=================*/
+#endif /* !UNIV_HOTBACKUP */
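A short sketch of the allocator wrappers declared above; ut_malloc() goes through ut_malloc_low() with assert_on_error set, so it aborts rather than returning NULL (illustrative only, not part of the patch):

static void
ut_mem_example(void)
/*================*/
{
	char*	buf = ut_malloc(64);	/* never returns NULL; asserts instead */

	ut_memcpy(buf, "hello", 6);
	buf = ut_realloc(buf, 128);	/* only meant for pars/lexyy.c; mem0mem.h is preferred */
	ut_free(buf);			/* ut_free(NULL) would be a no-op */
}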
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest in: copy to
+* @param sour in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour);
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str);
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1 in: first string to compare
+* @param str2 in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2);
+
+/**********************************************************************//**
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size); /*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size); /*!< in: size of destination buffer */
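A sketch of the two strlcpy variants above; since both return strlen(src), comparing the return value against the buffer size detects truncation (illustrative only, not part of the patch):

static void
ut_strlcpy_example(void)
/*====================*/
{
	char	buf[8];
	ulint	len = ut_strlcpy(buf, "/a/very/long/path", sizeof(buf));

	if (len >= sizeof(buf)) {
		/* truncated: buf holds the first 7 bytes, "/a/very" */
	}

	ut_strlcpy_rev(buf, "/a/very/long/path", sizeof(buf));
	/* buf now holds the last 7 bytes, "ng/path" */
}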
+
+/**********************************************************************//**
+Compute strlen(ut_strcpyq(str, q)).
+@return length of the string when quoted */
+UNIV_INLINE
+ulint
+ut_strlenq(
+/*=======*/
+ const char* str, /*!< in: null-terminated string */
+ char q); /*!< in: the quote character */
+
+/**********************************************************************//**
+Make a quoted copy of a NUL-terminated string. Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_memcpyq().
+@return pointer to end of dest */
+UNIV_INTERN
+char*
+ut_strcpyq(
+/*=======*/
+ char* dest, /*!< in: output buffer */
+ char q, /*!< in: the quote character */
+ const char* src); /*!< in: null-terminated string */
+
+/**********************************************************************//**
+Make a quoted copy of a fixed-length string. Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_strcpyq().
+@return pointer to end of dest */
+UNIV_INTERN
+char*
+ut_memcpyq(
+/*=======*/
+ char* dest, /*!< in: output buffer */
+ char q, /*!< in: the quote character */
+ const char* src, /*!< in: string to be quoted */
+ ulint len); /*!< in: length of src */
+
+/**********************************************************************//**
+Return the number of times s2 occurs in s1. Overlapping instances of s2
+are only counted once.
+@return the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+ const char* s1, /*!< in: string to search in */
+ const char* s2); /*!< in: string to search for */
+
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once.
+@return own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+ const char* str, /*!< in: string to operate on */
+ const char* s1, /*!< in: string to replace */
+ const char* s2); /*!< in: string to replace s1 with */
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size); /*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size); /*!< in: output buffer size
+ in bytes */
+
+#ifndef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic
new file mode 100644
index 00000000000..f36c28f1989
--- /dev/null
+++ b/storage/xtradb/include/ut0mem.ic
@@ -0,0 +1,338 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/** Wrapper for memcpy(3). Copy memory area when the source and
+target are not overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n)
+{
+ return(memcpy(dest, sour, n));
+}
+
+/** Wrapper for memmove(3). Copy memory area when the source and
+target are overlapping.
+* @param dest in: copy to
+* @param sour in: copy from
+* @param n in: number of bytes to copy
+* @return dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n)
+{
+ return(memmove(dest, sour, n));
+}
+
+/** Wrapper for memcmp(3). Compare memory areas.
+* @param str1 in: first memory block to compare
+* @param str2 in: second memory block to compare
+* @param n in: number of bytes to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n)
+{
+ return(memcmp(str1, str2, n));
+}
+
+/** Wrapper for strcpy(3). Copy a NUL-terminated string.
+* @param dest in: copy to
+* @param sour in: copy from
+* @return dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour)
+{
+ return(strcpy(dest, sour));
+}
+
+/** Wrapper for strlen(3). Determine the length of a NUL-terminated string.
+* @param str in: string
+* @return length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str)
+{
+ return(strlen(str));
+}
+
+/** Wrapper for strcmp(3). Compare NUL-terminated strings.
+* @param str1 in: first string to compare
+* @param str2 in: second string to compare
+* @return negative, 0, or positive if str1 is smaller, equal,
+ or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2)
+{
+ return(strcmp(str1, str2));
+}
+
+/**********************************************************************//**
+Compute strlen(ut_strcpyq(str, q)).
+@return length of the string when quoted */
+UNIV_INLINE
+ulint
+ut_strlenq(
+/*=======*/
+ const char* str, /*!< in: null-terminated string */
+ char q) /*!< in: the quote character */
+{
+ ulint len;
+
+ for (len = 0; *str; len++, str++) {
+ if (*str == q) {
+ len++;
+ }
+ }
+
+ return(len);
+}
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex"; make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+ const void* raw, /*!< in: raw data */
+ ulint raw_size, /*!< in: "raw" length in bytes */
+ char* hex, /*!< out: hex string */
+ ulint hex_size) /*!< in: "hex" size in bytes */
+{
+
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u) ((unsigned char) ((u) >> 8))
+#define UINT16_GET_B(u) ((unsigned char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u) ((unsigned char) ((u) & 0xFF))
+#define UINT16_GET_B(u) ((unsigned char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+
+#define MK_ALL_UINT16_WITH_A(a) \
+ MK_UINT16(a, '0'), \
+ MK_UINT16(a, '1'), \
+ MK_UINT16(a, '2'), \
+ MK_UINT16(a, '3'), \
+ MK_UINT16(a, '4'), \
+ MK_UINT16(a, '5'), \
+ MK_UINT16(a, '6'), \
+ MK_UINT16(a, '7'), \
+ MK_UINT16(a, '8'), \
+ MK_UINT16(a, '9'), \
+ MK_UINT16(a, 'A'), \
+ MK_UINT16(a, 'B'), \
+ MK_UINT16(a, 'C'), \
+ MK_UINT16(a, 'D'), \
+ MK_UINT16(a, 'E'), \
+ MK_UINT16(a, 'F')
+
+ static const uint16 hex_map[256] = {
+ MK_ALL_UINT16_WITH_A('0'),
+ MK_ALL_UINT16_WITH_A('1'),
+ MK_ALL_UINT16_WITH_A('2'),
+ MK_ALL_UINT16_WITH_A('3'),
+ MK_ALL_UINT16_WITH_A('4'),
+ MK_ALL_UINT16_WITH_A('5'),
+ MK_ALL_UINT16_WITH_A('6'),
+ MK_ALL_UINT16_WITH_A('7'),
+ MK_ALL_UINT16_WITH_A('8'),
+ MK_ALL_UINT16_WITH_A('9'),
+ MK_ALL_UINT16_WITH_A('A'),
+ MK_ALL_UINT16_WITH_A('B'),
+ MK_ALL_UINT16_WITH_A('C'),
+ MK_ALL_UINT16_WITH_A('D'),
+ MK_ALL_UINT16_WITH_A('E'),
+ MK_ALL_UINT16_WITH_A('F')
+ };
+ const unsigned char* rawc;
+ ulint read_bytes;
+ ulint write_bytes;
+ ulint i;
+
+ rawc = (const unsigned char*) raw;
+
+ if (hex_size == 0) {
+
+ return(0);
+ }
+
+ if (hex_size <= 2 * raw_size) {
+
+ read_bytes = hex_size / 2;
+ write_bytes = hex_size;
+ } else {
+
+ read_bytes = raw_size;
+ write_bytes = 2 * raw_size + 1;
+ }
+
+#define LOOP_READ_BYTES(ASSIGN) \
+ for (i = 0; i < read_bytes; i++) { \
+ ASSIGN; \
+ hex += 2; \
+ rawc++; \
+ }
+
+ if (ut_align_offset(hex, 2) == 0) {
+
+ LOOP_READ_BYTES(
+ *(uint16*) hex = hex_map[*rawc]
+ );
+ } else {
+
+ LOOP_READ_BYTES(
+ *hex = UINT16_GET_A(hex_map[*rawc]);
+ *(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+ );
+ }
+
+ if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+
+ hex--;
+ }
+
+ *hex = '\0';
+
+ return(write_bytes);
+}
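A sketch of ut_raw_to_hex(): 2 * raw_size + 1 output bytes avoid truncation, and the return value counts the terminating NUL (illustrative only, not part of the patch):

static void
ut_raw_to_hex_example(void)
/*=======================*/
{
	const byte	raw[4] = {0xDE, 0xAD, 0xBE, 0xEF};
	char		hex[2 * sizeof(raw) + 1];
	ulint		n = ut_raw_to_hex(raw, sizeof(raw), hex, sizeof(hex));

	ut_a(n == 9);				/* 8 hex digits + NUL */
	ut_a(0 == ut_strcmp(hex, "DEADBEEF"));
}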
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+ const char* str, /*!< in: string */
+ ulint str_len, /*!< in: string length in bytes */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint str_i;
+ ulint buf_i;
+
+ buf_i = 0;
+
+ switch (buf_size) {
+ case 3:
+
+ if (str_len == 0) {
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\'';
+ buf_i++;
+ }
+ /* FALLTHROUGH */
+ case 2:
+ case 1:
+
+ buf[buf_i] = '\0';
+ buf_i++;
+ /* FALLTHROUGH */
+ case 0:
+
+ return(buf_i);
+ }
+
+ /* buf_size >= 4 */
+
+ buf[0] = '\'';
+ buf_i = 1;
+
+ for (str_i = 0; str_i < str_len; str_i++) {
+
+ char ch;
+
+ if (buf_size - buf_i == 2) {
+
+ break;
+ }
+
+ ch = str[str_i];
+
+ switch (ch) {
+ case '\0':
+
+ if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = '\\';
+ buf_i++;
+ buf[buf_i] = '0';
+ buf_i++;
+ break;
+ case '\'':
+ case '\\':
+
+ if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+ goto func_exit;
+ }
+ buf[buf_i] = ch;
+ buf_i++;
+ /* FALLTHROUGH */
+ default:
+
+ buf[buf_i] = ch;
+ buf_i++;
+ }
+ }
+
+func_exit:
+
+ buf[buf_i] = '\'';
+ buf_i++;
+ buf[buf_i] = '\0';
+ buf_i++;
+
+ return(buf_i);
+}
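A sketch of ut_str_sql_format(): embedded quotes and backslashes are doubled and the result is always NUL-terminated (illustrative only, not part of the patch):

static void
ut_str_sql_format_example(void)
/*===========================*/
{
	char	buf[32];
	ulint	len = ut_str_sql_format("it's", 4, buf, sizeof(buf));

	/* buf now holds 'it''s' (seven characters) and len == 8,
	counting the terminating NUL */
	ut_a(len == 8);
	ut_a(0 == ut_strcmp(buf, "'it''s'"));
}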
diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h
new file mode 100644
index 00000000000..6fd050acfe7
--- /dev/null
+++ b/storage/xtradb/include/ut0rbt.h
@@ -0,0 +1,309 @@
+/*****************************************************************************
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0rbt.h
+Red-Black tree implementation.
+
+Created 2007-03-20 Sunny Bains
+************************************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "univ.i"
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define ut_malloc malloc
+#define ut_free free
+#define ulint unsigned long
+#define ut_a(c) assert(c)
+#define ut_error assert(0)
+#define ibool unsigned int
+#define TRUE 1
+#define FALSE 0
+#endif
+
+/* Red black tree typedefs */
+typedef struct ib_rbt_struct ib_rbt_t;
+typedef struct ib_rbt_node_struct ib_rbt_node_t;
+/* FIXME: Iterator is a better name than _bound_ */
+typedef struct ib_rbt_bound_struct ib_rbt_bound_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+
+/* Red black tree color types */
+enum ib_rbt_color_enum {
+ IB_RBT_RED,
+ IB_RBT_BLACK
+};
+
+typedef enum ib_rbt_color_enum ib_rbt_color_t;
+
+/* Red black tree node */
+struct ib_rbt_node_struct {
+ ib_rbt_color_t color; /* color of this node */
+
+ ib_rbt_node_t* left; /* points left child */
+ ib_rbt_node_t* right; /* points right child */
+ ib_rbt_node_t* parent; /* points parent node */
+
+ char value[1]; /* Data value */
+};
+
+/* Red black tree instance.*/
+struct ib_rbt_struct {
+ ib_rbt_node_t* nil; /* Black colored node that is
+ used as a sentinel. This is
+ pre-allocated too.*/
+
+ ib_rbt_node_t* root; /* Root of the tree, this is
+ pre-allocated and the first
+ data node is the left child.*/
+
+ ulint n_nodes; /* Total number of data nodes */
+
+ ib_rbt_compare compare; /* Fn. to use for comparison */
+ ulint sizeof_value; /* Sizeof the item in bytes */
+};
+
+/* The result of searching for a key in the tree; this is useful for
+a speedy lookup and insert if the key doesn't exist. */
+struct ib_rbt_bound_struct {
+ const ib_rbt_node_t*
+ last; /* Last node visited */
+
+ int result; /* Result of comparing with
+ the last non-nil node that
+ was visited */
+};
+
+/* Size in elements (t is an rb tree instance) */
+#define rbt_size(t) (t->n_nodes)
+
+/* Check whether the rb tree is empty (t is an rb tree instance) */
+#define rbt_empty(t) (rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is an rb tree node instance) */
+#define rbt_value(t, n) ((t*) &n->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node)*/
+#define rbt_compare(t, k, n) (t->compare(k, n->value))
+
+/****************************************************************//**
+Free an instance of a red black tree */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree); /*!< in: rb tree to free */
+/****************************************************************//**
+Create an instance of a red black tree
+@return rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: size in bytes */
+ ib_rbt_compare compare); /*!< in: comparator */
+/****************************************************************//**
+Delete a node from the red black tree, identified by key.
+@return TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to delete */
+/****************************************************************//**
+Remove a node from the rb tree; the node is not freed, that is the
+caller's responsibility.
+@return the deleted node with the const. */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t*
+ node); /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller has access
+ only to const nodes.*/
+/****************************************************************//**
+Find a matching node in the rb tree.
+@return node if found else return NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree to search */
+ const void* key); /*!< in: key to lookup */
+/****************************************************************//**
+Generic insert of a value in the rb tree.
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value); /*!< in: data that will be
+ copied to the node.*/
+/****************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: parent */
+ const void* value); /*!< in: this value is copied
+ to the node */
+/****************************************************************//**
+Return the left most data node in the tree
+@return left most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Return the right most data node in the tree
+@return right most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Return the next node after current.
+@return successor of the node that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /*!< in: current node */
+ current);
+/****************************************************************//**
+Return the previous node before current.
+@return predecessor of the node that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* /*!< in: current node */
+ current);
+/****************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return node that satisfies the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return node that satisfies the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key); /*!< in: key to search */
+/****************************************************************//**
+Search for the key; a node will be returned in parent.last, whether it
+was found or not. If not found, then parent.last will contain the
+parent node for the possibly new key; otherwise, the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key); /*!< in: key to search */
+/****************************************************************//**
+Search for the key; a node will be returned in parent.last, whether it
+was found or not. If not found, then parent.last will contain the
+parent node for the possibly new key; otherwise, the matching node.
+@return result of last comparison */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare); /*!< in: comparator */
+/****************************************************************//**
+Clear the tree, deletes (and free's) all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree); /*!< in: rb tree */
+/****************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+@return no. of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src); /*!< in: src rb tree */
+/****************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+Delete each node from src after copying it to dst. As a side effect,
+duplicates will be left untouched in src, since we don't support
+duplicates (yet). NOTE: src and dst must be similar; the function doesn't
+check for this condition (yet).
+@return no. of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src); /*!< in: src rb tree */
+/****************************************************************//**
+Verify the integrity of the RB tree, for debugging. The check computes
+the height of the tree (in count of black nodes); a height of 0 means failure.
+@return TRUE if OK, FALSE if the tree is invalid */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree); /*!< in: tree to validate */
+/****************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print); /*!< in: print function */
+
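+/* Illustrative usage sketch (not part of the original API; my_compare is a
+hypothetical caller-supplied comparator with the ib_rbt_compare signature,
+and the stored items are assumed to be ulints):
+
+	ib_rbt_t*		tree = rbt_create(sizeof(ulint), my_compare);
+	ib_rbt_bound_t		parent;
+	ulint			key = 42;
+	const ib_rbt_node_t*	node;
+
+	if (rbt_search(tree, &parent, &key) != 0) {
+		rbt_add_node(tree, &parent, &key);
+	}
+
+	node = rbt_lookup(tree, &key);
+	ut_a(*rbt_value(ulint, node) == 42);
+
+	rbt_free(tree);
+*/
+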
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/xtradb/include/ut0rnd.h b/storage/xtradb/include/ut0rnd.h
new file mode 100644
index 00000000000..ad55df40abc
--- /dev/null
+++ b/storage/xtradb/include/ut0rnd.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#include "ut0byte.h"
+
+/** The 'character code' for end of field or string (used
+in folding records) */
+#define UT_END_OF_FIELD 257
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed); /*!< in: seed */
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ ulint rnd); /*!< in: the previous random number value */
+/*********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space (let there be N of them) of ulint integers
+in a pseudo-random fashion. Note that the same integer repeats only
+after N calls to the generator.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void);
+/*==================*/
+/********************************************************//**
+Generates a random integer from a given interval.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+ ulint low, /*!< in: low limit; can generate also this value */
+ ulint high); /*!< in: high limit; can generate also this value */
+/*********************************************************//**
+Generates a random iboolean value.
+@return the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void);
+/*=================*/
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size); /*!< in: hash table size */
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+ __attribute__((const));
+/*************************************************************//**
+Folds a dulint.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+ dulint d) /*!< in: dulint */
+ __attribute__((const));
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+ __attribute__((pure));
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+ __attribute__((pure));
+UNIV_INLINE
+ulint
+ut_fold_binary_32(
+/*==============*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+ __attribute__((pure));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+ __attribute__((const));
+
+
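+/* Illustrative usage sketch (the sizes and the string are hypothetical,
+not part of this header): a common pattern is to size a hash table with
+ut_find_prime() and then map a folded key into it with ut_hash_ulint():
+
+	ulint	n_cells	= ut_find_prime(10000);
+	ulint	fold	= ut_fold_string("test/some_table");
+	ulint	cell	= ut_hash_ulint(fold, n_cells);
+*/
+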
+#ifndef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0rnd.ic b/storage/xtradb/include/ut0rnd.ic
new file mode 100644
index 00000000000..c2043660efd
--- /dev/null
+++ b/storage/xtradb/include/ut0rnd.ic
@@ -0,0 +1,256 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.ic
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK 1463735687
+#define UT_HASH_RANDOM_MASK2 1653893711
+#define UT_RND1 151117737
+#define UT_RND2 119785373
+#define UT_RND3 85689495
+#define UT_RND4 76595339
+#define UT_SUM_RND2 98781234
+#define UT_SUM_RND3 126792457
+#define UT_SUM_RND4 63498502
+#define UT_XOR_RND1 187678878
+#define UT_XOR_RND2 143537923
+
+/** Seed value of ut_rnd_gen_ulint() */
+extern ulint ut_rnd_ulint_counter;
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+ ulint seed) /*!< in: seed */
+{
+ ut_rnd_ulint_counter = seed;
+}
+
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+ ulint rnd) /*!< in: the previous random number value */
+{
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ rnd = UT_RND2 * rnd + UT_SUM_RND3;
+ rnd = UT_XOR_RND1 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND3 * rnd + UT_SUM_RND4;
+ rnd = UT_XOR_RND2 ^ rnd;
+ rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+ rnd = UT_RND1 * rnd + UT_SUM_RND2;
+
+ return(rnd);
+}
+
+/********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space of ulint integers in a pseudo-random
+fashion. Note that the same integer repeats only after
+2^32 calls to the generator (if ulint is 32-bit).
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void)
+/*==================*/
+{
+ ulint rnd;
+ ulint n_bits;
+
+ n_bits = 8 * sizeof(ulint);
+
+ ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2;
+
+ rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter);
+
+ return(rnd);
+}
+
+/********************************************************//**
+Generates a random integer from a given interval.
+@return the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+ ulint low, /*!< in: low limit; can generate also this value */
+ ulint high) /*!< in: high limit; can generate also this value */
+{
+ ulint rnd;
+
+ ut_ad(high >= low);
+
+ if (low == high) {
+
+ return(low);
+ }
+
+ rnd = ut_rnd_gen_ulint();
+
+ return(low + (rnd % (high - low + 1)));
+}
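+
+/* Illustrative usage sketch (some_seed and n_pages are hypothetical): to
+pick a pseudo-random page number in [0, n_pages - 1]:
+
+	ut_rnd_set_seed(some_seed);
+	page_no = ut_rnd_interval(0, n_pages - 1);
+*/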
+
+/*********************************************************//**
+Generates a random iboolean value.
+@return the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void)
+/*=================*/
+{
+ ulint x;
+
+ x = ut_rnd_gen_ulint();
+
+ if (((x >> 20) + (x >> 15)) & 1) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+ ulint key, /*!< in: value to be hashed */
+ ulint table_size) /*!< in: hash table size */
+{
+ ut_ad(table_size);
+ key = key ^ UT_HASH_RANDOM_MASK2;
+
+ return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a pair of ulints.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+ ulint n1, /*!< in: ulint */
+ ulint n2) /*!< in: ulint */
+{
+ return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+ ^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*************************************************************//**
+Folds a dulint.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+ dulint d) /*!< in: dulint */
+{
+ return(ut_fold_ulint_pair(ut_dulint_get_low(d),
+ ut_dulint_get_high(d)));
+}
+
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+ const char* str) /*!< in: null-terminated string */
+{
+ ulint fold = 0;
+
+ ut_ad(str);
+
+ while (*str != '\0') {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+ str++;
+ }
+
+ return(fold);
+}
+
+/*************************************************************//**
+Folds a binary string.
+@return folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+{
+ const byte* str_end = str + len;
+ ulint fold = 0;
+
+ ut_ad(str || !len);
+
+ while (str < str_end) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+
+ str++;
+ }
+
+ return(fold);
+}
+
+UNIV_INLINE
+ulint
+ut_fold_binary_32(
+/*==============*/
+ const byte* str, /*!< in: string of bytes */
+ ulint len) /*!< in: length */
+{
+ const ib_uint32_t* str_end = (const ib_uint32_t*) (str + len);
+ const ib_uint32_t* str_32 = (const ib_uint32_t*) str;
+ ulint fold = 0;
+
+ ut_ad(str);
+ /* This function is only for word-aligned data */
+ ut_ad(len % 4 == 0);
+ ut_ad((ulint)str % 4 == 0);
+
+ while (str_32 < str_end) {
+ fold = ut_fold_ulint_pair(fold, (ulint)(*str_32));
+
+ str_32++;
+ }
+
+ return(fold);
+}
diff --git a/storage/xtradb/include/ut0sort.h b/storage/xtradb/include/ut0sort.h
new file mode 100644
index 00000000000..5c6647dda9e
--- /dev/null
+++ b/storage/xtradb/include/ut0sort.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+#include "univ.i"
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, which has an O(n log n)
+worst case.
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array.
+Also the comparison function has to be defined individually
+for each array cell type. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments.
+CMP_FUN is the comparison function name. It takes as arguments
+two elements from the array and returns 1 if the first is bigger,
+0 if they are equal, and -1 if the second is bigger. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+ ulint ut_sort_mid77;\
+ ulint ut_sort_i77;\
+ ulint ut_sort_low77;\
+ ulint ut_sort_high77;\
+\
+ ut_ad((LOW) < (HIGH));\
+ ut_ad(ARR);\
+ ut_ad(AUX_ARR);\
+\
+ if ((LOW) == (HIGH) - 1) {\
+ return;\
+ } else if ((LOW) == (HIGH) - 2) {\
+ if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+ (AUX_ARR)[LOW] = (ARR)[LOW];\
+ (ARR)[LOW] = (ARR)[(HIGH) - 1];\
+ (ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+ }\
+ return;\
+ }\
+\
+ ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+ SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+ SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+ ut_sort_low77 = (LOW);\
+ ut_sort_high77 = ut_sort_mid77;\
+\
+ for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+ if (ut_sort_low77 >= ut_sort_mid77) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else if (ut_sort_high77 >= (HIGH)) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ } else if (CMP_FUN((ARR)[ut_sort_low77],\
+ (ARR)[ut_sort_high77]) > 0) {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+ ut_sort_high77++;\
+ } else {\
+ (AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+ ut_sort_low77++;\
+ }\
+ }\
+\
+ memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+ ((HIGH) - (LOW)) * sizeof *(ARR));\
+}\
+
+
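+/* Illustrative use of the macro (a sketch, not part of this header): a
+sort function for an array of ulints could be defined as follows, using
+ut_ulint_cmp() from ut0ut.h as the comparison function:
+
+	static void
+	ut_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
+	{
+		UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr,
+				      low, high, ut_ulint_cmp);
+	}
+
+The auxiliary array must be at least as large as the sorted interval;
+low is inclusive and high is exclusive. */
+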
+#endif
+
diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h
new file mode 100644
index 00000000000..197b8401428
--- /dev/null
+++ b/storage/xtradb/include/ut0ut.h
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+#include "univ.i"
+
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#endif /* UNIV_HOTBACKUP */
+
+#include <time.h>
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif
+
+/** Index name prefix in fast index creation */
+#define TEMP_INDEX_PREFIX '\377'
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR "\377"
+
+/** Time stamp */
+typedef time_t ib_time_t;
+
+#ifndef UNIV_HOTBACKUP
+#if defined(HAVE_IB_PAUSE_INSTRUCTION)
+# ifdef WIN32
+ /* In the Win32 API, the x86 PAUSE instruction is executed by calling
+ the YieldProcessor macro defined in WinNT.h. Using YieldProcessor keeps
+ the code CPU-architecture independent. */
+# define UT_RELAX_CPU() YieldProcessor()
+# else
+ /* According to the gcc info page, asm volatile means that the
+ instruction has important side-effects and must not be removed.
+ Also asm volatile may trigger a memory barrier (spilling all registers
+ to memory). */
+# define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
+# endif
+#elif defined(HAVE_ATOMIC_BUILTINS)
+# define UT_RELAX_CPU() do { \
+ volatile lint volatile_var; \
+ os_compare_and_swap_lint(&volatile_var, 0, 1); \
+ } while (0)
+#else
+# define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
+#endif
+
+/*********************************************************************//**
+Delays execution for at most max_wait_us microseconds or returns earlier
+if cond becomes true.
+@param cond in: condition to wait for; evaluated every 2 ms
+@param max_wait_us in: maximum delay to wait, in microseconds */
+#define UT_WAIT_FOR(cond, max_wait_us) \
+do { \
+ ullint start_us; \
+ start_us = ut_time_us(NULL); \
+ while (!(cond) \
+ && ut_time_us(NULL) - start_us < (max_wait_us)) {\
+ \
+ os_thread_sleep(2000 /* 2 ms */); \
+ } \
+} while (0)
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Gets the high 32 bits of a ulint. That is, it makes a shift >> 32,
+but since there seem to be compiler bugs in both gcc and Visual C++,
+we do this by a special conversion.
+@return a >> 32 */
+UNIV_INTERN
+ulint
+ut_get_high32(
+/*==========*/
+ ulint a); /*!< in: ulint */
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2); /*!< in: second number */
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2); /*!< in: second number */
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /*!< out: more significant part of minimum */
+ ulint* b, /*!< out: less significant part of minimum */
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint b1, /*!< in: less significant part of first pair */
+ ulint a2, /*!< in: more significant part of second pair */
+ ulint b2); /*!< in: less significant part of second pair */
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b); /*!< in: ulint */
+/*******************************************************//**
+Compares two pairs of ulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint a2, /*!< in: less significant part of first pair */
+ ulint b1, /*!< in: more significant part of second pair */
+ ulint b2); /*!< in: less significant part of second pair */
+/*************************************************************//**
+Determines if a number is zero or a power of two.
+@param n in: number
+@return nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1)))
+/*************************************************************//**
+Quickly calculates the remainder of n/m when m is a power of two.
+@param n in: numerator
+@param m in: denominator, must be a power of two
+@return the remainder of n/m */
+#define ut_2pow_remainder(n, m) ((n) & ((m) - 1))
+/*************************************************************//**
+Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two. In other words, rounds n down to m * k.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+#define ut_2pow_round(n, m) ((n) & ~((m) - 1))
+/** Align a number down to a multiple of a power of two.
+@param n in: number to round down
+@param m in: alignment, must be a power of two
+@return n rounded down to the biggest possible integer multiple of m */
+#define ut_calc_align_down(n, m) ut_2pow_round(n, m)
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two. In other words, rounds n up to m * k.
+@param n in: number to round up
+@param m in: alignment, must be a power of two
+@return n rounded up to the smallest possible integer multiple of m */
+#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1))
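+/* Worked example (illustrative only): with m = 256,
+ut_2pow_remainder(1000, 256) == 232, ut_2pow_round(1000, 256) == 768 and
+ut_calc_align(1000, 256) == 1024, while ut_is_2pow(256) is nonzero and
+ut_is_2pow(1000) is zero. */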
+/*************************************************************//**
+Quickly calculates the base-2 logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n); /*!< in: number */
+/*************************************************************//**
+Quickly rounds a number up to the nearest power of 2.
+@return first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+ ulint n) /*!< in: number != 0 */
+ __attribute__((const));
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits.
+@param b in: bits
+@return number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8)
+
+/**********************************************************//**
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime.
+@return system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void);
+/*=========*/
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Gets the current system time in seconds and microseconds.
+Upon successful completion, the value 0 is returned; otherwise the
+value -1 is returned and the global variable errno is set to indicate the
+error.
+@return 0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+ ulint* sec, /*!< out: seconds since the Epoch */
+ ulint* ms); /*!< out: microseconds since the Epoch+*sec */
+
+/**********************************************************//**
+Returns the number of microseconds since epoch. Similar to
+time(3), the return value is also stored in *tloc, provided
+that tloc is non-NULL.
+@return us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+ ullint* tloc); /*!< out: us since epoch, if non-NULL */
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void);
+/*============*/
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Returns the difference of two times in seconds.
+@return time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+ ib_time_t time2, /*!< in: time */
+ ib_time_t time1); /*!< in: time */
+/**********************************************************//**
+Prints a timestamp to a file. */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file); /*!< in: file where to print */
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf); /*!< in: buffer where to sprintf */
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+ char* buf); /*!< in: buffer where to sprintf */
+/**********************************************************//**
+Returns current year, month, day. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+ ulint* year, /*!< out: current year */
+ ulint* month, /*!< out: month */
+ ulint* day); /*!< out: day */
+#else /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Runs an idle loop on CPU. The argument gives the desired delay
+in microseconds on 100 MHz Pentium + Visual C++.
+@return dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+ ulint delay); /*!< in: delay in microseconds on 100 MHz Pentium */
+#endif /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ASCII. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len); /*!< in: length of the buffer */
+
+/**********************************************************************//**
+Outputs a NUL-terminated file name, quoted with apostrophes. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const char* name); /*!< in: name to print */
+
+#ifndef UNIV_HOTBACKUP
+/* Forward declaration of transaction handle */
+struct trx_struct;
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+ FILE* f, /*!< in: output stream */
+ struct trx_struct*trx, /*!< in: transaction */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name); /*!< in: name to print */
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+ FILE* f, /*!< in: output stream */
+ struct trx_struct*trx, /*!< in: transaction (NULL=no quotes) */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name, /*!< in: name to print */
+ ulint namelen);/*!< in: length of name */
+
+/**********************************************************************//**
+Catenate files. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src); /*!< in: input file to be appended to output */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+/**********************************************************************//**
+A substitute for snprintf(3), formatted output conversion into
+a limited buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ ...); /*!< in: format values */
+#else
+/**********************************************************************//**
+A wrapper for snprintf(3), formatted output conversion into
+a limited buffer. */
+# define ut_snprintf snprintf
+#endif /* __WIN__ */
+
+#ifndef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#endif
+
diff --git a/storage/xtradb/include/ut0ut.ic b/storage/xtradb/include/ut0ut.ic
new file mode 100644
index 00000000000..6f55c7e410e
--- /dev/null
+++ b/storage/xtradb/include/ut0ut.ic
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.ic
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2) /*!< in: second number */
+{
+ return((n1 <= n2) ? n1 : n2);
+}
+
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+ ulint n1, /*!< in: first number */
+ ulint n2) /*!< in: second number */
+{
+ return((n1 <= n2) ? n2 : n1);
+}
+
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+ ulint* a, /*!< out: more significant part of minimum */
+ ulint* b, /*!< out: less significant part of minimum */
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint b1, /*!< in: less significant part of first pair */
+ ulint a2, /*!< in: more significant part of second pair */
+ ulint b2) /*!< in: less significant part of second pair */
+{
+ if (a1 == a2) {
+ *a = a1;
+ *b = ut_min(b1, b2);
+ } else if (a1 < a2) {
+ *a = a1;
+ *b = b1;
+ } else {
+ *a = a2;
+ *b = b2;
+ }
+}
+
+/******************************************************//**
+Compares two ulints.
+@return 1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+ ulint a, /*!< in: ulint */
+ ulint b) /*!< in: ulint */
+{
+ if (a < b) {
+ return(-1);
+ } else if (a == b) {
+ return(0);
+ } else {
+ return(1);
+ }
+}
+
+/*******************************************************//**
+Compares two pairs of ulints.
+@return -1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+ ulint a1, /*!< in: more significant part of first pair */
+ ulint a2, /*!< in: less significant part of first pair */
+ ulint b1, /*!< in: more significant part of second pair */
+ ulint b2) /*!< in: less significant part of second pair */
+{
+ if (a1 > b1) {
+ return(1);
+ } else if (a1 < b1) {
+ return(-1);
+ } else if (a2 > b2) {
+ return(1);
+ } else if (a2 < b2) {
+ return(-1);
+ } else {
+ return(0);
+ }
+}
+
+/*************************************************************//**
+Quickly calculates the base-2 logarithm of a number, rounded upward to an
+integer.
+@return logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 0;
+
+ ut_ad(n > 0);
+
+ n = n - 1;
+
+ for (;;) {
+ n = n / 2;
+
+ if (n == 0) {
+ break;
+ }
+
+ res++;
+ }
+
+ return(res + 1);
+}
+
+/*************************************************************//**
+Calculates 2 to power n.
+@return 2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+ ulint n) /*!< in: number */
+{
+ return((ulint) 1 << n);
+}
diff --git a/storage/xtradb/include/ut0vec.h b/storage/xtradb/include/ut0vec.h
new file mode 100644
index 00000000000..a770f671cfc
--- /dev/null
+++ b/storage/xtradb/include/ut0vec.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "univ.i"
+#include "mem0mem.h"
+
+/** An automatically resizing vector data type. */
+typedef struct ib_vector_struct ib_vector_t;
+
+/* An automatically resizing vector datatype with the following properties:
+
+ -Contains void* items.
+
+ -The items are owned by the caller.
+
+ -All memory allocation is done through a heap owned by the caller, who is
+ responsible for freeing it when done with the vector.
+
+ -When the vector is resized, the old memory area is left allocated since it
+ uses the same heap as the new memory area, so this is best used for
+ relatively small or short-lived uses.
+*/
+
+/****************************************************************//**
+Create a new vector with the given initial size.
+@return vector */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ mem_heap_t* heap, /*!< in: heap */
+ ulint size); /*!< in: initial size */
+
+/****************************************************************//**
+Push a new element to the vector, increasing its size if necessary. */
+UNIV_INTERN
+void
+ib_vector_push(
+/*===========*/
+ ib_vector_t* vec, /*!< in: vector */
+ void* elem); /*!< in: data element */
+
+/****************************************************************//**
+Get the number of elements in the vector.
+@return number of elements in vector */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n); /*!< in: element index to get */
+
+/****************************************************************//**
+Remove the last element from the vector. */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ ib_vector_t* vec); /*!< in: vector */
+
+/****************************************************************//**
+Free the underlying heap of the vector. Note that vec is invalid
+after this call. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec); /*!< in,own: vector */
+
+/** An automatically resizing vector data type. */
+struct ib_vector_struct {
+ mem_heap_t* heap; /*!< heap */
+ void** data; /*!< data elements */
+ ulint used; /*!< number of elements currently used */
+ ulint total; /*!< number of elements allocated */
+};
+
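+/* Illustrative usage sketch (the heap size and item are hypothetical):
+the vector allocates from a caller-created heap, and ib_vector_free()
+releases that heap (see ut0vec.ic), so the heap must not be freed again:
+
+	mem_heap_t*	heap = mem_heap_create(1024);
+	ib_vector_t*	vec = ib_vector_create(heap, 8);
+
+	ib_vector_push(vec, item);
+
+	while (!ib_vector_is_empty(vec)) {
+		item = ib_vector_pop(vec);
+	}
+
+	ib_vector_free(vec);
+*/
+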
+#ifndef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0vec.ic b/storage/xtradb/include/ut0vec.ic
new file mode 100644
index 00000000000..02e881f9bca
--- /dev/null
+++ b/storage/xtradb/include/ut0vec.ic
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get number of elements in vector.
+@return number of elements in vector */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element.
+@return n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+ ib_vector_t* vec, /*!< in: vector */
+ ulint n) /*!< in: element index to get */
+{
+ ut_a(n < vec->used);
+
+ return(vec->data[n]);
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+ ib_vector_t* vec) /*!< in/out: vector */
+{
+ void* elem;
+
+ ut_a(vec->used > 0);
+ --vec->used;
+ elem = vec->data[vec->used];
+
+ ut_d(vec->data[vec->used] = NULL);
+ UNIV_MEM_INVALID(&vec->data[vec->used], sizeof(*vec->data));
+
+ return(elem);
+}
+
+/****************************************************************//**
+Free the underlying heap of the vector. Note that vec is invalid
+after this call. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+ ib_vector_t* vec) /*!< in, own: vector */
+{
+ mem_heap_free(vec->heap);
+}
+
+/****************************************************************//**
+Test whether a vector is empty or not.
+@return TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+ const ib_vector_t* vec) /*!< in: vector */
+{
+ return(ib_vector_size(vec) == 0);
+}
diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h
new file mode 100644
index 00000000000..2ec0f16ab05
--- /dev/null
+++ b/storage/xtradb/include/ut0wqueue.h
@@ -0,0 +1,85 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#ifndef IB_WORK_QUEUE_H
+#define IB_WORK_QUEUE_H
+
+#include "ut0list.h"
+#include "mem0mem.h"
+#include "os0sync.h"
+#include "sync0types.h"
+
+typedef struct ib_wqueue_struct ib_wqueue_t;
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void);
+/*===================*/
+
+/****************************************************************//**
+Free a work queue. */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/****************************************************************//**
+Add a work item to the queue. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+ ib_wqueue_t* wq, /*!< in: work queue */
+ void* item, /*!< in: work item */
+ mem_heap_t* heap); /*!< in: memory heap to use for allocating the
+ list node */
+
+/****************************************************************//**
+Wait for a work item to appear in the queue.
+@return work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+ ib_wqueue_t* wq); /*!< in: work queue */
+
+/* Work queue. */
+struct ib_wqueue_struct {
+ mutex_t mutex; /*!< mutex protecting everything */
+ ib_list_t* items; /*!< work item list */
+ os_event_t event; /*!< event we use to signal additions to list */
+};
+
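+/* Illustrative usage sketch (the producer/consumer split and the names
+wq, heap and work_item are hypothetical): one thread enqueues items that
+another thread blocks on:
+
+	producer thread:
+		ib_wqueue_add(wq, work_item, heap);
+
+	consumer thread:
+		work_item = ib_wqueue_wait(wq);
+
+The queue itself is created once with ib_wqueue_create() and released
+with ib_wqueue_free() when no thread uses it any more. */
+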
+#endif
diff --git a/storage/xtradb/lock/lock0iter.c b/storage/xtradb/lock/lock0iter.c
new file mode 100644
index 00000000000..51d1802ccde
--- /dev/null
+++ b/storage/xtradb/lock/lock0iter.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0iter.c
+Lock queue iterator. Can iterate over table and record
+lock queues.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "univ.i"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "ut0dbg.h"
+#include "ut0lst.h"
+#ifdef UNIV_DEBUG
+# include "srv0srv.h" /* kernel_mutex */
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+ bit_no is calculated in this function by using
+ lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+ of a wait lock. */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+ lock_queue_iterator_t* iter, /*!< out: iterator */
+ const lock_t* lock, /*!< in: lock to start from */
+ ulint bit_no) /*!< in: record number in the
+ heap */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ iter->current_lock = lock;
+
+ if (bit_no != ULINT_UNDEFINED) {
+
+ iter->bit_no = bit_no;
+ } else {
+
+ switch (lock_get_type_low(lock)) {
+ case LOCK_TABLE:
+ iter->bit_no = ULINT_UNDEFINED;
+ break;
+ case LOCK_REC:
+ iter->bit_no = lock_rec_find_set_bit(lock);
+ ut_a(iter->bit_no != ULINT_UNDEFINED);
+ break;
+ default:
+ ut_error;
+ }
+ }
+}
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+moved back one position if a non-NULL lock is returned.
+@return previous lock or NULL */
+UNIV_INTERN
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+ lock_queue_iterator_t* iter) /*!< in/out: iterator */
+{
+ const lock_t* prev_lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ switch (lock_get_type_low(iter->current_lock)) {
+ case LOCK_REC:
+ prev_lock = lock_rec_get_prev(
+ iter->current_lock, iter->bit_no);
+ break;
+ case LOCK_TABLE:
+ prev_lock = UT_LIST_GET_PREV(
+ un_member.tab_lock.locks, iter->current_lock);
+ break;
+ default:
+ ut_error;
+ }
+
+ if (prev_lock != NULL) {
+
+ iter->current_lock = prev_lock;
+ }
+
+ return(prev_lock);
+}
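+
+/* Illustrative usage sketch (lock comes from the caller and the kernel
+mutex must be held; passing ULINT_UNDEFINED for bit_no is only valid for
+table locks and waiting record locks, as described above): to walk a lock
+queue backwards from a given lock:
+
+	lock_queue_iterator_t	iter;
+	const lock_t*		prev;
+
+	lock_queue_iterator_reset(&iter, lock, ULINT_UNDEFINED);
+
+	while ((prev = lock_queue_iterator_get_prev(&iter)) != NULL) {
+		... inspect prev ...
+	}
+*/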
diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.c
new file mode 100644
index 00000000000..1ded67d9147
--- /dev/null
+++ b/storage/xtradb/lock/lock0lock.c
@@ -0,0 +1,5773 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0lock.c
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+
+#ifdef UNIV_NONINL
+#include "lock0lock.ic"
+#include "lock0priv.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "usr0sess.h"
+#include "trx0purge.h"
+#include "dict0mem.h"
+#include "trx0sys.h"
+
+/* Restricts the length of search we will do in the waits-for
+graph of transactions */
+#define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000
+
+/* Restricts the recursion depth of the search we will do in the waits-for
+graph of transactions */
+#define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200
+
+/* When releasing transaction locks, this specifies how often we release
+the kernel mutex for a moment to give also others access to it */
+
+#define LOCK_RELEASE_KERNEL_INTERVAL 1000
+
+/* Safety margin when creating a new record lock: this many extra records
+can be inserted to the page without need to create a lock with a bigger
+bitmap */
+
+#define LOCK_PAGE_BITMAP_MARGIN 64
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again and retrieve the same result set, provided it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or newly inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+ There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated, but the lock prevents insertion of a user record at the end of
+the page.
+ Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+ What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock to the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should also see the newly inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the newly inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+	We could have the convention that granted explicit record locks
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+ How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+ A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX
+ * IS IX S X AI
+ * IS + + + - +
+ * IX + + - - +
+ * S + - + - -
+ * X - - - - -
+ * AI + + - - -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+#define LK(a,b) (1 << ((a) * LOCK_NUM + (b)))
+#define LKS(a,b) LK(a,b) | LK(b,a)
+
+/* Define the lock compatibility matrix in a ulint. The first line below
+defines the diagonal entries. The following lines define the compatibility
+for LOCK_IX, LOCK_S, and LOCK_AUTO_INC using LKS(), since the matrix
+is symmetric. */
+#define LOCK_MODE_COMPATIBILITY 0 \
+ | LK(LOCK_IS, LOCK_IS) | LK(LOCK_IX, LOCK_IX) | LK(LOCK_S, LOCK_S) \
+ | LKS(LOCK_IX, LOCK_IS) | LKS(LOCK_IS, LOCK_AUTO_INC) \
+ | LKS(LOCK_S, LOCK_IS) \
+ | LKS(LOCK_AUTO_INC, LOCK_IS) | LKS(LOCK_AUTO_INC, LOCK_IX)
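+
+/* Illustrative only (not used by the code): a worked expansion of the
+macros above, assuming the enum ordering declared in lock0lock.h
+(LOCK_IS = 0, LOCK_IX = 1, LOCK_S = 2, LOCK_X = 3, LOCK_AUTO_INC = 4,
+LOCK_NUM = 5):
+
+  LK(LOCK_IS, LOCK_IX) == 1 << (0 * 5 + 1) == 1 << 1
+    -> this bit IS set above (via LKS(LOCK_IX, LOCK_IS)), so
+       lock_mode_compatible(LOCK_IS, LOCK_IX) is nonzero.
+  LK(LOCK_S, LOCK_X)   == 1 << (2 * 5 + 3) == 1 << 13
+    -> this bit is NOT set above, so
+       lock_mode_compatible(LOCK_S, LOCK_X) is 0 (conflict),
+matching the '+' and '-' entries of the compatibility matrix. */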
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ * IS IX S X AI
+ * IS + - - - -
+ * IX + + - - -
+ * S + - + - -
+ * X + + + + +
+ * AI - - - - +
+ * See lock_mode_stronger_or_eq().
+ */
+
+/* Define the stronger-or-equal lock relation in a ulint. This relation
+contains all pairs LK(mode1, mode2) where mode1 is stronger than or
+equal to mode2. */
+#define LOCK_MODE_STRONGER_OR_EQ 0 \
+ | LK(LOCK_IS, LOCK_IS) \
+ | LK(LOCK_IX, LOCK_IS) | LK(LOCK_IX, LOCK_IX) \
+ | LK(LOCK_S, LOCK_IS) | LK(LOCK_S, LOCK_S) \
+ | LK(LOCK_AUTO_INC, LOCK_AUTO_INC) \
+ | LK(LOCK_X, LOCK_IS) | LK(LOCK_X, LOCK_IX) | LK(LOCK_X, LOCK_S) \
+ | LK(LOCK_X, LOCK_AUTO_INC) | LK(LOCK_X, LOCK_X)
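+
+/* Illustrative only: with the same assumed enum values as above,
+LK(LOCK_X, LOCK_S) == 1 << (3 * 5 + 2) == 1 << 17 is set, so
+lock_mode_stronger_or_eq(LOCK_X, LOCK_S) is nonzero (X covers S),
+while LK(LOCK_S, LOCK_X) == 1 << 13 is not set, so S is not stronger
+than or equal to X. */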
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool lock_print_waits = FALSE;
+
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+ibool
+lock_validate(void);
+/*===============*/
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no);/*!< in: page number */
+#endif /* UNIV_DEBUG */
+
+/* The lock system */
+UNIV_INTERN lock_sys_t* lock_sys = NULL;
+
+/* We store info on the latest deadlock error in this buffer. InnoDB
+Monitor will then fetch it and print it */
+UNIV_INTERN ibool lock_deadlock_found = FALSE;
+UNIV_INTERN FILE* lock_latest_err_file;
+
+/* Flags for recursive deadlock search */
+#define LOCK_VICTIM_IS_START 1
+#define LOCK_VICTIM_IS_OTHER 2
+#define LOCK_EXCEED_MAX_DEPTH 3
+
+/********************************************************************//**
+Checks if a lock request results in a deadlock.
+@return TRUE if a deadlock was detected and we chose trx as a victim;
+FALSE if no deadlock, or there was a deadlock, but we chose other
+transaction(s) as victim(s) */
+static
+ibool
+lock_deadlock_occurs(
+/*=================*/
+ lock_t* lock, /*!< in: lock the transaction is requesting */
+ trx_t* trx); /*!< in: transaction */
+/********************************************************************//**
+Looks recursively for a deadlock.
+@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
+deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
+deadlock was found and we chose some other trx as a victim: we must do
+the search again in this last case because there may be another
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+static
+ulint
+lock_deadlock_recursive(
+/*====================*/
+ trx_t* start, /*!< in: recursion starting point */
+ trx_t* trx, /*!< in: a transaction waiting for a lock */
+ lock_t* wait_lock, /*!< in: lock that is waiting to be granted */
+ ulint* cost, /*!< in/out: number of calculation steps thus
+ far: if this exceeds LOCK_MAX_N_STEPS_...
+ we return LOCK_EXCEED_MAX_DEPTH */
+ ulint depth); /*!< in: recursion depth: if this exceeds
+ LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
+ return LOCK_EXCEED_MAX_DEPTH */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return TRUE if the nth bit is set; FALSE if it is not set or if i is out
+of range (for example, i == ULINT_UNDEFINED) */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+ const lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ if (i >= lock->un_member.rec_lock.n_bits) {
+
+ return(FALSE);
+ }
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+ return(1 & ((const byte*) &lock[1])[byte_index] >> bit_index);
+}
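+
+/* Illustrative only: the lock bitmap is stored immediately after the
+lock_t struct (at &lock[1]), one bit per heap number. For example, for
+heap_no i == 10 we get byte_index == 1 and bit_index == 2, i.e. the
+function above tests bit 2 of the second bitmap byte. */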
+
+/*************************************************************************/
+
+#define lock_mutex_enter_kernel() mutex_enter(&kernel_mutex)
+#define lock_mutex_exit_kernel() mutex_exit(&kernel_mutex)
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+lock_check_trx_id_sanity(
+/*=====================*/
+ trx_id_t trx_id, /*!< in: trx id */
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ibool has_kernel_mutex)/*!< in: TRUE if the caller owns the
+ kernel mutex */
+{
+ ibool is_ok = TRUE;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!has_kernel_mutex) {
+ mutex_enter(&kernel_mutex);
+ }
+
+ /* A sanity check: the trx_id in rec must be smaller than the global
+ trx id counter */
+
+ if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: transaction id associated"
+ " with record\n",
+ stderr);
+ rec_print_new(stderr, rec, offsets);
+ fputs("InnoDB: in ", stderr);
+ dict_index_name_print(stderr, NULL, index);
+ fprintf(stderr, "\n"
+ "InnoDB: is " TRX_ID_FMT " which is higher than the"
+ " global trx id counter " TRX_ID_FMT "!\n"
+ "InnoDB: The table is corrupt. You have to do"
+ " dump + drop + reimport.\n",
+ TRX_ID_PREP_PRINTF(trx_id),
+ TRX_ID_PREP_PRINTF(trx_sys->max_trx_id));
+
+ is_ok = FALSE;
+ }
+
+ if (!has_kernel_mutex) {
+ mutex_exit(&kernel_mutex);
+ }
+
+ return(is_ok);
+}
+
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return TRUE if sees, or FALSE if an earlier version of the record
+should be retrieved */
+UNIV_INTERN
+ibool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+ const rec_t* rec, /*!< in: user record which should be read or
+ passed over by a read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ read_view_t* view) /*!< in: consistent read view */
+{
+ trx_id_t trx_id;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* NOTE that we call this function while holding the search
+ system latch. To obey the latching order we must NOT reserve the
+ kernel mutex here! */
+
+ trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ return(read_view_sees_trx_id(view, trx_id));
+}
+
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that even when FALSE is returned, the present version of
+rec may still be the right one; this must be checked from the clustered
+index record.
+
+@return TRUE if certainly sees, or FALSE if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+ulint
+lock_sec_rec_cons_read_sees(
+/*========================*/
+ const rec_t* rec, /*!< in: user record which
+ should be read or passed over
+ by a read cursor */
+ const read_view_t* view) /*!< in: consistent read view */
+{
+ trx_id_t max_trx_id;
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ /* NOTE that we might call this function while holding the search
+ system latch. To obey the latching order we must NOT reserve the
+ kernel mutex here! */
+
+ if (recv_recovery_is_on()) {
+
+ return(FALSE);
+ }
+
+ max_trx_id = page_get_max_trx_id(page_align(rec));
+ ut_ad(!ut_dulint_is_zero(max_trx_id));
+
+ return(ut_dulint_cmp(max_trx_id, view->up_limit_id) < 0);
+}
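+
+/* Illustrative only: the test above succeeds when every transaction id
+that can appear on this secondary index page is < view->up_limit_id,
+i.e. read_view_sees_trx_id() would return TRUE for all of them, so the
+current version of the record is certainly visible; otherwise the
+clustered index record (and possibly its old versions) must be checked. */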
+
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+ ulint n_cells) /*!< in: number of slots in lock hash table */
+{
+ lock_sys = mem_alloc(sizeof(lock_sys_t));
+
+ lock_sys->rec_hash = hash_create(n_cells);
+
+ /* hash_create_mutexes(lock_sys->rec_hash, 2, SYNC_REC_LOCK); */
+
+ lock_latest_err_file = os_file_create_tmpfile();
+ ut_a(lock_latest_err_file);
+}
+
+/*********************************************************************//**
+Closes the lock system at database shutdown. */
+UNIV_INTERN
+void
+lock_sys_close(void)
+/*================*/
+{
+ if (lock_latest_err_file != NULL) {
+ fclose(lock_latest_err_file);
+ lock_latest_err_file = NULL;
+ }
+
+ hash_table_free(lock_sys->rec_hash);
+ mem_free(lock_sys);
+ lock_sys = NULL;
+}
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return size in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void)
+/*===============*/
+{
+ return((ulint)sizeof(lock_t));
+}
+
+/*********************************************************************//**
+Gets the mode of a lock.
+@return mode */
+UNIV_INLINE
+enum lock_mode
+lock_get_mode(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ return(lock->type_mode & LOCK_MODE_MASK);
+}
+
+/*********************************************************************//**
+Gets the wait flag of a lock.
+@return TRUE if waiting */
+UNIV_INLINE
+ibool
+lock_get_wait(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_ad(lock);
+
+ if (UNIV_UNLIKELY(lock->type_mode & LOCK_WAIT)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction. The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* dest, /*!< in: destination of ALTER TABLE */
+ enum lock_mode* mode) /*!< out: lock mode of the source table */
+{
+ dict_table_t* src;
+ lock_t* lock;
+
+ src = NULL;
+ *mode = LOCK_NONE;
+
+ for (lock = UT_LIST_GET_FIRST(trx->trx_locks);
+ lock;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+ lock_table_t* tab_lock;
+ enum lock_mode lock_mode;
+ if (!(lock_get_type_low(lock) & LOCK_TABLE)) {
+ /* We are only interested in table locks. */
+ continue;
+ }
+ tab_lock = &lock->un_member.tab_lock;
+ if (dest == tab_lock->table) {
+ /* We are not interested in the destination table. */
+ continue;
+ } else if (!src) {
+ /* This presumably is the source table. */
+ src = tab_lock->table;
+ if (UT_LIST_GET_LEN(src->locks) != 1
+ || UT_LIST_GET_FIRST(src->locks) != lock) {
+ /* We only support the case when
+ there is only one lock on this table. */
+ return(NULL);
+ }
+ } else if (src != tab_lock->table) {
+ /* The transaction is locking more than
+ two tables (src and dest): abort */
+ return(NULL);
+ }
+
+ /* Check that the source table is locked by
+ LOCK_IX or LOCK_IS. */
+ lock_mode = lock_get_mode(lock);
+ if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) {
+ if (*mode != LOCK_NONE && *mode != lock_mode) {
+ /* There are multiple locks on src. */
+ return(NULL);
+ }
+ *mode = lock_mode;
+ }
+ }
+
+ if (!src) {
+ /* No source table lock found: flag the situation to caller */
+ src = dest;
+ }
+
+ return(src);
+}
+
+/*********************************************************************//**
+Determine if the given table is exclusively "owned" by the given
+transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC
+on the table.
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx) /*!< in: transaction */
+{
+ const lock_t* lock;
+ ibool ok = FALSE;
+
+ ut_ad(table);
+ ut_ad(trx);
+
+ lock_mutex_enter_kernel();
+
+ for (lock = UT_LIST_GET_FIRST(table->locks);
+ lock;
+ lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) {
+ if (lock->trx != trx) {
+ /* A lock on the table is held
+ by some other transaction. */
+ goto not_ok;
+ }
+
+ if (!(lock_get_type_low(lock) & LOCK_TABLE)) {
+ /* We are interested in table locks only. */
+ continue;
+ }
+
+ switch (lock_get_mode(lock)) {
+ case LOCK_IX:
+ ok = TRUE;
+ break;
+ case LOCK_AUTO_INC:
+ /* It is allowed for trx to hold an
+ auto_increment lock. */
+ break;
+ default:
+not_ok:
+ /* Other table locks than LOCK_IX are not allowed. */
+ ok = FALSE;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ lock_mutex_exit_kernel();
+
+ return(ok);
+}
+
+/*********************************************************************//**
+Sets the wait flag of a lock and the back pointer in trx to lock. */
+UNIV_INLINE
+void
+lock_set_lock_and_trx_wait(
+/*=======================*/
+ lock_t* lock, /*!< in: lock */
+ trx_t* trx) /*!< in: trx */
+{
+ ut_ad(lock);
+ ut_ad(trx->wait_lock == NULL);
+
+ trx->wait_lock = lock;
+ lock->type_mode |= LOCK_WAIT;
+}
+
+/**********************************************************************//**
+The back pointer to a waiting lock request in the transaction is set to NULL
+and the wait bit in lock type_mode is reset. */
+UNIV_INLINE
+void
+lock_reset_lock_and_trx_wait(
+/*=========================*/
+ lock_t* lock) /*!< in: record lock */
+{
+ ut_ad((lock->trx)->wait_lock == lock);
+ ut_ad(lock_get_wait(lock));
+
+ /* Reset the back pointer in trx to this waiting lock request */
+
+ (lock->trx)->wait_lock = NULL;
+ lock->type_mode &= ~LOCK_WAIT;
+}
+
+/*********************************************************************//**
+Gets the gap flag of a record lock.
+@return TRUE if gap flag set */
+UNIV_INLINE
+ibool
+lock_rec_get_gap(
+/*=============*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ if (lock->type_mode & LOCK_GAP) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the LOCK_REC_NOT_GAP flag of a record lock.
+@return TRUE if LOCK_REC_NOT_GAP flag set */
+UNIV_INLINE
+ibool
+lock_rec_get_rec_not_gap(
+/*=====================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ if (lock->type_mode & LOCK_REC_NOT_GAP) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the waiting insert flag of a record lock.
+@return TRUE if gap flag set */
+UNIV_INLINE
+ibool
+lock_rec_get_insert_intention(
+/*==========================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ if (lock->type_mode & LOCK_INSERT_INTENTION) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is stronger or equal to lock mode 2.
+@return nonzero if mode1 stronger or equal to mode2 */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
+ || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
+ ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
+ || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+
+ return((LOCK_MODE_STRONGER_OR_EQ) & LK(mode1, mode2));
+}
+
+/*********************************************************************//**
+Calculates if lock mode 1 is compatible with lock mode 2.
+@return nonzero if mode1 compatible with mode2 */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+ enum lock_mode mode1, /*!< in: lock mode */
+ enum lock_mode mode2) /*!< in: lock mode */
+{
+ ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
+ || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
+ ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
+ || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+
+ return((LOCK_MODE_COMPATIBILITY) & LK(mode1, mode2));
+}
+
+/*********************************************************************//**
+Checks if a lock request for a new lock has to wait for request lock2.
+@return TRUE if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+ibool
+lock_rec_has_to_wait(
+/*=================*/
+ const trx_t* trx, /*!< in: trx of new lock */
+ ulint type_mode,/*!< in: precise mode of the new lock
+ to set: LOCK_S or LOCK_X, possibly
+ ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION */
+ const lock_t* lock2, /*!< in: another record lock; NOTE that
+ it is assumed that this has a lock bit
+ set on the same record as in the new
+ lock we are setting */
+ ibool lock_is_on_supremum) /*!< in: TRUE if we are setting the
+ lock on the 'supremum' record of an
+ index page: we know then that the lock
+ request is really for a 'gap' type lock */
+{
+ ut_ad(trx && lock2);
+ ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+ if (trx != lock2->trx
+ && !lock_mode_compatible(LOCK_MODE_MASK & type_mode,
+ lock_get_mode(lock2))) {
+
+ /* We have somewhat complex rules when gap type record locks
+ cause waits */
+
+ if ((lock_is_on_supremum || (type_mode & LOCK_GAP))
+ && !(type_mode & LOCK_INSERT_INTENTION)) {
+
+ /* Gap type locks without LOCK_INSERT_INTENTION flag
+ do not need to wait for anything. This is because
+ different users can have conflicting lock types
+ on gaps. */
+
+ return(FALSE);
+ }
+
+ if (!(type_mode & LOCK_INSERT_INTENTION)
+ && lock_rec_get_gap(lock2)) {
+
+			/* Record lock (LOCK_ORDINARY or LOCK_REC_NOT_GAP)
+ does not need to wait for a gap type lock */
+
+ return(FALSE);
+ }
+
+ if ((type_mode & LOCK_GAP)
+ && lock_rec_get_rec_not_gap(lock2)) {
+
+ /* Lock on gap does not need to wait for
+ a LOCK_REC_NOT_GAP type lock */
+
+ return(FALSE);
+ }
+
+ if (lock_rec_get_insert_intention(lock2)) {
+
+ /* No lock request needs to wait for an insert
+ intention lock to be removed. This is ok since our
+ rules allow conflicting locks on gaps. This eliminates
+ a spurious deadlock caused by a next-key lock waiting
+ for an insert intention lock; when the insert
+ intention lock was granted, the insert deadlocked on
+ the waiting next-key lock.
+
+ Also, insert intention locks do not disturb each
+ other. */
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
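+
+/* Illustrative only, assuming different transactions and conflicting
+basic modes: if lock2 is a granted gap (or next-key) lock of trx T2,
+then a plain LOCK_X | LOCK_GAP request by T1 returns FALSE at the first
+branch above (gap-type requests without insert intention never wait),
+whereas a LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION request by T1 falls
+through all the branches and returns TRUE: an insert intention lock must
+wait for a gap lock held by another transaction. */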
+
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+ const lock_t* lock1, /*!< in: waiting lock */
+ const lock_t* lock2) /*!< in: another lock; NOTE that it is
+ assumed that this has a lock bit set
+ on the same record as in lock1 if the
+ locks are record locks */
+{
+ ut_ad(lock1 && lock2);
+
+ if (lock1->trx != lock2->trx
+ && !lock_mode_compatible(lock_get_mode(lock1),
+ lock_get_mode(lock2))) {
+ if (lock_get_type_low(lock1) == LOCK_REC) {
+ ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+ /* If this lock request is for a supremum record
+ then the second bit on the lock bitmap is set */
+
+ return(lock_rec_has_to_wait(lock1->trx,
+ lock1->type_mode, lock2,
+ lock_rec_get_nth_bit(
+ lock1, 1)));
+ }
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*============== RECORD LOCK BASIC FUNCTIONS ============================*/
+
+/*********************************************************************//**
+Gets the number of bits in a record lock bitmap.
+@return number of bits */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+ const lock_t* lock) /*!< in: record lock */
+{
+ return(lock->un_member.rec_lock.n_bits);
+}
+
+/**********************************************************************//**
+Sets the nth bit of a record lock to TRUE. */
+UNIV_INLINE
+void
+lock_rec_set_nth_bit(
+/*=================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+ ((byte*) &lock[1])[byte_index] |= 1 << bit_index;
+}
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+ const lock_t* lock) /*!< in: record lock with at least one bit set */
+{
+ ulint i;
+
+ for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
+
+ if (lock_rec_get_nth_bit(lock, i)) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Resets the nth bit of a record lock. */
+UNIV_INLINE
+void
+lock_rec_reset_nth_bit(
+/*===================*/
+ lock_t* lock, /*!< in: record lock */
+ ulint i) /*!< in: index of the bit which must be set to TRUE
+ when this function is called */
+{
+ ulint byte_index;
+ ulint bit_index;
+
+ ut_ad(lock);
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+ byte_index = i / 8;
+ bit_index = i % 8;
+
+ ((byte*) &lock[1])[byte_index] &= ~(1 << bit_index);
+}
+
+/*********************************************************************//**
+Gets the first or next record lock on a page.
+@return next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+ lock_t* lock) /*!< in: a record lock */
+{
+ ulint space;
+ ulint page_no;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ space = lock->un_member.rec_lock.space;
+ page_no = lock->un_member.rec_lock.page_no;
+
+ for (;;) {
+ lock = HASH_GET_NEXT(hash, lock);
+
+ if (!lock) {
+
+ break;
+ }
+
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)) {
+
+ break;
+ }
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the first record lock on a page, where the page is identified by its
+file address.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first_on_page_addr(
+/*============================*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = HASH_GET_FIRST(lock_sys->rec_hash,
+ lock_rec_hash(space, page_no));
+ while (lock) {
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)) {
+
+ break;
+ }
+
+ lock = HASH_GET_NEXT(hash, lock);
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Returns TRUE if there are explicit record locks on a page.
+@return TRUE if there are explicit record locks on the page */
+UNIV_INTERN
+ibool
+lock_rec_expl_exist_on_page(
+/*========================*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ ibool ret;
+
+ mutex_enter(&kernel_mutex);
+
+ if (lock_rec_get_first_on_page_addr(space, page_no)) {
+ ret = TRUE;
+ } else {
+ ret = FALSE;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(ret);
+}
+
+/*********************************************************************//**
+Gets the first record lock on a page, where the page is identified by a
+pointer to it.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first_on_page(
+/*=======================*/
+ const buf_block_t* block) /*!< in: buffer block */
+{
+ ulint hash;
+ lock_t* lock;
+ ulint space = buf_block_get_space(block);
+ ulint page_no = buf_block_get_page_no(block);
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ hash = buf_block_get_lock_hash_val(block);
+
+ lock = HASH_GET_FIRST(lock_sys->rec_hash, hash);
+
+ while (lock) {
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)) {
+
+ break;
+ }
+
+ lock = HASH_GET_NEXT(hash, lock);
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the next explicit lock request on a record.
+@return next lock, NULL if none exists or if heap_no == ULINT_UNDEFINED */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+ ulint heap_no,/*!< in: heap number of the record */
+ lock_t* lock) /*!< in: lock */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ do {
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock && !lock_rec_get_nth_bit(lock, heap_no));
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Gets the first explicit lock request on a record.
+@return first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first(
+/*===============*/
+ const buf_block_t* block, /*!< in: block containing the record */
+ ulint heap_no)/*!< in: heap number of the record */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ for (lock = lock_rec_get_first_on_page(block); lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ break;
+ }
+ }
+
+ return(lock);
+}
+
+/*********************************************************************//**
+Resets the record lock bitmap to zero. NOTE: does not touch the wait_lock
+pointer in the transaction! This function is used in lock object creation
+and resetting. */
+static
+void
+lock_rec_bitmap_reset(
+/*==================*/
+ lock_t* lock) /*!< in: record lock */
+{
+ ulint n_bytes;
+
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ /* Reset to zero the bitmap which resides immediately after the lock
+ struct */
+
+ n_bytes = lock_rec_get_n_bits(lock) / 8;
+
+ ut_ad((lock_rec_get_n_bits(lock) % 8) == 0);
+
+ memset(&lock[1], 0, n_bytes);
+}
+
+/*********************************************************************//**
+Copies a record lock to heap.
+@return copy of lock */
+static
+lock_t*
+lock_rec_copy(
+/*==========*/
+ const lock_t* lock, /*!< in: record lock */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint size;
+
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ size = sizeof(lock_t) + lock_rec_get_n_bits(lock) / 8;
+
+ return(mem_heap_dup(heap, lock, size));
+}
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+ const lock_t* in_lock,/*!< in: record lock */
+ ulint heap_no)/*!< in: heap number of the record */
+{
+ lock_t* lock;
+ ulint space;
+ ulint page_no;
+ lock_t* found_lock = NULL;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ for (;;) {
+ ut_ad(lock);
+
+ if (lock == in_lock) {
+
+ return(found_lock);
+ }
+
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+
+ found_lock = lock;
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+}
+
+/*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Checks if a transaction has the specified table lock, or stronger.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_has(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table */
+ enum lock_mode mode) /*!< in: lock mode */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ /* Look for stronger locks the same trx already has on the table */
+
+ lock = UT_LIST_GET_LAST(table->locks);
+
+ while (lock != NULL) {
+
+ if (lock->trx == trx
+ && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+
+ /* The same trx already has locked the table in
+ a mode stronger or equal to the mode given */
+
+ ut_ad(!lock_get_wait(lock));
+
+ return(lock);
+ }
+
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+ }
+
+ return(NULL);
+}
+
+/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Checks if a transaction has a GRANTED explicit lock on rec stronger or equal
+to precise_mode.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_has_expl(
+/*==============*/
+ ulint precise_mode,/*!< in: LOCK_S or LOCK_X
+ possibly ORed to LOCK_GAP or
+ LOCK_REC_NOT_GAP, for a
+ supremum record we regard this
+ always a gap type request */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+ || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+ ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ while (lock) {
+ if (lock->trx == trx
+ && lock_mode_stronger_or_eq(lock_get_mode(lock),
+ precise_mode & LOCK_MODE_MASK)
+ && !lock_get_wait(lock)
+ && (!lock_rec_get_rec_not_gap(lock)
+ || (precise_mode & LOCK_REC_NOT_GAP)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM)
+ && (!lock_rec_get_gap(lock)
+ || (precise_mode & LOCK_GAP)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM)
+ && (!lock_rec_get_insert_intention(lock))) {
+
+ return(lock);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+
+ return(NULL);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks if some other transaction has a lock request in the queue.
+@return lock or NULL */
+static
+lock_t*
+lock_rec_other_has_expl_req(
+/*========================*/
+ enum lock_mode mode, /*!< in: LOCK_S or LOCK_X */
+ ulint gap, /*!< in: LOCK_GAP if also gap
+ locks are taken into account,
+ or 0 if not */
+ ulint wait, /*!< in: LOCK_WAIT if also
+ waiting locks are taken into
+ account, or 0 if not */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ const trx_t* trx) /*!< in: transaction, or NULL if
+ requests by all transactions
+ are taken into account */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+ ut_ad(gap == 0 || gap == LOCK_GAP);
+ ut_ad(wait == 0 || wait == LOCK_WAIT);
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ while (lock) {
+ if (lock->trx != trx
+ && (gap
+ || !(lock_rec_get_gap(lock)
+ || heap_no == PAGE_HEAP_NO_SUPREMUM))
+ && (wait || !lock_get_wait(lock))
+ && lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+
+ return(lock);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+
+ return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks if some other transaction has a conflicting explicit lock request
+in the queue, so that we have to wait.
+@return lock or NULL */
+static
+lock_t*
+lock_rec_other_has_conflicting(
+/*===========================*/
+ enum lock_mode mode, /*!< in: LOCK_S or LOCK_X,
+ possibly ORed to LOCK_GAP or
+					LOCK_REC_NOT_GAP,
+ LOCK_INSERT_INTENTION */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ trx_t* trx) /*!< in: our transaction */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ if (UNIV_LIKELY_NULL(lock)) {
+ if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+
+ do {
+ if (lock_rec_has_to_wait(trx, mode, lock,
+ TRUE)) {
+ return(lock);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ } while (lock);
+ } else {
+
+ do {
+ if (lock_rec_has_to_wait(trx, mode, lock,
+ FALSE)) {
+ return(lock);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ } while (lock);
+ }
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Looks for a suitable type record lock struct by the same trx on the same page.
+This can be used to save space when a new record lock should be set on a page:
+no new struct is needed if a suitable old one is found.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_find_similar_on_page(
+/*==========================*/
+ ulint type_mode, /*!< in: lock type_mode field */
+ ulint heap_no, /*!< in: heap number of the record */
+ lock_t* lock, /*!< in: lock_rec_get_first_on_page() */
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ while (lock != NULL) {
+ if (lock->trx == trx
+ && lock->type_mode == type_mode
+ && lock_rec_get_n_bits(lock) > heap_no) {
+
+ return(lock);
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a secondary
+index.
+@return transaction which has the x-lock, or NULL */
+static
+trx_t*
+lock_sec_rec_some_has_impl_off_kernel(
+/*==================================*/
+ const rec_t* rec, /*!< in: user record */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ const page_t* page = page_align(rec);
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* Some transaction may have an implicit x-lock on the record only
+ if the max trx id for the page >= min trx id for the trx list, or
+ database recovery is running. We do not write the changes of a page
+ max trx id to the log, and therefore during recovery, this value
+ for a page may be incorrect. */
+
+ if (!(ut_dulint_cmp(page_get_max_trx_id(page),
+ trx_list_get_min_trx_id()) >= 0)
+ && !recv_recovery_is_on()) {
+
+ return(NULL);
+ }
+
+ /* Ok, in this case it is possible that some transaction has an
+ implicit x-lock. We have to look in the clustered index. */
+
+ if (!lock_check_trx_id_sanity(page_get_max_trx_id(page),
+ rec, index, offsets, TRUE)) {
+ buf_page_print(page, 0);
+
+ /* The page is corrupt: try to avoid a crash by returning
+ NULL */
+ return(NULL);
+ }
+
+ return(row_vers_impl_x_locked_off_kernel(rec, index, offsets));
+}
+
+/*********************************************************************//**
+Returns the approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+ ulint n_records = 0;
+ ulint n_bits;
+ ulint n_bit;
+
+ lock = UT_LIST_GET_FIRST(trx->trx_locks);
+
+ while (lock) {
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ n_bits = lock_rec_get_n_bits(lock);
+
+ for (n_bit = 0; n_bit < n_bits; n_bit++) {
+ if (lock_rec_get_nth_bit(lock, n_bit)) {
+ n_records++;
+ }
+ }
+ }
+
+ lock = UT_LIST_GET_NEXT(trx_locks, lock);
+ }
+
+ return (n_records);
+}
+
+/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
+
+/*********************************************************************//**
+Creates a new record lock and inserts it into the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+@return created lock */
+static
+lock_t*
+lock_rec_create(
+/*============*/
+ ulint type_mode,/*!< in: lock mode and wait
+ flag, type is ignored and
+ replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+ ulint page_no;
+ ulint space;
+ ulint n_bits;
+ ulint n_bytes;
+ const page_t* page;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+ page = block->frame;
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ /* If rec is the supremum record, then we reset the gap and
+ LOCK_REC_NOT_GAP bits, as all locks on the supremum are
+ automatically of the gap type */
+
+ if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+ }
+
+ /* Make lock bitmap bigger by a safety margin */
+ n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
+ n_bytes = 1 + n_bits / 8;
+
+ lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes);
+
+ UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
+
+ lock->trx = trx;
+
+ lock->type_mode = (type_mode & ~LOCK_TYPE_MASK) | LOCK_REC;
+ lock->index = index;
+
+ lock->un_member.rec_lock.space = space;
+ lock->un_member.rec_lock.page_no = page_no;
+ lock->un_member.rec_lock.n_bits = n_bytes * 8;
+
+ /* Reset to zero the bitmap which resides immediately after the
+ lock struct */
+
+ lock_rec_bitmap_reset(lock);
+
+ /* Set the bit corresponding to rec */
+ lock_rec_set_nth_bit(lock, heap_no);
+
+ HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), lock);
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+
+ lock_set_lock_and_trx_wait(lock, trx);
+ }
+
+ return(lock);
+}
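+
+/* Illustrative only: if page_dir_get_n_heap(page) == 100, then
+n_bits == 100 + LOCK_PAGE_BITMAP_MARGIN == 164 and
+n_bytes == 1 + 164 / 8 == 21, so the lock struct is followed by a
+21-byte bitmap covering 168 heap numbers; about 68 more records can be
+inserted on the page before a lock with a bigger bitmap is needed. */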
+
+/*********************************************************************//**
+Enqueues a waiting request for a lock which cannot be granted immediately.
+Checks for deadlocks.
+@return DB_LOCK_WAIT, DB_DEADLOCK, DB_QUE_THR_SUSPENDED, or
+DB_SUCCESS_LOCKED_REC; DB_SUCCESS_LOCKED_REC means that
+there was a deadlock, but another transaction was chosen as the victim,
+and we got the lock immediately: no need to wait then */
+static
+enum db_err
+lock_rec_enqueue_waiting(
+/*=====================*/
+ ulint type_mode,/*!< in: lock mode this
+ transaction is requesting:
+ LOCK_S or LOCK_X, possibly
+ ORed with LOCK_GAP or
+ LOCK_REC_NOT_GAP, ORed with
+ LOCK_INSERT_INTENTION if this
+ waiting lock request is set
+ when performing an insert of
+ an index record */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ lock_t* lock;
+ trx_t* trx;
+ ulint sec;
+ ulint ms;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ /* Test if there already is some other reason to suspend thread:
+ we do not enqueue a lock request if the query thread should be
+ stopped anyway */
+
+ if (UNIV_UNLIKELY(que_thr_stop(thr))) {
+
+ ut_error;
+
+ return(DB_QUE_THR_SUSPENDED);
+ }
+
+ trx = thr_get_trx(thr);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: a record lock wait happens"
+ " in a dictionary operation!\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs(".\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ stderr);
+ }
+
+ /* Enqueue the lock request that will wait to be granted */
+ lock = lock_rec_create(type_mode | LOCK_WAIT,
+ block, heap_no, index, trx);
+
+ /* Check if a deadlock occurs: if yes, remove the lock request and
+ return an error code */
+
+ if (UNIV_UNLIKELY(lock_deadlock_occurs(lock, trx))) {
+
+ lock_reset_lock_and_trx_wait(lock);
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ return(DB_DEADLOCK);
+ }
+
+ /* If there was a deadlock but we chose another transaction as a
+ victim, it is possible that we already have the lock now granted! */
+
+ if (trx->wait_lock == NULL) {
+
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ trx->que_state = TRX_QUE_LOCK_WAIT;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ trx->wait_started = time(NULL);
+ if (innobase_get_slow_log() && trx->take_stats) {
+ ut_usectime(&sec, &ms);
+ trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
+ }
+
+ ut_a(que_thr_stop(thr));
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fprintf(stderr, "Lock wait for trx %lu in index ",
+ (ulong) ut_dulint_get_low(trx->id));
+ ut_print_name(stderr, trx, FALSE, index->name);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(DB_LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Adds a record lock request in the record queue. The request is normally
+added as the last in the queue, but if there are no waiting lock requests
+on the record, and the request to be added is not a waiting request, we
+can reuse a suitable record lock object already existing on the same page,
+just setting the appropriate bit in its bitmap. This is a low-level function
+which does NOT check for deadlocks or lock compatibility!
+@return lock where the bit was set */
+static
+lock_t*
+lock_rec_add_to_queue(
+/*==================*/
+ ulint type_mode,/*!< in: lock mode, wait, gap
+ etc. flags; type is ignored
+ and replaced by LOCK_REC */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of the record */
+ dict_index_t* index, /*!< in: index of record */
+ trx_t* trx) /*!< in: transaction */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+#ifdef UNIV_DEBUG
+ switch (type_mode & LOCK_MODE_MASK) {
+ case LOCK_X:
+ case LOCK_S:
+ break;
+ default:
+ ut_error;
+ }
+
+ if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) {
+ enum lock_mode mode = (type_mode & LOCK_MODE_MASK) == LOCK_S
+ ? LOCK_X
+ : LOCK_S;
+ lock_t* other_lock
+ = lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT,
+ block, heap_no, trx);
+ ut_a(!other_lock);
+ }
+#endif /* UNIV_DEBUG */
+
+ type_mode |= LOCK_REC;
+
+ /* If rec is the supremum record, then we can reset the gap bit, as
+ all locks on the supremum are automatically of the gap type, and we
+ try to avoid unnecessary memory consumption of a new record lock
+ struct for a gap type lock */
+
+ if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+ ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+ /* There should never be LOCK_REC_NOT_GAP on a supremum
+ record, but let us play safe */
+
+ type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+ }
+
+ /* Look for a waiting lock request on the same record or on a gap */
+
+ lock = lock_rec_get_first_on_page(block);
+
+ while (lock != NULL) {
+ if (lock_get_wait(lock)
+ && (lock_rec_get_nth_bit(lock, heap_no))) {
+
+ goto somebody_waits;
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+
+ if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) {
+
+ /* Look for a similar record lock on the same page:
+ if one is found and there are no waiting lock requests,
+ we can just set the bit */
+
+ lock = lock_rec_find_similar_on_page(
+ type_mode, heap_no,
+ lock_rec_get_first_on_page(block), trx);
+
+ if (lock) {
+
+ lock_rec_set_nth_bit(lock, heap_no);
+
+ return(lock);
+ }
+ }
+
+somebody_waits:
+ return(lock_rec_create(type_mode, block, heap_no, index, trx));
+}
+
+/** Record locking request status */
+enum lock_rec_req_status {
+ /** Failed to acquire a lock */
+ LOCK_REC_FAIL,
+ /** Succeeded in acquiring a lock (implicit or already acquired) */
+ LOCK_REC_SUCCESS,
+ /** Explicitly created a new lock */
+ LOCK_REC_SUCCESS_CREATED
+};
+
+/*********************************************************************//**
+This is a fast routine for locking a record in the most common cases:
+there are no explicit locks on the page, or there is just one lock, owned
+by this transaction, and of the right type_mode. This is a low-level function
+which does NOT look at implicit locks! Checks lock compatibility within
+explicit locks. This function sets a normal next-key lock, or in the case of
+a page supremum record, a gap type lock.
+@return whether the locking succeeded */
+UNIV_INLINE
+enum lock_rec_req_status
+lock_rec_lock_fast(
+/*===============*/
+ ibool impl, /*!< in: if TRUE, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ ulint mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ lock_t* lock;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+
+ lock = lock_rec_get_first_on_page(block);
+
+ trx = thr_get_trx(thr);
+
+ if (lock == NULL) {
+ if (!impl) {
+ lock_rec_create(mode, block, heap_no, index, trx);
+ }
+
+ return(LOCK_REC_SUCCESS_CREATED);
+ }
+
+ if (lock_rec_get_next_on_page(lock)) {
+
+ return(LOCK_REC_FAIL);
+ }
+
+ if (lock->trx != trx
+ || lock->type_mode != (mode | LOCK_REC)
+ || lock_rec_get_n_bits(lock) <= heap_no) {
+
+ return(LOCK_REC_FAIL);
+ }
+
+ if (!impl) {
+		/* If the nth bit of the record lock is already set, then we
+		do not set a new lock bit; otherwise we set it now */
+
+ if (!lock_rec_get_nth_bit(lock, heap_no)) {
+ lock_rec_set_nth_bit(lock, heap_no);
+ return(LOCK_REC_SUCCESS_CREATED);
+ }
+ }
+
+ return(LOCK_REC_SUCCESS);
+}
+
+/*********************************************************************//**
+This is the general, and slower, routine for locking a record. This is a
+low-level function which does NOT look at implicit locks! Checks lock
+compatibility within explicit locks. This function sets a normal next-key
+lock, or in the case of a page supremum record, a gap type lock.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+static
+enum db_err
+lock_rec_lock_slow(
+/*===============*/
+ ibool impl, /*!< in: if TRUE, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ ulint mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+
+ trx = thr_get_trx(thr);
+
+ if (lock_rec_has_expl(mode, block, heap_no, trx)) {
+ /* The trx already has a strong enough lock on rec: do
+ nothing */
+
+ } else if (lock_rec_other_has_conflicting(mode, block, heap_no, trx)) {
+
+		/* If another transaction has a non-gap conflicting request
+		in the queue, and this transaction does not already have a
+		sufficiently strong lock granted on the record, we have
+		to wait. */
+
+ return(lock_rec_enqueue_waiting(mode, block, heap_no,
+ index, thr));
+ } else if (!impl) {
+ /* Set the requested lock on the record */
+
+ lock_rec_add_to_queue(LOCK_REC | mode, block,
+ heap_no, index, trx);
+ return(DB_SUCCESS_LOCKED_REC);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Tries to lock the specified record in the mode requested. If not immediately
+possible, enqueues a waiting lock request. This is a low-level function
+which does NOT look at implicit locks! Checks lock compatibility within
+explicit locks. This function sets a normal next-key lock, or in the case
+of a page supremum record, a gap type lock.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+static
+enum db_err
+lock_rec_lock(
+/*==========*/
+ ibool impl, /*!< in: if TRUE, no lock is set
+ if no wait is necessary: we
+ assume that the caller will
+ set an implicit lock */
+ ulint mode, /*!< in: lock mode: LOCK_X or
+ LOCK_S possibly ORed to either
+ LOCK_GAP or LOCK_REC_NOT_GAP */
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no,/*!< in: heap number of record */
+ dict_index_t* index, /*!< in: index of record */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+ ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+ || (LOCK_MODE_MASK & mode) == LOCK_X);
+ ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+ || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP
+ || mode - (LOCK_MODE_MASK & mode) == 0);
+
+ /* We try a simplified and faster subroutine for the most
+ common cases */
+ switch (lock_rec_lock_fast(impl, mode, block, heap_no, index, thr)) {
+ case LOCK_REC_SUCCESS:
+ return(DB_SUCCESS);
+ case LOCK_REC_SUCCESS_CREATED:
+ return(DB_SUCCESS_LOCKED_REC);
+ case LOCK_REC_FAIL:
+ return(lock_rec_lock_slow(impl, mode, block,
+ heap_no, index, thr));
+ }
+
+ ut_error;
+ return(DB_ERROR);
+}
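+
+/* Illustrative sketch only (not compiled): a caller that already holds
+the kernel mutex and the required IX table lock could request a
+record-only X lock as follows; block, heap_no, index and thr are assumed
+to be set up by the caller. */
+#if 0
+	enum db_err	err;
+
+	err = lock_rec_lock(FALSE, LOCK_X | LOCK_REC_NOT_GAP,
+			    block, heap_no, index, thr);
+
+	switch (err) {
+	case DB_SUCCESS:
+	case DB_SUCCESS_LOCKED_REC:
+		/* granted, or an equivalent lock was already held */
+		break;
+	case DB_LOCK_WAIT:
+		/* a waiting request was enqueued: the caller must
+		suspend the query thread */
+		break;
+	default:
+		/* DB_DEADLOCK or DB_QUE_THR_SUSPENDED */
+		break;
+	}
+#endif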
+
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in a queue.
+@return TRUE if still has to wait */
+static
+ibool
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+ lock_t* wait_lock) /*!< in: waiting record lock */
+{
+ lock_t* lock;
+ ulint space;
+ ulint page_no;
+ ulint heap_no;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_wait(wait_lock));
+ ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
+
+ space = wait_lock->un_member.rec_lock.space;
+ page_no = wait_lock->un_member.rec_lock.page_no;
+ heap_no = lock_rec_find_set_bit(wait_lock);
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ while (lock != wait_lock) {
+
+ if (lock_rec_get_nth_bit(lock, heap_no)
+ && lock_has_to_wait(wait_lock, lock)) {
+
+ return(TRUE);
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Grants a lock to a waiting lock request and releases the waiting
+transaction. */
+static
+void
+lock_grant(
+/*=======*/
+ lock_t* lock) /*!< in/out: waiting lock request */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ trx_t* trx = lock->trx;
+ dict_table_t* table = lock->un_member.tab_lock.table;
+
+ if (table->autoinc_trx == trx) {
+ fprintf(stderr,
+ "InnoDB: Error: trx already had"
+ " an AUTO-INC lock!\n");
+ } else {
+ table->autoinc_trx = trx;
+
+ ib_vector_push(trx->autoinc_locks, lock);
+ }
+ }
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fprintf(stderr, "Lock wait for trx %lu ends\n",
+ (ulong) ut_dulint_get_low(lock->trx->id));
+ }
+#endif /* UNIV_DEBUG */
+
+ /* If we are resolving a deadlock by choosing another transaction
+ as a victim, then our original transaction may not be in the
+ TRX_QUE_LOCK_WAIT state, and there is no need to end the lock wait
+ for it */
+
+ if (lock->trx->que_state == TRX_QUE_LOCK_WAIT) {
+ trx_end_lock_wait(lock->trx);
+ }
+}
+
+/*************************************************************//**
+Cancels a waiting record lock request and releases the waiting transaction
+that requested it. NOTE: does NOT check if waiting lock requests behind this
+one can now be granted! */
+static
+void
+lock_rec_cancel(
+/*============*/
+ lock_t* lock) /*!< in: waiting record lock request */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+ /* Reset the bit (there can be only one set bit) in the lock bitmap */
+ lock_rec_reset_nth_bit(lock, lock_rec_find_set_bit(lock));
+
+ /* Reset the wait flag and the back pointer to lock in trx */
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following function releases the trx from lock wait */
+
+ trx_end_lock_wait(lock->trx);
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue and
+grants locks to other transactions in the queue if they now are entitled
+to a lock. NOTE: all record locks contained in in_lock are removed. */
+static
+void
+lock_rec_dequeue_from_page(
+/*=======================*/
+ lock_t* in_lock)/*!< in: record lock object: all record locks which
+ are contained in this lock object are removed;
+ transactions waiting behind will get their lock
+ requests granted, if they are now qualified to it */
+{
+ ulint space;
+ ulint page_no;
+ lock_t* lock;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ trx = in_lock->trx;
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+
+ HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), in_lock);
+
+ UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock);
+
+ /* Check if waiting locks in the queue can now be granted: grant
+ locks if there are no conflicting locks ahead. */
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ while (lock != NULL) {
+ if (lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ lock_grant(lock);
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue. */
+static
+void
+lock_rec_discard(
+/*=============*/
+ lock_t* in_lock)/*!< in: record lock object: all record locks which
+ are contained in this lock object are removed */
+{
+ ulint space;
+ ulint page_no;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+ trx = in_lock->trx;
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+
+ HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+ lock_rec_fold(space, page_no), in_lock);
+
+ UT_LIST_REMOVE(trx_locks, trx->trx_locks, in_lock);
+}
+
+/*************************************************************//**
+Removes record lock objects set on an index page which is discarded. This
+function does not move locks or check for waiting locks; therefore the
+lock bitmaps must already be reset when this function is called. */
+static
+void
+lock_rec_free_all_from_discard_page(
+/*================================*/
+ const buf_block_t* block) /*!< in: page to be discarded */
+{
+ ulint space;
+ ulint page_no;
+ lock_t* lock;
+ lock_t* next_lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ space = buf_block_get_space(block);
+ page_no = buf_block_get_page_no(block);
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ while (lock != NULL) {
+ ut_ad(lock_rec_find_set_bit(lock) == ULINT_UNDEFINED);
+ ut_ad(!lock_get_wait(lock));
+
+ next_lock = lock_rec_get_next_on_page(lock);
+
+ lock_rec_discard(lock);
+
+ lock = next_lock;
+ }
+}
+
+/*============= RECORD LOCK MOVING AND INHERITING ===================*/
+
+/*************************************************************//**
+Resets the lock bits for a single record. Releases transactions waiting for
+lock requests here. */
+static
+void
+lock_rec_reset_and_release_wait(
+/*============================*/
+ const buf_block_t* block, /*!< in: buffer block containing
+ the record */
+ ulint heap_no)/*!< in: heap number of record */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ while (lock != NULL) {
+ if (lock_get_wait(lock)) {
+ lock_rec_cancel(lock);
+ } else {
+ lock_rec_reset_nth_bit(lock, heap_no);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+}
+
+/*************************************************************//**
+Makes a record inherit the locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of
+the other record. Also waiting lock requests on rec are inherited as
+GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap(
+/*====================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no) /*!< in: heap_no of the
+ donating record */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = lock_rec_get_first(block, heap_no);
+
+	/* If srv_locks_unsafe_for_binlog is TRUE or the session is using
+	the READ COMMITTED isolation level, we do not want locks set
+	by an UPDATE or a DELETE to be inherited as gap type locks. But we
+	DO want S-locks set by a consistency constraint to be inherited
+	even then. */
+
+ while (lock != NULL) {
+ if (!lock_rec_get_insert_intention(lock)
+ && !((srv_locks_unsafe_for_binlog
+ || lock->trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED)
+ && lock_get_mode(lock) == LOCK_X)) {
+
+ lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
+ | lock_get_mode(lock),
+ heir_block, heir_heap_no,
+ lock->index, lock->trx);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+}
+
+/*************************************************************//**
+Makes a record inherit the gap locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of the
+other record. Also waiting lock requests are inherited as GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap_if_gap_lock(
+/*================================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ ulint heir_heap_no, /*!< in: heap_no of
+ record which inherits */
+ ulint heap_no) /*!< in: heap_no of record
+ from which inherited;
+ does NOT reset the locks
+ on this record */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ while (lock != NULL) {
+ if (!lock_rec_get_insert_intention(lock)
+ && (heap_no == PAGE_HEAP_NO_SUPREMUM
+ || !lock_rec_get_rec_not_gap(lock))) {
+
+ lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
+ | lock_get_mode(lock),
+ block, heir_heap_no,
+ lock->index, lock->trx);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+}
+
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. */
+static
+void
+lock_rec_move(
+/*==========*/
+ const buf_block_t* receiver, /*!< in: buffer block containing
+ the receiving record */
+ const buf_block_t* donator, /*!< in: buffer block containing
+ the donating record */
+ ulint receiver_heap_no,/*!< in: heap_no of the record
+ which gets the locks; there
+ must be no lock requests
+ on it! */
+ ulint donator_heap_no)/*!< in: heap_no of the record
+ which gives the locks */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = lock_rec_get_first(donator, donator_heap_no);
+
+ ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL);
+
+ while (lock != NULL) {
+ const ulint type_mode = lock->type_mode;
+
+ lock_rec_reset_nth_bit(lock, donator_heap_no);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+		/* Note that we FIRST reset the bit, and then set the lock:
+		the function also works if donator == receiver */
+
+ lock_rec_add_to_queue(type_mode, receiver, receiver_heap_no,
+ lock->index, lock->trx);
+ lock = lock_rec_get_next(donator_heap_no, lock);
+ }
+
+ ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL);
+}
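+
+/* Illustrative sketch only (not compiled): because the bit is reset
+before the lock is re-added, lock_rec_move() also works when the donating
+and receiving records are on the same block, as in the "store on page
+infimum" trick used further below; block and heap_no are assumed valid
+and the kernel mutex held. */
+#if 0
+	lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
+#endif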
+
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we also
+copy the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+ const buf_block_t* block, /*!< in: old index page, now
+ reorganized */
+ const buf_block_t* oblock) /*!< in: copy of the old, not
+ reorganized page */
+{
+ lock_t* lock;
+ UT_LIST_BASE_NODE_T(lock_t) old_locks;
+ mem_heap_t* heap = NULL;
+ ulint comp;
+
+ lock_mutex_enter_kernel();
+
+ lock = lock_rec_get_first_on_page(block);
+
+ if (lock == NULL) {
+ lock_mutex_exit_kernel();
+
+ return;
+ }
+
+ heap = mem_heap_create(256);
+
+ /* Copy first all the locks on the page to heap and reset the
+ bitmaps in the original locks; chain the copies of the locks
+ using the trx_locks field in them. */
+
+ UT_LIST_INIT(old_locks);
+
+ do {
+ /* Make a copy of the lock */
+ lock_t* old_lock = lock_rec_copy(lock, heap);
+
+ UT_LIST_ADD_LAST(trx_locks, old_locks, old_lock);
+
+ /* Reset bitmap of lock */
+ lock_rec_bitmap_reset(lock);
+
+ if (lock_get_wait(lock)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock != NULL);
+
+ comp = page_is_comp(block->frame);
+ ut_ad(comp == page_is_comp(oblock->frame));
+
+ for (lock = UT_LIST_GET_FIRST(old_locks); lock;
+ lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+		/* NOTE: we also copy the locks set on the infimum and
+		supremum of the page; the infimum may carry locks if an
+ update of a record is occurring on the page, and its locks
+ were temporarily stored on the infimum */
+ page_cur_t cur1;
+ page_cur_t cur2;
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_set_before_first(oblock, &cur2);
+
+ /* Set locks according to old locks */
+ for (;;) {
+ ulint old_heap_no;
+ ulint new_heap_no;
+
+ ut_ad(comp || !memcmp(page_cur_get_rec(&cur1),
+ page_cur_get_rec(&cur2),
+ rec_get_data_size_old(
+ page_cur_get_rec(
+ &cur2))));
+ if (UNIV_LIKELY(comp)) {
+ old_heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur2));
+ new_heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur1));
+ } else {
+ old_heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur2));
+ new_heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur1));
+ }
+
+ if (lock_rec_get_nth_bit(lock, old_heap_no)) {
+
+ /* Clear the bit in old_lock. */
+ ut_d(lock_rec_reset_nth_bit(lock,
+ old_heap_no));
+
+ /* NOTE that the old lock bitmap could be too
+ small for the new heap number! */
+
+ lock_rec_add_to_queue(lock->type_mode, block,
+ new_heap_no,
+ lock->index, lock->trx);
+
+ /* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM
+ && lock_get_wait(lock)) {
+ fprintf(stderr,
+ "---\n--\n!!!Lock reorg: supr type %lu\n",
+ lock->type_mode);
+ } */
+ }
+
+ if (UNIV_UNLIKELY
+ (new_heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+
+ ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM);
+ break;
+ }
+
+ page_cur_move_to_next(&cur1);
+ page_cur_move_to_next(&cur2);
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint i = lock_rec_find_set_bit(lock);
+
+ /* Check that all locks were moved. */
+ if (UNIV_UNLIKELY(i != ULINT_UNDEFINED)) {
+ fprintf(stderr,
+ "lock_move_reorganize_page():"
+ " %lu not moved in %p\n",
+ (ulong) i, (void*) lock);
+ ut_error;
+ }
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ lock_mutex_exit_kernel();
+
+ mem_heap_free(heap);
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block)));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec) /*!< in: record on page: this
+ is the first record moved */
+{
+ lock_t* lock;
+ const ulint comp = page_rec_is_comp(rec);
+
+ lock_mutex_enter_kernel();
+
+ /* Note: when we move locks from record to record, waiting locks
+ and possible granted gap type locks behind them are enqueued in
+	the original order, because new elements are inserted at the end
+	of the hash chain, and lock_rec_add_to_queue
+ does not reuse locks if there are waiters in the queue. */
+
+ for (lock = lock_rec_get_first_on_page(block); lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ page_cur_t cur1;
+ page_cur_t cur2;
+ const ulint type_mode = lock->type_mode;
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+ page_cur_move_to_next(&cur1);
+ }
+
+ page_cur_set_before_first(new_block, &cur2);
+ page_cur_move_to_next(&cur2);
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ while (!page_cur_is_after_last(&cur1)) {
+ ulint heap_no;
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur1));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur1));
+ ut_ad(!memcmp(page_cur_get_rec(&cur1),
+ page_cur_get_rec(&cur2),
+ rec_get_data_size_old(
+ page_cur_get_rec(&cur2))));
+ }
+
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur2));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur2));
+ }
+
+ lock_rec_add_to_queue(type_mode,
+ new_block, heap_no,
+ lock->index, lock->trx);
+ }
+
+ page_cur_move_to_next(&cur1);
+ page_cur_move_to_next(&cur2);
+ }
+ }
+
+ lock_mutex_exit_kernel();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block)));
+ ut_ad(lock_rec_validate_page(buf_block_get_space(new_block),
+				     buf_block_get_zip_size(new_block),
+ buf_block_get_page_no(new_block)));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+ const buf_block_t* new_block, /*!< in: index page to move to */
+ const buf_block_t* block, /*!< in: index page */
+ const rec_t* rec, /*!< in: record on page:
+ this is the first
+ record NOT copied */
+ const rec_t* old_end) /*!< in: old
+ previous-to-last
+ record on new_page
+ before the records
+ were copied */
+{
+ lock_t* lock;
+ const ulint comp = page_rec_is_comp(rec);
+
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(new_block->frame == page_align(old_end));
+
+ lock_mutex_enter_kernel();
+
+ for (lock = lock_rec_get_first_on_page(block); lock;
+ lock = lock_rec_get_next_on_page(lock)) {
+ page_cur_t cur1;
+ page_cur_t cur2;
+ const ulint type_mode = lock->type_mode;
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ page_cur_position(old_end, new_block, &cur2);
+ page_cur_move_to_next(&cur2);
+
+ /* Copy lock requests on user records to new page and
+ reset the lock bits on the old */
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ ulint heap_no;
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur1));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur1));
+ ut_ad(!memcmp(page_cur_get_rec(&cur1),
+ page_cur_get_rec(&cur2),
+ rec_get_data_size_old(
+ page_cur_get_rec(
+ &cur2))));
+ }
+
+ if (lock_rec_get_nth_bit(lock, heap_no)) {
+ lock_rec_reset_nth_bit(lock, heap_no);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+ lock_reset_lock_and_trx_wait(lock);
+ }
+
+ if (comp) {
+ heap_no = rec_get_heap_no_new(
+ page_cur_get_rec(&cur2));
+ } else {
+ heap_no = rec_get_heap_no_old(
+ page_cur_get_rec(&cur2));
+ }
+
+ lock_rec_add_to_queue(type_mode,
+ new_block, heap_no,
+ lock->index, lock->trx);
+ }
+
+ page_cur_move_to_next(&cur1);
+ page_cur_move_to_next(&cur2);
+ }
+
+#ifdef UNIV_DEBUG
+ if (page_rec_is_supremum(rec)) {
+ ulint i;
+
+ for (i = PAGE_HEAP_NO_USER_LOW;
+ i < lock_rec_get_n_bits(lock); i++) {
+ if (UNIV_UNLIKELY
+ (lock_rec_get_nth_bit(lock, i))) {
+
+ fprintf(stderr,
+ "lock_move_rec_list_start():"
+ " %lu not moved in %p\n",
+ (ulong) i, (void*) lock);
+ ut_error;
+ }
+ }
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ lock_mutex_exit_kernel();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+ ut_ad(lock_rec_validate_page(buf_block_get_space(block),
+ buf_block_get_zip_size(block),
+ buf_block_get_page_no(block)));
+#endif
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block) /*!< in: left page */
+{
+ ulint heap_no = lock_get_min_heap_no(right_block);
+
+ lock_mutex_enter_kernel();
+
+ /* Move the locks on the supremum of the left page to the supremum
+ of the right page */
+
+ lock_rec_move(right_block, left_block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+	/* Inherit the locks to the supremum of the left page from the
+	successor of the infimum on the right page */
+
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, heap_no);
+
+ lock_mutex_exit_kernel();
+}
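+
+/* Illustrative sketch only (not compiled) of the two steps above on a
+split to the right; the record names r1..r4 are hypothetical:
+
+	before:	left  [r1 r2 r3 r4 | supremum]
+	after:	left  [r1 r2 | supremum']    right [r3 r4 | supremum]
+
+The locks that were on the left supremum follow the moved records to the
+right supremum, and supremum' inherits gap locks from r3, the record with
+the minimum heap number on the right page. */
+#if 0
+	/* called by the B-tree split code once the upper half of the
+	records has been moved to right_block */
+	lock_update_split_right(right_block, left_block);
+#endif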
+
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+ const buf_block_t* right_block, /*!< in: right page to
+ which merged */
+ const rec_t* orig_succ, /*!< in: original
+ successor of infimum
+ on the right page
+ before merge */
+ const buf_block_t* left_block) /*!< in: merged index
+ page which will be
+ discarded */
+{
+ lock_mutex_enter_kernel();
+
+ /* Inherit the locks from the supremum of the left page to the
+ original successor of infimum on the right page, to which the left
+ page was merged */
+
+ lock_rec_inherit_to_gap(right_block, left_block,
+ page_rec_get_heap_no(orig_succ),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page, releasing
+ waiting transactions */
+
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+
+ lock_rec_free_all_from_discard_page(left_block);
+
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on pages other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+ const buf_block_t* block, /*!< in: index page to which copied */
+ const buf_block_t* root) /*!< in: root page */
+{
+ lock_mutex_enter_kernel();
+
+ /* Move the locks on the supremum of the root to the supremum
+ of block */
+
+ lock_rec_move(block, root,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if the page is the root! */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+ const buf_block_t* new_block, /*!< in: index page to
+ which copied */
+ const buf_block_t* block) /*!< in: index page;
+ NOT the root! */
+{
+ lock_mutex_enter_kernel();
+
+ /* Move the locks on the supremum of the old page to the supremum
+ of new_page */
+
+ lock_rec_move(new_block, block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+ lock_rec_free_all_from_discard_page(block);
+
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+ const buf_block_t* right_block, /*!< in: right page */
+ const buf_block_t* left_block) /*!< in: left page */
+{
+ ulint heap_no = lock_get_min_heap_no(right_block);
+
+ lock_mutex_enter_kernel();
+
+ /* Inherit the locks to the supremum of the left page from the
+ successor of the infimum on the right page */
+
+ lock_rec_inherit_to_gap(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, heap_no);
+
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+ const buf_block_t* left_block, /*!< in: left page to
+ which merged */
+ const rec_t* orig_pred, /*!< in: original predecessor
+ of supremum on the left page
+ before merge */
+ const buf_block_t* right_block) /*!< in: merged index page
+ which will be discarded */
+{
+ const rec_t* left_next_rec;
+
+ ut_ad(left_block->frame == page_align(orig_pred));
+
+ lock_mutex_enter_kernel();
+
+ left_next_rec = page_rec_get_next_const(orig_pred);
+
+ if (!page_rec_is_supremum(left_next_rec)) {
+
+ /* Inherit the locks on the supremum of the left page to the
+ first record which was moved from the right page */
+
+ lock_rec_inherit_to_gap(left_block, left_block,
+ page_rec_get_heap_no(left_next_rec),
+ PAGE_HEAP_NO_SUPREMUM);
+
+ /* Reset the locks on the supremum of the left page,
+ releasing waiting transactions */
+
+ lock_rec_reset_and_release_wait(left_block,
+ PAGE_HEAP_NO_SUPREMUM);
+ }
+
+ /* Move the locks from the supremum of right page to the supremum
+ of the left page */
+
+ lock_rec_move(left_block, right_block,
+ PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+ lock_rec_free_all_from_discard_page(right_block);
+
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+ const buf_block_t* heir_block, /*!< in: block containing the
+ record which inherits */
+ const buf_block_t* block, /*!< in: block containing the
+ record from which inherited;
+ does NOT reset the locks on
+ this record */
+ ulint heir_heap_no, /*!< in: heap_no of the
+ inheriting record */
+ ulint heap_no) /*!< in: heap_no of the
+ donating record */
+{
+ mutex_enter(&kernel_mutex);
+
+ lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
+
+ lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+ const buf_block_t* heir_block, /*!< in: index page
+ which will inherit the locks */
+ ulint heir_heap_no, /*!< in: heap_no of the record
+ which will inherit the locks */
+ const buf_block_t* block) /*!< in: index page
+ which will be discarded */
+{
+ const page_t* page = block->frame;
+ const rec_t* rec;
+ ulint heap_no;
+
+ lock_mutex_enter_kernel();
+
+ if (!lock_rec_get_first_on_page(block)) {
+ /* No locks exist on page, nothing to do */
+
+ lock_mutex_exit_kernel();
+
+ return;
+ }
+
+ /* Inherit all the locks on the page to the record and reset all
+ the locks on the page */
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+ do {
+ heap_no = rec_get_heap_no_new(rec);
+
+ lock_rec_inherit_to_gap(heir_block, block,
+ heir_heap_no, heap_no);
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ } while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ do {
+ heap_no = rec_get_heap_no_old(rec);
+
+ lock_rec_inherit_to_gap(heir_block, block,
+ heir_heap_no, heap_no);
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ } while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+ }
+
+ lock_rec_free_all_from_discard_page(block);
+
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: the inserted record */
+{
+ ulint receiver_heap_no;
+ ulint donator_heap_no;
+
+ ut_ad(block->frame == page_align(rec));
+
+ /* Inherit the gap-locking locks for rec, in gap mode, from the next
+ record */
+
+ if (page_rec_is_comp(rec)) {
+ receiver_heap_no = rec_get_heap_no_new(rec);
+ donator_heap_no = rec_get_heap_no_new(
+ page_rec_get_next_low(rec, TRUE));
+ } else {
+ receiver_heap_no = rec_get_heap_no_old(rec);
+ donator_heap_no = rec_get_heap_no_old(
+ page_rec_get_next_low(rec, FALSE));
+ }
+
+ lock_mutex_enter_kernel();
+ lock_rec_inherit_to_gap_if_gap_lock(block,
+ receiver_heap_no, donator_heap_no);
+ lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: the record to be removed */
+{
+ const page_t* page = block->frame;
+ ulint heap_no;
+ ulint next_heap_no;
+
+ ut_ad(page == page_align(rec));
+
+ if (page_is_comp(page)) {
+ heap_no = rec_get_heap_no_new(rec);
+ next_heap_no = rec_get_heap_no_new(page
+ + rec_get_next_offs(rec,
+ TRUE));
+ } else {
+ heap_no = rec_get_heap_no_old(rec);
+ next_heap_no = rec_get_heap_no_old(page
+ + rec_get_next_offs(rec,
+ FALSE));
+ }
+
+ lock_mutex_enter_kernel();
+
+ /* Let the next record inherit the locks from rec, in gap mode */
+
+ lock_rec_inherit_to_gap(block, block, next_heap_no, heap_no);
+
+ /* Reset the lock bits on rec and release waiting transactions */
+
+ lock_rec_reset_and_release_wait(block, heap_no);
+
+ lock_mutex_exit_kernel();
+}
+
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is moved in such an update, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec) /*!< in: record whose lock state
+ is stored on the infimum
+ record of the same page; lock
+ bits are reset on the
+ record */
+{
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ ut_ad(block->frame == page_align(rec));
+
+ lock_mutex_enter_kernel();
+
+ lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, heap_no);
+
+ lock_mutex_exit_kernel();
+}
+
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record whose lock state
+ is restored */
+ const buf_block_t* donator)/*!< in: page (rec is not
+ necessarily on this page)
+ whose infimum stored the lock
+ state; lock bits are reset on
+ the infimum */
+{
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter_kernel();
+
+ lock_rec_move(block, donator, heap_no, PAGE_HEAP_NO_INFIMUM);
+
+ lock_mutex_exit_kernel();
+}
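+
+/* Illustrative sketch only (not compiled): the store/restore pair as it
+might be used around a pessimistic update that deletes and re-inserts the
+record, possibly on another page; new_block and new_rec are hypothetical
+names for the re-inserted record and its block. */
+#if 0
+	lock_rec_store_on_page_infimum(block, rec);
+
+	/* ... the record is deleted and re-inserted as new_rec ... */
+
+	lock_rec_restore_from_page_infimum(new_block, new_rec, block);
+#endif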
+
+/*=========== DEADLOCK CHECKING ======================================*/
+
+/********************************************************************//**
+Checks if a lock request results in a deadlock.
+@return TRUE if a deadlock was detected and we chose trx as a victim;
+FALSE if no deadlock, or there was a deadlock, but we chose other
+transaction(s) as victim(s) */
+static
+ibool
+lock_deadlock_occurs(
+/*=================*/
+ lock_t* lock, /*!< in: lock the transaction is requesting */
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_t* mark_trx;
+ ulint ret;
+ ulint cost = 0;
+
+ ut_ad(trx);
+ ut_ad(lock);
+ ut_ad(mutex_own(&kernel_mutex));
+retry:
+ /* We check that adding this trx to the waits-for graph
+ does not produce a cycle. First mark all active transactions
+ with 0: */
+
+ mark_trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (mark_trx) {
+ mark_trx->deadlock_mark = 0;
+ mark_trx = UT_LIST_GET_NEXT(trx_list, mark_trx);
+ }
+
+ ret = lock_deadlock_recursive(trx, trx, lock, &cost, 0);
+
+ switch (ret) {
+ case LOCK_VICTIM_IS_OTHER:
+ /* We chose some other trx as a victim: retry if there still
+ is a deadlock */
+ goto retry;
+
+ case LOCK_EXCEED_MAX_DEPTH:
+		/* If the lock search exceeds the maximum number of steps
+		or the maximum depth, the current trx will be
+		the victim. Print its information. */
+ rewind(lock_latest_err_file);
+ ut_print_timestamp(lock_latest_err_file);
+
+ fputs("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+ " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+ " FOLLOWING TRANSACTION \n",
+ lock_latest_err_file);
+
+ fputs("\n*** TRANSACTION:\n", lock_latest_err_file);
+ trx_print(lock_latest_err_file, trx, 3000);
+
+ fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n",
+ lock_latest_err_file);
+
+ if (lock_get_type(lock) == LOCK_REC) {
+ lock_rec_print(lock_latest_err_file, lock);
+ } else {
+ lock_table_print(lock_latest_err_file, lock);
+ }
+ break;
+
+ case LOCK_VICTIM_IS_START:
+ srv_n_lock_deadlock_count++;
+ fputs("*** WE ROLL BACK TRANSACTION (2)\n",
+ lock_latest_err_file);
+ break;
+
+ default:
+		/* No deadlock detected */
+ return(FALSE);
+ }
+
+ lock_deadlock_found = TRUE;
+
+ return(TRUE);
+}
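+
+/* Illustrative sketch only (not compiled): the enqueue-time usage
+pattern, mirroring lock_table_enqueue_waiting() further below; table,
+mode and trx are assumed to be in scope and the kernel mutex held. The
+waiting lock is created first, and removed again if this transaction is
+chosen as the deadlock victim. */
+#if 0
+	lock = lock_table_create(table, mode | LOCK_WAIT, trx);
+
+	if (lock_deadlock_occurs(lock, trx)) {
+		lock_table_remove_low(lock);
+		lock_reset_lock_and_trx_wait(lock);
+
+		return(DB_DEADLOCK);
+	}
+#endif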
+
+/********************************************************************//**
+Looks recursively for a deadlock.
+@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
+deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
+deadlock was found and we chose some other trx as a victim: we must do
+the search again in this last case because there may be another
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+static
+ulint
+lock_deadlock_recursive(
+/*====================*/
+ trx_t* start, /*!< in: recursion starting point */
+ trx_t* trx, /*!< in: a transaction waiting for a lock */
+ lock_t* wait_lock, /*!< in: lock that is waiting to be granted */
+ ulint* cost, /*!< in/out: number of calculation steps thus
+ far: if this exceeds LOCK_MAX_N_STEPS_...
+ we return LOCK_EXCEED_MAX_DEPTH */
+ ulint depth) /*!< in: recursion depth: if this exceeds
+ LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
+ return LOCK_EXCEED_MAX_DEPTH */
+{
+ ulint ret;
+ lock_t* lock;
+ trx_t* lock_trx;
+ ulint heap_no = ULINT_UNDEFINED;
+
+ ut_a(trx);
+ ut_a(start);
+ ut_a(wait_lock);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (trx->deadlock_mark == 1) {
+ /* We have already exhaustively searched the subtree starting
+ from this trx */
+
+ return(0);
+ }
+
+ *cost = *cost + 1;
+
+ if (lock_get_type_low(wait_lock) == LOCK_REC) {
+ ulint space;
+ ulint page_no;
+
+ heap_no = lock_rec_find_set_bit(wait_lock);
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ space = wait_lock->un_member.rec_lock.space;
+ page_no = wait_lock->un_member.rec_lock.page_no;
+
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ /* Position the iterator on the first matching record lock. */
+ while (lock != NULL
+ && lock != wait_lock
+ && !lock_rec_get_nth_bit(lock, heap_no)) {
+
+ lock = lock_rec_get_next_on_page(lock);
+ }
+
+ if (lock == wait_lock) {
+ lock = NULL;
+ }
+
+ ut_ad(lock == NULL || lock_rec_get_nth_bit(lock, heap_no));
+
+ } else {
+ lock = wait_lock;
+ }
+
+ /* Look at the locks ahead of wait_lock in the lock queue */
+
+ for (;;) {
+ /* Get previous table lock. */
+ if (heap_no == ULINT_UNDEFINED) {
+
+ lock = UT_LIST_GET_PREV(
+ un_member.tab_lock.locks, lock);
+ }
+
+ if (lock == NULL) {
+ /* We can mark this subtree as searched */
+ trx->deadlock_mark = 1;
+
+ return(FALSE);
+ }
+
+ if (lock_has_to_wait(wait_lock, lock)) {
+
+ ibool too_far
+ = depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
+ || *cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK;
+
+ lock_trx = lock->trx;
+
+ if (lock_trx == start) {
+
+				/* We came back to the recursion starting
+				point: a deadlock has been detected */
+
+ FILE* ef = lock_latest_err_file;
+
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs("\n*** (1) TRANSACTION:\n", ef);
+
+ trx_print(ef, wait_lock->trx, 3000);
+
+ fputs("*** (1) WAITING FOR THIS LOCK"
+ " TO BE GRANTED:\n", ef);
+
+ if (lock_get_type_low(wait_lock) == LOCK_REC) {
+ lock_rec_print(ef, wait_lock);
+ } else {
+ lock_table_print(ef, wait_lock);
+ }
+
+ fputs("*** (2) TRANSACTION:\n", ef);
+
+ trx_print(ef, lock->trx, 3000);
+
+ fputs("*** (2) HOLDS THE LOCK(S):\n", ef);
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ lock_rec_print(ef, lock);
+ } else {
+ lock_table_print(ef, lock);
+ }
+
+ fputs("*** (2) WAITING FOR THIS LOCK"
+ " TO BE GRANTED:\n", ef);
+
+ if (lock_get_type_low(start->wait_lock)
+ == LOCK_REC) {
+ lock_rec_print(ef, start->wait_lock);
+ } else {
+ lock_table_print(ef, start->wait_lock);
+ }
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fputs("Deadlock detected\n",
+ stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (trx_weight_cmp(wait_lock->trx,
+ start) >= 0) {
+ /* Our recursion starting point
+ transaction is 'smaller', let us
+					choose 'start' as the victim and
+					roll it back */
+
+ return(LOCK_VICTIM_IS_START);
+ }
+
+ lock_deadlock_found = TRUE;
+
+ /* Let us choose the transaction of wait_lock
+ as a victim to try to avoid deadlocking our
+ recursion starting point transaction */
+
+ fputs("*** WE ROLL BACK TRANSACTION (1)\n",
+ ef);
+
+ wait_lock->trx->was_chosen_as_deadlock_victim
+ = TRUE;
+
+ lock_cancel_waiting_and_release(wait_lock);
+
+				/* Since trx and wait_lock are no longer
+				in the waits-for graph, we can return;
+				note that our selective algorithm can choose
+				several transactions as victims, but we may
+				still end up rolling back the recursion
+				starting point transaction as well! */
+
+ return(LOCK_VICTIM_IS_OTHER);
+ }
+
+ if (too_far) {
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fputs("Deadlock search exceeds"
+ " max steps or depth.\n",
+ stderr);
+ }
+#endif /* UNIV_DEBUG */
+ /* The information about transaction/lock
+ to be rolled back is available in the top
+ level. Do not print anything here. */
+ return(LOCK_EXCEED_MAX_DEPTH);
+ }
+
+ if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+				/* Another trx ahead has requested a lock
+				in an incompatible mode, and is itself
+				waiting for a lock */
+
+ ret = lock_deadlock_recursive(
+ start, lock_trx,
+ lock_trx->wait_lock, cost, depth + 1);
+
+ if (ret != 0) {
+
+ return(ret);
+ }
+ }
+ }
+ /* Get the next record lock to check. */
+ if (heap_no != ULINT_UNDEFINED) {
+
+ ut_a(lock != NULL);
+
+ do {
+ lock = lock_rec_get_next_on_page(lock);
+ } while (lock != NULL
+ && lock != wait_lock
+ && !lock_rec_get_nth_bit(lock, heap_no));
+
+ if (lock == wait_lock) {
+ lock = NULL;
+ }
+ }
+ }/* end of the 'for (;;)'-loop */
+}
+
+/*========================= TABLE LOCKS ==============================*/
+
+/*********************************************************************//**
+Creates a table lock object and adds it as the last in the lock queue
+of the table. Does NOT check for deadlocks or lock compatibility.
+@return own: new lock object */
+UNIV_INLINE
+lock_t*
+lock_table_create(
+/*==============*/
+ dict_table_t* table, /*!< in: database table in dictionary cache */
+ ulint type_mode,/*!< in: lock mode possibly ORed with
+ LOCK_WAIT */
+ trx_t* trx) /*!< in: trx */
+{
+ lock_t* lock;
+
+ ut_ad(table && trx);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
+ ++table->n_waiting_or_granted_auto_inc_locks;
+ }
+
+	/* For AUTOINC locking we reuse the lock instance only if
+	there is no wait involved; otherwise we allocate the waiting
+	lock from the transaction's lock heap. */
+ if (type_mode == LOCK_AUTO_INC) {
+
+ lock = table->autoinc_lock;
+
+ table->autoinc_trx = trx;
+
+ ib_vector_push(trx->autoinc_locks, lock);
+ } else {
+ lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t));
+ }
+
+ UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
+
+ lock->type_mode = type_mode | LOCK_TABLE;
+ lock->trx = trx;
+
+ lock->un_member.tab_lock.table = table;
+
+ UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock);
+
+ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+
+ lock_set_lock_and_trx_wait(lock, trx);
+ }
+
+ return(lock);
+}
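+
+/* Illustrative sketch only (not compiled): a granted AUTO-INC request
+reuses the single lock object pre-allocated in the table, whereas a
+waiting request (LOCK_AUTO_INC | LOCK_WAIT) is allocated from the
+transaction's lock heap; table and trx are assumed valid and the kernel
+mutex held. */
+#if 0
+	lock_t*	lock = lock_table_create(table, LOCK_AUTO_INC, trx);
+
+	ut_a(lock == table->autoinc_lock);
+#endif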
+
+/*************************************************************//**
+Removes a table lock request from the queue and the trx list of locks;
+this is a low-level function which does NOT check if waiting requests
+can now be granted. */
+UNIV_INLINE
+void
+lock_table_remove_low(
+/*==================*/
+ lock_t* lock) /*!< in: table lock */
+{
+ trx_t* trx;
+ dict_table_t* table;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx = lock->trx;
+ table = lock->un_member.tab_lock.table;
+
+	/* Remove the table from the transaction's AUTOINC vector, if
+	the lock that is being released is an AUTOINC lock. */
+ if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+
+ /* The table's AUTOINC lock can get transferred to
+ another transaction before we get here. */
+ if (table->autoinc_trx == trx) {
+ table->autoinc_trx = NULL;
+ }
+
+ /* The locks must be freed in the reverse order from
+ the one in which they were acquired. This is to avoid
+ traversing the AUTOINC lock vector unnecessarily.
+
+ We only store locks that were granted in the
+ trx->autoinc_locks vector (see lock_table_create()
+ and lock_grant()). Therefore it can be empty and we
+ need to check for that. */
+
+ if (!lock_get_wait(lock)
+ && !ib_vector_is_empty(trx->autoinc_locks)) {
+ lock_t* autoinc_lock;
+
+ autoinc_lock = ib_vector_pop(trx->autoinc_locks);
+ ut_a(autoinc_lock == lock);
+ }
+
+ ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
+ --table->n_waiting_or_granted_auto_inc_locks;
+ }
+
+ UT_LIST_REMOVE(trx_locks, trx->trx_locks, lock);
+ UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock);
+}
+
+/*********************************************************************//**
+Enqueues a waiting request for a table lock which cannot be granted
+immediately. Checks for deadlocks.
+@return DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED, or
+DB_SUCCESS; DB_SUCCESS means that there was a deadlock, but another
+transaction was chosen as a victim, and we got the lock immediately:
+no need to wait then */
+static
+ulint
+lock_table_enqueue_waiting(
+/*=======================*/
+ ulint mode, /*!< in: lock mode this transaction is
+ requesting */
+ dict_table_t* table, /*!< in: table */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ lock_t* lock;
+ trx_t* trx;
+ ulint sec;
+ ulint ms;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+	/* Test if there already is some other reason to suspend the thread:
+ we do not enqueue a lock request if the query thread should be
+ stopped anyway */
+
+ if (que_thr_stop(thr)) {
+ ut_error;
+
+ return(DB_QUE_THR_SUSPENDED);
+ }
+
+ trx = thr_get_trx(thr);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ break;
+ case TRX_DICT_OP_TABLE:
+ case TRX_DICT_OP_INDEX:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: a table lock wait happens"
+ " in a dictionary operation!\n"
+ "InnoDB: Table name ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(".\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n",
+ stderr);
+ }
+
+ /* Enqueue the lock request that will wait to be granted */
+
+ lock = lock_table_create(table, mode | LOCK_WAIT, trx);
+
+ /* Check if a deadlock occurs: if yes, remove the lock request and
+ return an error code */
+
+ if (lock_deadlock_occurs(lock, trx)) {
+
+		/* The order here is important: we don't want to
+		lose the state of the lock before calling remove. */
+ lock_table_remove_low(lock);
+ lock_reset_lock_and_trx_wait(lock);
+
+ return(DB_DEADLOCK);
+ }
+
+ if (trx->wait_lock == NULL) {
+ /* Deadlock resolution chose another transaction as a victim,
+ and we accidentally got our lock granted! */
+
+ return(DB_SUCCESS);
+ }
+
+ if (innobase_get_slow_log() && trx->take_stats) {
+ ut_usectime(&sec, &ms);
+ trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
+ }
+ trx->que_state = TRX_QUE_LOCK_WAIT;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ trx->wait_started = time(NULL);
+
+ ut_a(que_thr_stop(thr));
+
+ return(DB_LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Checks if other transactions have an incompatible mode lock request in
+the lock queue.
+@return lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction, or NULL if all
+ transactions should be included */
+ ulint wait, /*!< in: LOCK_WAIT if also waiting locks are
+ taken into account, or 0 if not */
+ dict_table_t* table, /*!< in: table */
+ enum lock_mode mode) /*!< in: lock mode */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = UT_LIST_GET_LAST(table->locks);
+
+ while (lock != NULL) {
+
+ if ((lock->trx != trx)
+ && (!lock_mode_compatible(lock_get_mode(lock), mode))
+ && (wait || !(lock_get_wait(lock)))) {
+
+ return(lock);
+ }
+
+ lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock);
+ }
+
+ return(NULL);
+}
+
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_table(
+/*=======*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+ does nothing */
+ dict_table_t* table, /*!< in: database table in dictionary cache */
+ enum lock_mode mode, /*!< in: lock mode */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ ulint err;
+
+ ut_ad(table && thr);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_a(flags == 0);
+
+ trx = thr_get_trx(thr);
+
+ lock_mutex_enter_kernel();
+
+	/* Look for equal or stronger locks that the same trx already has on the table */
+
+ if (lock_table_has(trx, table, mode)) {
+
+ lock_mutex_exit_kernel();
+
+ return(DB_SUCCESS);
+ }
+
+ /* We have to check if the new lock is compatible with any locks
+ other transactions have in the table lock queue. */
+
+ if (lock_table_other_has_incompatible(trx, LOCK_WAIT, table, mode)) {
+
+ /* Another trx has a request on the table in an incompatible
+ mode: this trx may have to wait */
+
+ err = lock_table_enqueue_waiting(mode | flags, table, thr);
+
+ lock_mutex_exit_kernel();
+
+ return(err);
+ }
+
+ lock_table_create(table, mode | flags, trx);
+
+ ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+
+ lock_mutex_exit_kernel();
+
+ return(DB_SUCCESS);
+}
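+
+/* Illustrative sketch only (not compiled): taking an intention-exclusive
+table lock before modifying rows; table and thr are assumed to be set up
+by the caller and BTR_NO_LOCKING_FLAG is not passed. */
+#if 0
+	ulint	err = lock_table(0, table, LOCK_IX, thr);
+
+	if (err == DB_LOCK_WAIT) {
+		/* a waiting request was enqueued: the caller must
+		suspend the query thread and retry after the wait */
+	}
+#endif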
+
+/*********************************************************************//**
+Checks if a waiting table lock request still has to wait in a queue.
+@return TRUE if still has to wait */
+static
+ibool
+lock_table_has_to_wait_in_queue(
+/*============================*/
+ lock_t* wait_lock) /*!< in: waiting table lock */
+{
+ dict_table_t* table;
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(lock_get_wait(wait_lock));
+
+ table = wait_lock->un_member.tab_lock.table;
+
+ lock = UT_LIST_GET_FIRST(table->locks);
+
+ while (lock != wait_lock) {
+
+ if (lock_has_to_wait(wait_lock, lock)) {
+
+ return(TRUE);
+ }
+
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Removes a table lock request, waiting or granted, from the queue and grants
+locks to other transactions in the queue, if they now are entitled to a
+lock. */
+static
+void
+lock_table_dequeue(
+/*===============*/
+ lock_t* in_lock)/*!< in: table lock object; transactions waiting
+ behind will get their lock requests granted, if
+ they are now qualified to it */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
+
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
+
+ lock_table_remove_low(in_lock);
+
+ /* Check if waiting locks in the queue can now be granted: grant
+ locks if there are no conflicting locks ahead. */
+
+ while (lock != NULL) {
+
+ if (lock_get_wait(lock)
+ && !lock_table_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ lock_grant(lock);
+ }
+
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
+ }
+}
+
+/*=========================== LOCK RELEASE ==============================*/
+
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+ trx_t* trx, /*!< in: transaction that has
+ set a record lock */
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record */
+ enum lock_mode lock_mode)/*!< in: LOCK_S or LOCK_X */
+{
+ lock_t* first_lock;
+ lock_t* lock;
+ ulint heap_no;
+
+ ut_ad(trx && rec);
+ ut_ad(block->frame == page_align(rec));
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ mutex_enter(&kernel_mutex);
+
+ first_lock = lock_rec_get_first(block, heap_no);
+
+	/* Find a lock on the record that matches the given
+	lock_mode and transaction. */
+
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (lock->trx == trx && lock_get_mode(lock) == lock_mode) {
+ ut_a(!lock_get_wait(lock));
+ lock_rec_reset_nth_bit(lock, heap_no);
+ goto released;
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: unlock row could not"
+ " find a %lu mode lock on the record\n",
+ (ulong) lock_mode);
+
+ return;
+
+released:
+ /* Check if we can now grant waiting lock requests */
+
+ for (lock = first_lock; lock != NULL;
+ lock = lock_rec_get_next(heap_no, lock)) {
+ if (lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ lock_grant(lock);
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+}
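+
+/* Illustrative sketch only (not compiled): releasing a record S lock
+that turned out not to be needed, without waiting for the transaction to
+commit; trx, block and rec are assumed valid. */
+#if 0
+	lock_rec_unlock(trx, block, rec, LOCK_S);
+#endif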
+
+/*********************************************************************//**
+Releases a transaction's locks, and releases any other transactions
+waiting because of these locks. */
+UNIV_INTERN
+void
+lock_release_off_kernel(
+/*====================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table;
+ ulint count;
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = UT_LIST_GET_LAST(trx->trx_locks);
+
+ count = 0;
+
+ while (lock != NULL) {
+
+ count++;
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ lock_rec_dequeue_from_page(lock);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ if (lock_get_mode(lock) != LOCK_IS
+ && !ut_dulint_is_zero(trx->undo_no)) {
+
+ /* The trx may have modified the table. We
+ block the use of the MySQL query cache for
+ all currently active transactions. */
+
+ table = lock->un_member.tab_lock.table;
+
+ table->query_cache_inv_trx_id
+ = trx_sys->max_trx_id;
+ }
+
+ lock_table_dequeue(lock);
+ }
+
+ if (count == LOCK_RELEASE_KERNEL_INTERVAL) {
+ /* Release the kernel mutex for a while, so that we
+ do not monopolize it */
+
+ lock_mutex_exit_kernel();
+
+ lock_mutex_enter_kernel();
+
+ count = 0;
+ }
+
+ lock = UT_LIST_GET_LAST(trx->trx_locks);
+ }
+
+ ut_a(ib_vector_size(trx->autoinc_locks) == 0);
+
+ mem_heap_empty(trx->lock_heap);
+}
+
+/*********************************************************************//**
+Cancels a waiting lock request and releases any other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+ lock_t* lock) /*!< in: waiting lock request */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+
+ lock_rec_dequeue_from_page(lock);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ if (lock->trx->autoinc_locks != NULL) {
+			/* Release the transaction's AUTOINC locks. */
+ lock_release_autoinc_locks(lock->trx);
+ }
+
+ lock_table_dequeue(lock);
+ }
+
+ /* Reset the wait flag and the back pointer to lock in trx */
+
+ lock_reset_lock_and_trx_wait(lock);
+
+ /* The following function releases the trx from lock wait */
+
+ trx_end_lock_wait(lock->trx);
+}
+
+/* True if a lock mode is S or X */
+#define IS_LOCK_S_OR_X(lock) \
+ (lock_get_mode(lock) == LOCK_S \
+ || lock_get_mode(lock) == LOCK_X)
+
+
+/*********************************************************************//**
+Removes locks of a transaction on a table to be dropped.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+static
+void
+lock_remove_all_on_table_for_trx(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to be dropped */
+ trx_t* trx, /*!< in: a transaction */
+ ibool remove_also_table_sx_locks)/*!< in: also removes
+ table S and X locks */
+{
+ lock_t* lock;
+ lock_t* prev_lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = UT_LIST_GET_LAST(trx->trx_locks);
+
+ while (lock != NULL) {
+ prev_lock = UT_LIST_GET_PREV(trx_locks, lock);
+
+ if (lock_get_type_low(lock) == LOCK_REC
+ && lock->index->table == table) {
+ ut_a(!lock_get_wait(lock));
+
+ lock_rec_discard(lock);
+ } else if (lock_get_type_low(lock) & LOCK_TABLE
+ && lock->un_member.tab_lock.table == table
+ && (remove_also_table_sx_locks
+ || !IS_LOCK_S_OR_X(lock))) {
+
+ ut_a(!lock_get_wait(lock));
+
+ lock_table_remove_low(lock);
+ }
+
+ lock = prev_lock;
+ }
+}
+
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock that is going to be removed is allowed to be a wait lock. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+ dict_table_t* table, /*!< in: table to be dropped
+ or truncated */
+ ibool remove_also_table_sx_locks)/*!< in: also removes
+ table S and X locks */
+{
+ lock_t* lock;
+ lock_t* prev_lock;
+
+ mutex_enter(&kernel_mutex);
+
+ lock = UT_LIST_GET_FIRST(table->locks);
+
+ while (lock != NULL) {
+
+ prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks,
+ lock);
+
+		/* If we should remove all locks (remove_also_table_sx_locks
+		is TRUE), or if the lock is not a table-level S or X lock,
+		then check that we are not going to remove a wait lock. */
+ if (remove_also_table_sx_locks
+ || !(lock_get_type(lock) == LOCK_TABLE
+ && IS_LOCK_S_OR_X(lock))) {
+
+ ut_a(!lock_get_wait(lock));
+ }
+
+ lock_remove_all_on_table_for_trx(table, lock->trx,
+ remove_also_table_sx_locks);
+
+ if (prev_lock == NULL) {
+ if (lock == UT_LIST_GET_FIRST(table->locks)) {
+ /* lock was not removed, pick its successor */
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, lock);
+ } else {
+ /* lock was removed, pick the first one */
+ lock = UT_LIST_GET_FIRST(table->locks);
+ }
+ } else if (UT_LIST_GET_NEXT(un_member.tab_lock.locks,
+ prev_lock) != lock) {
+ /* If lock was removed by
+ lock_remove_all_on_table_for_trx() then pick the
+ successor of prev_lock ... */
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, prev_lock);
+ } else {
+ /* ... otherwise pick the successor of lock. */
+ lock = UT_LIST_GET_NEXT(
+ un_member.tab_lock.locks, lock);
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*===================== VALIDATION AND DEBUGGING ====================*/
+
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock) /*!< in: table type lock */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_a(lock_get_type_low(lock) == LOCK_TABLE);
+
+ fputs("TABLE LOCK table ", file);
+ ut_print_name(file, lock->trx, TRUE,
+ lock->un_member.tab_lock.table->name);
+ fprintf(file, " trx id " TRX_ID_FMT,
+ TRX_ID_PREP_PRINTF(lock->trx->id));
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ fputs(" lock mode S", file);
+ } else if (lock_get_mode(lock) == LOCK_X) {
+ fputs(" lock mode X", file);
+ } else if (lock_get_mode(lock) == LOCK_IS) {
+ fputs(" lock mode IS", file);
+ } else if (lock_get_mode(lock) == LOCK_IX) {
+ fputs(" lock mode IX", file);
+ } else if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+ fputs(" lock mode AUTO-INC", file);
+ } else {
+ fprintf(file, " unknown lock mode %lu",
+ (ulong) lock_get_mode(lock));
+ }
+
+ if (lock_get_wait(lock)) {
+ fputs(" waiting", file);
+ }
+
+ putc('\n', file);
+}
+
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const lock_t* lock) /*!< in: record type lock */
+{
+ const buf_block_t* block;
+ ulint space;
+ ulint page_no;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ space = lock->un_member.rec_lock.space;
+ page_no = lock->un_member.rec_lock.page_no;
+
+ fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu ",
+ (ulong) space, (ulong) page_no,
+ (ulong) lock_rec_get_n_bits(lock));
+ dict_index_name_print(file, lock->trx, lock->index);
+ fprintf(file, " trx id " TRX_ID_FMT,
+ TRX_ID_PREP_PRINTF(lock->trx->id));
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ fputs(" lock mode S", file);
+ } else if (lock_get_mode(lock) == LOCK_X) {
+ fputs(" lock_mode X", file);
+ } else {
+ ut_error;
+ }
+
+ if (lock_rec_get_gap(lock)) {
+ fputs(" locks gap before rec", file);
+ }
+
+ if (lock_rec_get_rec_not_gap(lock)) {
+ fputs(" locks rec but not gap", file);
+ }
+
+ if (lock_rec_get_insert_intention(lock)) {
+ fputs(" insert intention", file);
+ }
+
+ if (lock_get_wait(lock)) {
+ fputs(" waiting", file);
+ }
+
+ mtr_start(&mtr);
+
+ putc('\n', file);
+
+	if (srv_show_verbose_locks) {
+ block = buf_page_try_get(space, page_no, &mtr);
+
+ for (i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+ if (!lock_rec_get_nth_bit(lock, i)) {
+ continue;
+ }
+
+ fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
+ if (block) {
+ const rec_t* rec;
+
+ rec = page_find_rec_with_heap_no(
+ buf_block_get_frame(block), i);
+
+ offsets = rec_get_offsets(
+ rec, lock->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ putc(' ', file);
+ rec_print_new(file, rec, offsets);
+ }
+
+ putc('\n', file);
+ }
+ }
+
+ mtr_commit(&mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+#ifdef UNIV_DEBUG
+/* Print the number of lock structs from lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Calculates the number of record lock structs in the record lock hash table.
+@return number of record locks */
+static
+ulint
+lock_get_n_rec_locks(void)
+/*======================*/
+{
+ lock_t* lock;
+ ulint n_locks = 0;
+ ulint i;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+
+ lock = HASH_GET_FIRST(lock_sys->rec_hash, i);
+
+ while (lock) {
+ n_locks++;
+
+ lock = HASH_GET_NEXT(hash, lock);
+ }
+ }
+
+ return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if the kernel mutex could not be obtained; in that case
+the function returns without printing any lock info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+ FILE* file, /*!< in: file where to print */
+	ibool	nowait)	/*!< in: TRUE if we should not block
+			waiting for the kernel mutex */
+{
+	/* If nowait is FALSE, wait on the kernel mutex;
+	otherwise return immediately if we fail to obtain
+	the mutex. */
+ if (!nowait) {
+ lock_mutex_enter_kernel();
+ } else if (mutex_enter_nowait(&kernel_mutex)) {
+ fputs("FAIL TO OBTAIN KERNEL MUTEX, "
+ "SKIP LOCK INFO PRINTING\n", file);
+ return(FALSE);
+ }
+
+ if (lock_deadlock_found) {
+ fputs("------------------------\n"
+ "LATEST DETECTED DEADLOCK\n"
+ "------------------------\n", file);
+
+ ut_copy_file(file, lock_latest_err_file);
+ }
+
+ fputs("------------\n"
+ "TRANSACTIONS\n"
+ "------------\n", file);
+
+ fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(trx_sys->max_trx_id));
+
+ fprintf(file,
+ "Purge done for trx's n:o < " TRX_ID_FMT
+ " undo n:o < " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(purge_sys->purge_trx_no),
+ TRX_ID_PREP_PRINTF(purge_sys->purge_undo_no));
+
+ fprintf(file,
+ "History list length %lu\n",
+ (ulong) trx_sys->rseg_history_len);
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+ fprintf(file,
+ "Total number of lock structs in row lock hash table %lu\n",
+ (ulong) lock_get_n_rec_locks());
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Prints info of locks for each transaction. */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+ FILE* file) /*!< in: file where to print */
+{
+ lock_t* lock;
+ ibool load_page_first = TRUE;
+ ulint nth_trx = 0;
+ ulint nth_lock = 0;
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx;
+
+ fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+ /* First print info on non-active transactions */
+
+ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+
+ while (trx) {
+ if (trx->conc_state == TRX_NOT_STARTED) {
+ fputs("---", file);
+ trx_print(file, trx, 600);
+ }
+
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+ }
+
+loop:
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ i = 0;
+
+	/* Since we temporarily release the kernel mutex when
+	reading a database page below, the variable trx may have
+	become obsolete, and we must loop through the trx list to
+	reach, most likely, the same trx, or possibly some other
+	trx. */
+
+ while (trx && (i < nth_trx)) {
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ i++;
+ }
+
+ if (trx == NULL) {
+ lock_mutex_exit_kernel();
+
+ ut_ad(lock_validate());
+
+ return;
+ }
+
+ if (nth_lock == 0) {
+ fputs("---", file);
+ trx_print(file, trx, 600);
+
+ if (trx->read_view) {
+ fprintf(file,
+ "Trx read view will not see trx with"
+ " id >= " TRX_ID_FMT
+ ", sees < " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(
+ trx->read_view->low_limit_id),
+ TRX_ID_PREP_PRINTF(
+ trx->read_view->up_limit_id));
+ }
+
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+ fprintf(file,
+ "------- TRX HAS BEEN WAITING %lu SEC"
+ " FOR THIS LOCK TO BE GRANTED:\n",
+ (ulong) difftime(time(NULL),
+ trx->wait_started));
+
+ if (lock_get_type_low(trx->wait_lock) == LOCK_REC) {
+ lock_rec_print(file, trx->wait_lock);
+ } else {
+ lock_table_print(file, trx->wait_lock);
+ }
+
+ fputs("------------------\n", file);
+ }
+ }
+
+ if (!srv_print_innodb_lock_monitor && !srv_show_locks_held) {
+ nth_trx++;
+ goto loop;
+ }
+
+ i = 0;
+
+	/* See the note about the trx loop above for why we loop here:
+	lock may be an obsolete pointer now. */
+
+ lock = UT_LIST_GET_FIRST(trx->trx_locks);
+
+ while (lock && (i < nth_lock)) {
+ lock = UT_LIST_GET_NEXT(trx_locks, lock);
+ i++;
+ }
+
+ if (lock == NULL) {
+ nth_trx++;
+ nth_lock = 0;
+
+ goto loop;
+ }
+
+ if (lock_get_type_low(lock) == LOCK_REC) {
+ if (load_page_first) {
+ ulint space = lock->un_member.rec_lock.space;
+			ulint zip_size = fil_space_get_zip_size(space);
+ ulint page_no = lock->un_member.rec_lock.page_no;
+
+ if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+
+ /* It is a single table tablespace and
+ the .ibd file is missing (TRUNCATE
+ TABLE probably stole the locks): just
+ print the lock without attempting to
+ load the page in the buffer pool. */
+
+ fprintf(file, "RECORD LOCKS on"
+ " non-existing space %lu\n",
+ (ulong) space);
+ goto print_rec;
+ }
+
+ lock_mutex_exit_kernel();
+
+ mtr_start(&mtr);
+
+ buf_page_get_with_no_latch(space, zip_size,
+ page_no, &mtr);
+
+ mtr_commit(&mtr);
+
+ load_page_first = FALSE;
+
+ lock_mutex_enter_kernel();
+
+ goto loop;
+ }
+
+print_rec:
+ lock_rec_print(file, lock);
+ } else {
+ ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+ lock_table_print(file, lock);
+ }
+
+ load_page_first = TRUE;
+
+ nth_lock++;
+
+ if (nth_lock >= srv_show_locks_held) {
+ fputs("TOO MANY LOCKS PRINTED FOR THIS TRX:"
+ " SUPPRESSING FURTHER PRINTS\n",
+ file);
+
+ nth_trx++;
+ nth_lock = 0;
+
+ goto loop;
+ }
+
+ goto loop;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Validates the lock queue on a table.
+@return TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+ dict_table_t* table) /*!< in: table */
+{
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ lock = UT_LIST_GET_FIRST(table->locks);
+
+ while (lock) {
+ ut_a(((lock->trx)->conc_state == TRX_ACTIVE)
+ || ((lock->trx)->conc_state == TRX_PREPARED)
+ || ((lock->trx)->conc_state == TRX_COMMITTED_IN_MEMORY));
+
+ if (!lock_get_wait(lock)) {
+
+ ut_a(!lock_table_other_has_incompatible(
+ lock->trx, 0, table,
+ lock_get_mode(lock)));
+ } else {
+
+ ut_a(lock_table_has_to_wait_in_queue(lock));
+ }
+
+ lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a single record.
+@return TRUE if ok */
+static
+ibool
+lock_rec_queue_validate(
+/*====================*/
+ const buf_block_t* block, /*!< in: buffer block containing rec */
+ const rec_t* rec, /*!< in: record to look at */
+ dict_index_t* index, /*!< in: index, or NULL if not known */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_t* impl_trx;
+ lock_t* lock;
+ ulint heap_no;
+
+ ut_a(rec);
+ ut_a(block->frame == page_align(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter_kernel();
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ while (lock) {
+ switch(lock->trx->conc_state) {
+ case TRX_ACTIVE:
+ case TRX_PREPARED:
+ case TRX_COMMITTED_IN_MEMORY:
+ break;
+ default:
+ ut_error;
+ }
+
+ ut_a(trx_in_trx_list(lock->trx));
+
+ if (lock_get_wait(lock)) {
+ ut_a(lock_rec_has_to_wait_in_queue(lock));
+ }
+
+ if (index) {
+ ut_a(lock->index == index);
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+
+ lock_mutex_exit_kernel();
+
+ return(TRUE);
+ }
+
+ if (!index);
+ else if (dict_index_is_clust(index)) {
+
+ impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+
+ if (impl_trx
+ && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
+ block, heap_no, impl_trx)) {
+
+ ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, impl_trx));
+ }
+#if 0
+ } else {
+
+ /* The kernel mutex may get released temporarily in the
+ next function call: we have to release lock table mutex
+ to obey the latching order */
+
+ /* If this thread is holding the file space latch
+ (fil_space_t::latch), the following check WILL break
+ latching order and may cause a deadlock of threads. */
+
+ /* NOTE: This is a bogus check that would fail in the
+ following case: Our transaction is updating a
+ row. After it has updated the clustered index record,
+ it goes to a secondary index record and finds someone
+ else holding an explicit S- or X-lock on that
+ secondary index record, presumably from a locking
+ read. Our transaction cannot update the secondary
+ index immediately, but places a waiting X-lock request
+ on the secondary index record. There is nothing
+ illegal in this. The assertion is simply too strong. */
+
+ /* From the locking point of view, each secondary
+ index is a separate table. A lock that is held on
+ secondary index rec does not give any rights to modify
+ or read the clustered index rec. Therefore, we can
+ think of the sec index as a separate 'table' from the
+ clust index 'table'. Conversely, a transaction that
+ has acquired a lock on and modified a clustered index
+ record may need to wait for a lock on the
+ corresponding record in a secondary index. */
+
+ impl_trx = lock_sec_rec_some_has_impl_off_kernel(
+ rec, index, offsets);
+
+ if (impl_trx
+ && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
+ block, heap_no, impl_trx)) {
+
+ ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, impl_trx));
+ }
+#endif
+ }
+
+ lock = lock_rec_get_first(block, heap_no);
+
+ while (lock) {
+ ut_a(lock->trx->conc_state == TRX_ACTIVE
+ || lock->trx->conc_state == TRX_PREPARED
+ || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
+ ut_a(trx_in_trx_list(lock->trx));
+
+ if (index) {
+ ut_a(lock->index == index);
+ }
+
+ if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
+
+ enum lock_mode mode;
+
+ if (lock_get_mode(lock) == LOCK_S) {
+ mode = LOCK_X;
+ } else {
+ mode = LOCK_S;
+ }
+ ut_a(!lock_rec_other_has_expl_req(
+ mode, 0, 0, block, heap_no, lock->trx));
+
+ } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
+
+ ut_a(lock_rec_has_to_wait_in_queue(lock));
+ }
+
+ lock = lock_rec_get_next(heap_no, lock);
+ }
+
+ lock_mutex_exit_kernel();
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+@return TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no)/*!< in: page number */
+{
+ dict_index_t* index;
+ buf_block_t* block;
+ const page_t* page;
+ lock_t* lock;
+ const rec_t* rec;
+ ulint nth_lock = 0;
+ ulint nth_bit = 0;
+ ulint i;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ mtr_start(&mtr);
+
+ ut_ad(zip_size != ULINT_UNDEFINED);
+ block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ page = block->frame;
+
+ lock_mutex_enter_kernel();
+loop:
+ lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+ if (!lock) {
+ goto function_exit;
+ }
+
+ for (i = 0; i < nth_lock; i++) {
+
+ lock = lock_rec_get_next_on_page(lock);
+
+ if (!lock) {
+ goto function_exit;
+ }
+ }
+
+ ut_a(trx_in_trx_list(lock->trx));
+ ut_a(lock->trx->conc_state == TRX_ACTIVE
+ || lock->trx->conc_state == TRX_PREPARED
+ || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
+
+# ifdef UNIV_SYNC_DEBUG
+ /* Only validate the record queues when this thread is not
+ holding a space->latch. Deadlocks are possible due to
+ latching order violation when UNIV_DEBUG is defined while
+ UNIV_SYNC_DEBUG is not. */
+ if (!sync_thread_levels_contains(SYNC_FSP))
+# endif /* UNIV_SYNC_DEBUG */
+ for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
+
+ if (i == 1 || lock_rec_get_nth_bit(lock, i)) {
+
+ index = lock->index;
+ rec = page_find_rec_with_heap_no(page, i);
+ ut_a(rec);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ fprintf(stderr,
+ "Validating %lu %lu\n",
+ (ulong) space, (ulong) page_no);
+
+ lock_mutex_exit_kernel();
+
+ /* If this thread is holding the file space
+ latch (fil_space_t::latch), the following
+ check WILL break the latching order and may
+ cause a deadlock of threads. */
+
+ lock_rec_queue_validate(block, rec, index, offsets);
+
+ lock_mutex_enter_kernel();
+
+ nth_bit = i + 1;
+
+ goto loop;
+ }
+ }
+
+ nth_bit = 0;
+ nth_lock++;
+
+ goto loop;
+
+function_exit:
+ lock_mutex_exit_kernel();
+
+ mtr_commit(&mtr);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock system.
+@return TRUE if ok */
+static
+ibool
+lock_validate(void)
+/*===============*/
+{
+ lock_t* lock;
+ trx_t* trx;
+ dulint limit;
+ ulint space;
+ ulint page_no;
+ ulint i;
+
+ lock_mutex_enter_kernel();
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx) {
+ lock = UT_LIST_GET_FIRST(trx->trx_locks);
+
+ while (lock) {
+ if (lock_get_type_low(lock) & LOCK_TABLE) {
+
+ lock_table_queue_validate(
+ lock->un_member.tab_lock.table);
+ }
+
+ lock = UT_LIST_GET_NEXT(trx_locks, lock);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+
+ limit = ut_dulint_zero;
+
+ for (;;) {
+ lock = HASH_GET_FIRST(lock_sys->rec_hash, i);
+
+ while (lock) {
+ ut_a(trx_in_trx_list(lock->trx));
+
+ space = lock->un_member.rec_lock.space;
+ page_no = lock->un_member.rec_lock.page_no;
+
+ if (ut_dulint_cmp(
+ ut_dulint_create(space, page_no),
+ limit) >= 0) {
+ break;
+ }
+
+ lock = HASH_GET_NEXT(hash, lock);
+ }
+
+ if (!lock) {
+
+ break;
+ }
+
+ lock_mutex_exit_kernel();
+
+ lock_rec_validate_page(space,
+ fil_space_get_zip_size(space),
+ page_no);
+
+ lock_mutex_enter_kernel();
+
+ limit = ut_dulint_create(space, page_no + 1);
+ }
+ }
+
+ lock_mutex_exit_kernel();
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG bit is
+ set, does nothing */
+ const rec_t* rec, /*!< in: record after which to insert */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ dict_index_t* index, /*!< in: index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in/out: mini-transaction */
+ ibool* inherit)/*!< out: set to TRUE if the new
+ inserted record maybe should inherit
+ LOCK_GAP type locks from the successor
+ record */
+{
+ const rec_t* next_rec;
+ trx_t* trx;
+ lock_t* lock;
+ ulint err;
+ ulint next_rec_heap_no;
+
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+ next_rec = page_rec_get_next_const(rec);
+ next_rec_heap_no = page_rec_get_heap_no(next_rec);
+
+ lock_mutex_enter_kernel();
+
+ /* When inserting a record into an index, the table must be at
+ least IX-locked or we must be building an index, in which case
+ the table must be at least S-locked. */
+ ut_ad(lock_table_has(trx, index->table, LOCK_IX)
+ || (*index->name == TEMP_INDEX_PREFIX
+ && lock_table_has(trx, index->table, LOCK_S)));
+
+ lock = lock_rec_get_first(block, next_rec_heap_no);
+
+ if (UNIV_LIKELY(lock == NULL)) {
+ /* We optimize CPU time usage in the simplest case */
+
+ lock_mutex_exit_kernel();
+
+ if (!dict_index_is_clust(index)) {
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ trx->id, mtr);
+ }
+
+ *inherit = FALSE;
+
+ return(DB_SUCCESS);
+ }
+
+ *inherit = TRUE;
+
+	/* If another transaction has an explicit lock request which locks
+	the gap, waiting or granted, on the successor, the insert has to wait.
+
+	An exception is the case where the lock by the other transaction
+	is a gap type lock which it placed to wait for its turn to insert.
+	We do not consider that kind of lock conflicting with our insert.
+	This eliminates an unnecessary deadlock which used to arise when
+	two transactions both had waiting gap type lock requests on the
+	successor: each insert would then have had to wait for the other. */
+
+ if (lock_rec_other_has_conflicting(
+ LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+ block, next_rec_heap_no, trx)) {
+
+ /* Note that we may get DB_SUCCESS also here! */
+ err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP
+ | LOCK_INSERT_INTENTION,
+ block, next_rec_heap_no,
+ index, thr);
+ } else {
+ err = DB_SUCCESS;
+ }
+
+ lock_mutex_exit_kernel();
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ if (dict_index_is_clust(index)) {
+ break;
+ }
+ /* Update the page max trx id field */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ trx->id, mtr);
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(next_rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(lock_rec_queue_validate(block,
+ next_rec, index, offsets));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ return(err);
+}
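+
+/* A concrete illustration of the unnecessary deadlock that the gap-lock
+exception described in the comment above avoids (illustrative timeline
+only, not part of the original code):
+
+	1. trx A and trx B both try to insert before record R while a third
+	   transaction holds a conflicting lock on R; each enqueues a waiting
+	   LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION request on R.
+	2. The third transaction releases its lock on R.
+	3. If A's still-waiting gap type request were treated as conflicting
+	   with B's insert, and vice versa, A would wait for B and B for A:
+	   a deadlock although neither insert actually blocks the other.
+
+Because a gap type lock that was placed only to wait for an insert is not
+considered conflicting with another insert, both inserts can proceed once
+the real conflict is gone. */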
+
+/*********************************************************************//**
+If a transaction has an implicit x-lock on a record, but no explicit x-lock
+set on the record, sets one for it. NOTE that in the case of a secondary
+index, the kernel mutex may get temporarily released. */
+static
+void
+lock_rec_convert_impl_to_expl(
+/*==========================*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record on page */
+ dict_index_t* index, /*!< in: index of record */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ trx_t* impl_trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(page_rec_is_user_rec(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+
+ if (dict_index_is_clust(index)) {
+ impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+ } else {
+ impl_trx = lock_sec_rec_some_has_impl_off_kernel(
+ rec, index, offsets);
+ }
+
+ if (impl_trx) {
+ ulint heap_no = page_rec_get_heap_no(rec);
+
+ /* If the transaction has no explicit x-lock set on the
+ record, set one for it */
+
+ if (!lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
+ heap_no, impl_trx)) {
+
+ lock_rec_add_to_queue(
+ LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, impl_trx);
+ }
+ }
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+ ulint heap_no;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = rec_offs_comp(offsets)
+ ? rec_get_heap_no_new(rec)
+ : rec_get_heap_no_old(rec);
+
+ lock_mutex_enter_kernel();
+
+ ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
+ /* If a transaction has no explicit x-lock set on the record, set one
+ for it */
+
+ lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ lock_mutex_exit_kernel();
+
+ ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+ if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (delete
+mark or delete unmark) of a secondary index record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ buf_block_t* block, /*!< in/out: buffer block of rec */
+ const rec_t* rec, /*!< in: record which should be
+ modified; NOTE: as this is a secondary
+ index, we always have to modify the
+ clustered index record first: see the
+ comment below */
+ dict_index_t* index, /*!< in: secondary index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in/out: mini-transaction */
+{
+ ulint err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ /* Another transaction cannot have an implicit lock on the record,
+ because when we come here, we already have modified the clustered
+ index record, and this would not have been possible if another active
+ transaction had modified this secondary index record. */
+
+ lock_mutex_enter_kernel();
+
+ ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
+ err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+ block, heap_no, index, thr);
+
+ lock_mutex_exit_kernel();
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+
+ if (err == DB_SUCCESS || err == DB_SUCCESS_LOCKED_REC) {
+ /* Update the page max trx id field */
+ /* It might not be necessary to do this if
+ err == DB_SUCCESS (no new lock created),
+ but it should not cost too much performance. */
+ page_update_max_trx_id(block,
+ buf_block_get_page_zip(block),
+ thr_get_trx(thr)->id, mtr);
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: secondary index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ enum db_err err;
+ ulint heap_no;
+
+ ut_ad(!dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter_kernel();
+
+ ut_ad(mode != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad(mode != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
+ /* Some transaction may have an implicit x-lock on the record only
+ if the max trx id for the page >= min trx id for the trx list or a
+ database recovery is running. */
+
+ if (((ut_dulint_cmp(page_get_max_trx_id(block->frame),
+ trx_list_get_min_trx_id()) >= 0)
+ || recv_recovery_is_on())
+ && !page_rec_is_supremum(rec)) {
+
+ lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+ }
+
+ err = lock_rec_lock(FALSE, mode | gap_mode,
+ block, heap_no, index, thr);
+
+ lock_mutex_exit_kernel();
+
+ ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ enum db_err err;
+ ulint heap_no;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(block->frame == page_align(rec));
+ ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+ ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+ || gap_mode == LOCK_REC_NOT_GAP);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (flags & BTR_NO_LOCKING_FLAG) {
+
+ return(DB_SUCCESS);
+ }
+
+ heap_no = page_rec_get_heap_no(rec);
+
+ lock_mutex_enter_kernel();
+
+ ut_ad(mode != LOCK_X
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+ ut_ad(mode != LOCK_S
+ || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
+ if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
+
+ lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+ }
+
+ err = lock_rec_lock(FALSE, mode | gap_mode,
+ block, heap_no, index, thr);
+
+ lock_mutex_exit_kernel();
+
+ ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+ return(err);
+}
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+ ulint flags, /*!< in: if BTR_NO_LOCKING_FLAG
+ bit is set, does nothing */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: user record or page
+ supremum record which should
+ be read or passed over by a
+ read cursor */
+ dict_index_t* index, /*!< in: clustered index */
+ enum lock_mode mode, /*!< in: mode of the lock which
+ the read cursor should set on
+ records: LOCK_S or LOCK_X; the
+ latter is possible in
+ SELECT FOR UPDATE */
+ ulint gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ulint err;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+ err = lock_clust_rec_read_check_and_lock(flags, block, rec, index,
+ offsets, mode, gap_mode, thr);
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
+ err = DB_SUCCESS;
+ }
+
+ return(err);
+}
+
+/*******************************************************************//**
+Release the last lock from the transaction's autoinc locks. */
+UNIV_INLINE
+void
+lock_release_autoinc_last_lock(
+/*===========================*/
+ ib_vector_t* autoinc_locks) /*!< in/out: vector of AUTOINC locks */
+{
+ ulint last;
+ lock_t* lock;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_a(!ib_vector_is_empty(autoinc_locks));
+
+	/* The lock to be released must be the last lock acquired. */
+ last = ib_vector_size(autoinc_locks) - 1;
+ lock = ib_vector_get(autoinc_locks, last);
+
+ /* Should have only AUTOINC locks in the vector. */
+ ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
+ ut_a(lock_get_type(lock) == LOCK_TABLE);
+
+ ut_a(lock->un_member.tab_lock.table != NULL);
+
+ /* This will remove the lock from the trx autoinc_locks too. */
+ lock_table_dequeue(lock);
+}
+
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+UNIV_INTERN
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ ut_a(trx->autoinc_locks != NULL);
+
+ return(!ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/*******************************************************************//**
+Release all the transaction's autoinc locks. */
+UNIV_INTERN
+void
+lock_release_autoinc_locks(
+/*=======================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ ut_a(trx->autoinc_locks != NULL);
+
+	/* We release the locks in reverse order. This avoids
+	searching the vector for the element to delete at the
+	lower level. See lock_table_remove_low() for details. */
+ while (!ib_vector_is_empty(trx->autoinc_locks)) {
+
+ /* lock_table_remove_low() will also remove the lock from
+ the transaction's autoinc_locks vector. */
+ lock_release_autoinc_last_lock(trx->autoinc_locks);
+ }
+
+ /* Should release all locks. */
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+}
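+
+/* The reverse-order release above keeps each removal cheap: the lock
+being dequeued is always the last element of trx->autoinc_locks, so
+lock_table_remove_low() can simply pop the tail of the vector instead of
+searching it. A minimal sketch of the access pattern (illustrative only):
+
+	last = ib_vector_size(autoinc_locks) - 1;
+	lock = ib_vector_get(autoinc_locks, last);
+	lock_table_dequeue(lock);	(the vector shrinks by one)
+
+Releasing the locks in acquisition order instead would force every
+removal to scan the vector for the element to delete. */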
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for use outside of the
+lock module.
+@return LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ return(lock_get_type_low(lock));
+}
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return transaction id */
+UNIV_INTERN
+ullint
+lock_get_trx_id(
+/*============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ return(trx_get_id(lock->trx));
+}
+
+/*******************************************************************//**
+Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ibool is_gap_lock;
+
+ is_gap_lock = lock_get_type_low(lock) == LOCK_REC
+ && lock_rec_get_gap(lock);
+
+ switch (lock_get_mode(lock)) {
+ case LOCK_S:
+ if (is_gap_lock) {
+ return("S,GAP");
+ } else {
+ return("S");
+ }
+ case LOCK_X:
+ if (is_gap_lock) {
+ return("X,GAP");
+ } else {
+ return("X");
+ }
+ case LOCK_IS:
+ if (is_gap_lock) {
+ return("IS,GAP");
+ } else {
+ return("IS");
+ }
+ case LOCK_IX:
+ if (is_gap_lock) {
+ return("IX,GAP");
+ } else {
+ return("IX");
+ }
+ case LOCK_AUTO_INC:
+ return("AUTO_INC");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/*******************************************************************//**
+Gets the type of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ switch (lock_get_type_low(lock)) {
+ case LOCK_REC:
+ return("RECORD");
+ case LOCK_TABLE:
+ return("TABLE");
+ default:
+ return("UNKNOWN");
+ }
+}
+
+/*******************************************************************//**
+Gets the table on which the lock is.
+@return table */
+UNIV_INLINE
+dict_table_t*
+lock_get_table(
+/*===========*/
+ const lock_t* lock) /*!< in: lock */
+{
+ switch (lock_get_type_low(lock)) {
+ case LOCK_REC:
+ return(lock->index->table);
+ case LOCK_TABLE:
+ return(lock->un_member.tab_lock.table);
+ default:
+ ut_error;
+ return(NULL);
+ }
+}
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return id of the table */
+UNIV_INTERN
+ullint
+lock_get_table_id(
+/*==============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ dict_table_t* table;
+
+ table = lock_get_table(lock);
+
+ return((ullint)ut_conv_dulint_to_longlong(table->id));
+}
+
+/*******************************************************************//**
+Gets the name of the table on which the lock is.
+The string should not be free()'d or modified.
+@return name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ dict_table_t* table;
+
+ table = lock_get_table(lock);
+
+ return(table->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->index);
+}
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->index->name);
+}
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->un_member.rec_lock.space);
+}
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+ return(lock->un_member.rec_lock.page_no);
+}
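+
+/* The accessors above (lock_get_type_str(), lock_get_mode_str(),
+lock_get_table_name(), lock_get_trx_id(), lock_rec_get_space_id(),
+lock_rec_get_page_no(), ...) form the non-inline interface through which
+code outside the lock module, such as the INFORMATION_SCHEMA reporting
+code, reads lock attributes. A minimal, hypothetical caller could look
+like this (illustrative sketch only; report_lock() is not part of the
+server code):
+
+	static void report_lock(FILE* file, const lock_t* lock)
+	{
+		fprintf(file, "%s lock, mode %s, trx %llu, table %s\n",
+			lock_get_type_str(lock),
+			lock_get_mode_str(lock),
+			(unsigned long long) lock_get_trx_id(lock),
+			lock_get_table_name(lock));
+	}
+
+The returned strings are static and must not be freed or modified, as
+noted in the individual function comments. */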
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
new file mode 100644
index 00000000000..b9f19aeff31
--- /dev/null
+++ b/storage/xtradb/log/log0log.c
@@ -0,0 +1,3507 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.c
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "log0log.h"
+
+#ifdef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "srv0srv.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0boot.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+
+/*
+General philosophy of InnoDB redo-logs:
+
+1) Every change to the contents of a data page must be done
+through mtr, which in mtr_commit() writes log records
+to the InnoDB redo log.
+
+2) Normally these changes are performed using a mlog_write_ulint()
+or similar function.
+
+3) In some page-level operations only the code number of a
+C function and its parameters are written to the log, to
+reduce the size of the log.
+
+	3a) You should not add parameters to this kind of function
+	(e.g. trx_undo_header_create(), trx_undo_insert_header_reuse())
+
+	3b) You should not add functionality which either changes the
+	behaviour compared with the old one or depends on data outside
+	of the page. Such functions should implement a self-contained
+	page transformation, and they should be left unchanged unless
+	you have very essential reasons to change the log semantics
+	or format.
+
+*/
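+
+/* For illustration, the typical shape of points 1) and 2) above, as seen
+from a caller (a hedged sketch; page, offset and new_value are
+placeholders, not variables of this file):
+
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+
+	(latch the page through the mtr, then modify it, e.g.:)
+	mlog_write_ulint(page + offset, new_value, MLOG_4BYTES, &mtr);
+
+	mtr_commit(&mtr);	(writes the log records to the redo log)
+
+All redo log writes are funneled through mtr_commit() in this way: no
+data page is changed without a corresponding mini-transaction. */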
+
+/* Current free limit of space 0; protected by the log sys mutex; 0 means
+uninitialized */
+UNIV_INTERN ulint log_fsp_current_free_limit = 0;
+
+/* Global log system variable */
+UNIV_INTERN log_t* log_sys = NULL;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+
+/* These control how often we print warnings if the last checkpoint is too
+old */
+UNIV_INTERN ibool log_has_printed_chkp_warning = FALSE;
+UNIV_INTERN time_t log_last_warning_time;
+
+#ifdef UNIV_LOG_ARCHIVE
+/* Pointer to this variable is used as the i/o-message when we do i/o to an
+archive */
+UNIV_INTERN byte log_archive_io;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* A margin for free space in the log buffer before a log entry is catenated */
+#define LOG_BUF_WRITE_MARGIN (4 * OS_FILE_LOG_BLOCK_SIZE)
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO 2
+#define LOG_BUF_FLUSH_MARGIN (LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)
+
+/* Margin for the free space in the smallest log group, before a new query
+step which modifies the database is started */
+
+#define LOG_CHECKPOINT_FREE_PER_THREAD (4 * UNIV_PAGE_SIZE)
+#define LOG_CHECKPOINT_EXTRA_FREE (8 * UNIV_PAGE_SIZE)
+
+/* This parameter controls asynchronous making of a new checkpoint; the value
+should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
+
+#define LOG_POOL_CHECKPOINT_RATIO_ASYNC 32
+
+/* This parameter controls synchronous preflushing of modified buffer pages */
+#define LOG_POOL_PREFLUSH_RATIO_SYNC 16
+
+/* The same ratio for asynchronous preflushing; this value should be less than
+the previous */
+#define LOG_POOL_PREFLUSH_RATIO_ASYNC 8
+
+/* Extra margin, in addition to one log file, used in archiving */
+#define LOG_ARCHIVE_EXTRA_MARGIN (4 * UNIV_PAGE_SIZE)
+
+/* This parameter controls asynchronous writing to the archive */
+#define LOG_ARCHIVE_RATIO_ASYNC 16
+
+/* Codes used in unlocking flush latches */
+#define LOG_UNLOCK_NONE_FLUSHED_LOCK 1
+#define LOG_UNLOCK_FLUSH_LOCK 2
+
+/* States of an archiving operation */
+#define LOG_ARCHIVE_READ 1
+#define LOG_ARCHIVE_WRITE 2
+
+/******************************************************//**
+Completes a checkpoint write i/o to a log file. */
+static
+void
+log_io_complete_checkpoint(void);
+/*============================*/
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Completes an archiving i/o. */
+static
+void
+log_io_complete_archive(void);
+/*=========================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/****************************************************************//**
+Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
+so that we know that the limit has been written to a log checkpoint field
+on disk. */
+UNIV_INTERN
+void
+log_fsp_current_free_limit_set_and_checkpoint(
+/*==========================================*/
+ ulint limit) /*!< in: limit to set */
+{
+ ibool success;
+
+ mutex_enter(&(log_sys->mutex));
+
+ log_fsp_current_free_limit = limit;
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Try to make a synchronous checkpoint */
+
+ success = FALSE;
+
+ while (!success) {
+ success = log_checkpoint(TRUE, TRUE);
+ }
+}
+
+/****************************************************************//**
+Returns the oldest modified block lsn in the pool, or log_sys->lsn if none
+exists.
+@return LSN of oldest modification */
+static
+ib_uint64_t
+log_buf_pool_get_oldest_modification(void)
+/*======================================*/
+{
+ ib_uint64_t lsn;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ lsn = buf_pool_get_oldest_modification();
+
+ if (!lsn) {
+
+ lsn = log_sys->lsn;
+ }
+
+ return(lsn);
+}
+
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release.
+@return start lsn of the log record */
+UNIV_INTERN
+ib_uint64_t
+log_reserve_and_open(
+/*=================*/
+ ulint len) /*!< in: length of data to be catenated */
+{
+ log_t* log = log_sys;
+ ulint len_upper_limit;
+#ifdef UNIV_LOG_ARCHIVE
+ ulint archived_lsn_age;
+ ulint dummy;
+#endif /* UNIV_LOG_ARCHIVE */
+#ifdef UNIV_DEBUG
+ ulint count = 0;
+#endif /* UNIV_DEBUG */
+
+ ut_a(len < log->buf_size / 2);
+loop:
+ mutex_enter(&(log->mutex));
+ ut_ad(!recv_no_log_write);
+
+ /* Calculate an upper limit for the space the string may take in the
+ log buffer */
+
+ len_upper_limit = LOG_BUF_WRITE_MARGIN + (5 * len) / 4;
+
+ if (log->buf_free + len_upper_limit > log->buf_size) {
+
+ mutex_exit(&(log->mutex));
+
+		/* Not enough free space: do a synchronous flush of the log
+		buffer */
+
+ log_buffer_flush_to_disk();
+
+ srv_log_waits++;
+
+ ut_ad(++count < 50);
+
+ goto loop;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (log->archiving_state != LOG_ARCH_OFF) {
+
+ archived_lsn_age = log->lsn - log->archived_lsn;
+ if (archived_lsn_age + len_upper_limit
+ > log->max_archived_lsn_age) {
+ /* Not enough free archived space in log groups: do a
+ synchronous archive write batch: */
+
+ mutex_exit(&(log->mutex));
+
+ ut_ad(len_upper_limit <= log->max_archived_lsn_age);
+
+ log_archive_do(TRUE, &dummy);
+
+ ut_ad(++count < 50);
+
+ goto loop;
+ }
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifdef UNIV_LOG_DEBUG
+ log->old_buf_free = log->buf_free;
+ log->old_lsn = log->lsn;
+#endif
+ return(log->lsn);
+}
+
+/************************************************************//**
+Writes the given string to the log. It is assumed that the caller holds the
+log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+ byte* str, /*!< in: string */
+ ulint str_len) /*!< in: string length */
+{
+ log_t* log = log_sys;
+ ulint len;
+ ulint data_len;
+ byte* log_block;
+
+ ut_ad(mutex_own(&(log->mutex)));
+part_loop:
+ ut_ad(!recv_no_log_write);
+ /* Calculate a part length */
+
+ data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
+
+ if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+ /* The string fits within the current log block */
+
+ len = str_len;
+ } else {
+ data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+
+ len = OS_FILE_LOG_BLOCK_SIZE
+ - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
+ - LOG_BLOCK_TRL_SIZE;
+ }
+
+ ut_memcpy(log->buf + log->buf_free, str, len);
+
+ str_len -= len;
+ str = str + len;
+
+ log_block = ut_align_down(log->buf + log->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE);
+ log_block_set_data_len(log_block, data_len);
+
+ if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+ /* This block became full */
+ log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
+ log_block_set_checkpoint_no(log_block,
+ log_sys->next_checkpoint_no);
+ len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
+
+ log->lsn += len;
+
+ /* Initialize the next block header */
+ log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
+ } else {
+ log->lsn += len;
+ }
+
+ log->buf_free += len;
+
+ ut_ad(log->buf_free <= log->buf_size);
+
+ if (str_len > 0) {
+ goto part_loop;
+ }
+
+ srv_log_write_requests++;
+}
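+
+/* The intended calling protocol around log_reserve_and_open() and
+log_write_low(), as described in their comments and used for example by
+log_pad_current_log_block() below (a sketch; start_lsn, str and len are
+placeholders):
+
+	start_lsn = log_reserve_and_open(len);	(acquires the log mutex)
+
+	log_write_low(str, len);		(may be called repeatedly)
+
+	end_lsn = log_close();			(still holding the mutex)
+	log_release();				(releases the log mutex)
+
+The reservation guarantees that the log buffer has room for the string,
+so log_write_low() itself never needs to flush. */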
+
+/************************************************************//**
+Returns the limit for the age of the oldest modified page in the buffer
+pool before an asynchronous preflush is started. The limit is capped by
+srv_checkpoint_age_target when that is set.
+@return age limit (an lsn difference) */
+UNIV_INLINE
+ulint
+log_max_modified_age_async()
+{
+ if (srv_checkpoint_age_target) {
+ return(ut_min(log_sys->max_modified_age_async,
+ srv_checkpoint_age_target
+ - srv_checkpoint_age_target / 8));
+ } else {
+ return(log_sys->max_modified_age_async);
+ }
+}
+
+/************************************************************//**
+Returns the limit for the checkpoint age before an asynchronous
+checkpoint is started. The limit is capped by srv_checkpoint_age_target
+when that is set.
+@return age limit (an lsn difference) */
+UNIV_INLINE
+ulint
+log_max_checkpoint_age_async()
+{
+ if (srv_checkpoint_age_target) {
+ return(ut_min(log_sys->max_checkpoint_age_async,
+ srv_checkpoint_age_target));
+ } else {
+ return(log_sys->max_checkpoint_age_async);
+ }
+}
+
+/************************************************************//**
+Closes the log.
+@return lsn */
+UNIV_INTERN
+ib_uint64_t
+log_close(void)
+/*===========*/
+{
+ byte* log_block;
+ ulint first_rec_group;
+ ib_uint64_t oldest_lsn;
+ ib_uint64_t lsn;
+ log_t* log = log_sys;
+ ib_uint64_t checkpoint_age;
+
+ ut_ad(mutex_own(&(log->mutex)));
+ ut_ad(!recv_no_log_write);
+
+ lsn = log->lsn;
+
+ log_block = ut_align_down(log->buf + log->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE);
+ first_rec_group = log_block_get_first_rec_group(log_block);
+
+ if (first_rec_group == 0) {
+ /* We initialized a new log block which was not written
+ full by the current mtr: the next mtr log record group
+ will start within this block at the offset data_len */
+
+ log_block_set_first_rec_group(
+ log_block, log_block_get_data_len(log_block));
+ }
+
+ if (log->buf_free > log->max_buf_free) {
+
+ log->check_flush_or_checkpoint = TRUE;
+ }
+
+ checkpoint_age = lsn - log->last_checkpoint_lsn;
+
+ if (checkpoint_age >= log->log_group_capacity) {
+ /* TODO: split btr_store_big_rec_extern_fields() into small
+ steps so that we can release all latches in the middle, and
+ call log_free_check() to ensure we never write over log written
+ after the latest checkpoint. In principle, we should split all
+ big_rec operations, but other operations are smaller. */
+
+ if (!log_has_printed_chkp_warning
+ || difftime(time(NULL), log_last_warning_time) > 15) {
+
+ log_has_printed_chkp_warning = TRUE;
+ log_last_warning_time = time(NULL);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: the age of the last"
+ " checkpoint is %lu,\n"
+ "InnoDB: which exceeds the log group"
+ " capacity %lu.\n"
+ "InnoDB: If you are using big"
+ " BLOB or TEXT rows, you must set the\n"
+ "InnoDB: combined size of log files"
+ " at least 10 times bigger than the\n"
+ "InnoDB: largest such row.\n",
+ (ulong) checkpoint_age,
+ (ulong) log->log_group_capacity);
+ }
+ }
+
+ if (checkpoint_age <= log_max_modified_age_async()) {
+
+ goto function_exit;
+ }
+
+ oldest_lsn = buf_pool_get_oldest_modification();
+
+ if (!oldest_lsn
+ || lsn - oldest_lsn > log_max_modified_age_async()
+ || checkpoint_age > log_max_checkpoint_age_async()) {
+
+ log->check_flush_or_checkpoint = TRUE;
+ }
+function_exit:
+
+#ifdef UNIV_LOG_DEBUG
+ log_check_log_recs(log->buf + log->old_buf_free,
+ log->buf_free - log->old_buf_free, log->old_lsn);
+#endif
+
+ return(lsn);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Pads the current log block full with dummy log records. Used in producing
+consistent archived log files. */
+static
+void
+log_pad_current_log_block(void)
+/*===========================*/
+{
+ byte b = MLOG_DUMMY_RECORD;
+ ulint pad_length;
+ ulint i;
+ ib_uint64_t lsn;
+
+ /* We retrieve lsn only because otherwise gcc crashed on HP-UX */
+ lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
+
+ pad_length = OS_FILE_LOG_BLOCK_SIZE
+ - (log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE)
+ - LOG_BLOCK_TRL_SIZE;
+
+ for (i = 0; i < pad_length; i++) {
+ log_write_low(&b, 1);
+ }
+
+ lsn = log_sys->lsn;
+
+ log_close();
+ log_release();
+
+ ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/******************************************************//**
+Calculates the data capacity of a log group, when the log file headers are not
+included.
+@return capacity in bytes */
+UNIV_INTERN
+ulint
+log_group_get_capacity(
+/*===================*/
+ const log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return((group->file_size - LOG_FILE_HDR_SIZE) * group->n_files);
+}
+
+/******************************************************//**
+Calculates the offset within a log group, when the log file headers are not
+included.
+@return size offset (<= offset) */
+UNIV_INLINE
+ulint
+log_group_calc_size_offset(
+/*=======================*/
+ ulint offset, /*!< in: real offset within the
+ log group */
+ const log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return(offset - LOG_FILE_HDR_SIZE * (1 + offset / group->file_size));
+}
+
+/******************************************************//**
+Calculates the offset within a log group, when the log file headers are
+included.
+@return real offset (>= offset) */
+UNIV_INLINE
+ulint
+log_group_calc_real_offset(
+/*=======================*/
+ ulint offset, /*!< in: size offset within the
+ log group */
+ const log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ return(offset + LOG_FILE_HDR_SIZE
+ * (1 + offset / (group->file_size - LOG_FILE_HDR_SIZE)));
+}
+
+/******************************************************//**
+Calculates the offset of an lsn within a log group.
+@return offset within the log group */
+static
+ulint
+log_group_calc_lsn_offset(
+/*======================*/
+ ib_uint64_t lsn, /*!< in: lsn, must be within 4 GB of
+ group->lsn */
+ const log_group_t* group) /*!< in: log group */
+{
+ ib_uint64_t gr_lsn;
+ ib_int64_t gr_lsn_size_offset;
+ ib_int64_t difference;
+ ib_int64_t group_size;
+ ib_int64_t offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ /* If total log file size is > 2 GB we can easily get overflows
+ with 32-bit integers. Use 64-bit integers instead. */
+
+ gr_lsn = group->lsn;
+
+ gr_lsn_size_offset = (ib_int64_t)
+ log_group_calc_size_offset(group->lsn_offset, group);
+
+ group_size = (ib_int64_t) log_group_get_capacity(group);
+
+ if (lsn >= gr_lsn) {
+
+ difference = (ib_int64_t) (lsn - gr_lsn);
+ } else {
+ difference = (ib_int64_t) (gr_lsn - lsn);
+
+ difference = difference % group_size;
+
+ difference = group_size - difference;
+ }
+
+ offset = (gr_lsn_size_offset + difference) % group_size;
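+	/* offset is a 'size offset', i.e. it ignores the per-file log
+	headers; log_group_calc_real_offset() below adds them back in. */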
+
+ if (sizeof(ulint) == 4) {
+ ut_a(offset < (((ib_int64_t) 1) << 32)); /* offset must be < 4 GB */
+ }
+
+ /* fprintf(stderr,
+ "Offset is %lu gr_lsn_offset is %lu difference is %lu\n",
+ (ulint)offset,(ulint)gr_lsn_size_offset, (ulint)difference);
+ */
+
+ return(log_group_calc_real_offset((ulint)offset, group));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool log_debug_writes = FALSE;
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Calculates where in log files we find a specified lsn.
+@return log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+ ib_int64_t* log_file_offset, /*!< out: offset in that file
+ (including the header) */
+ ib_uint64_t first_header_lsn, /*!< in: first log file start
+ lsn */
+ ib_uint64_t lsn, /*!< in: lsn whose position to
+ determine */
+ ulint n_log_files, /*!< in: total number of log
+ files */
+ ib_int64_t log_file_size) /*!< in: log file size
+ (including the header) */
+{
+ ib_int64_t capacity = log_file_size - LOG_FILE_HDR_SIZE;
+ ulint file_no;
+ ib_int64_t add_this_many;
+
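+	/* If lsn lies before the start lsn of the first log file header,
+	shift it forward by whole multiples of the combined capacity of all
+	log files (headers excluded), so that the modulo arithmetic below
+	selects the correct file. */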
+ if (lsn < first_header_lsn) {
+ add_this_many = 1 + (first_header_lsn - lsn)
+ / (capacity * (ib_int64_t)n_log_files);
+ lsn += add_this_many
+ * capacity * (ib_int64_t)n_log_files;
+ }
+
+ ut_a(lsn >= first_header_lsn);
+
+ file_no = ((ulint)((lsn - first_header_lsn) / capacity))
+ % n_log_files;
+ *log_file_offset = (lsn - first_header_lsn) % capacity;
+
+ *log_file_offset = *log_file_offset + LOG_FILE_HDR_SIZE;
+
+ return(file_no);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+ log_group_t* group, /*!< in/out: group */
+ ib_uint64_t lsn) /*!< in: lsn for which the values should be
+ set */
+{
+ group->lsn_offset = log_group_calc_lsn_offset(lsn, group);
+ group->lsn = lsn;
+}
+
+/*****************************************************************//**
+Calculates the recommended highest values for lsn - last_checkpoint_lsn,
+lsn - buf_get_oldest_modification(), and lsn - max_archive_lsn_age.
+@return FALSE if the smallest log group is too small to accommodate
+the number of OS threads in the database server */
+static
+ibool
+log_calc_max_ages(void)
+/*===================*/
+{
+ log_group_t* group;
+ ulint margin;
+ ulint free;
+ ibool success = TRUE;
+ ulint smallest_capacity;
+ ulint archive_margin;
+ ulint smallest_archive_margin;
+
+ mutex_enter(&(log_sys->mutex));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ ut_ad(group);
+
+ smallest_capacity = ULINT_MAX;
+ smallest_archive_margin = ULINT_MAX;
+
+ while (group) {
+ if (log_group_get_capacity(group) < smallest_capacity) {
+
+ smallest_capacity = log_group_get_capacity(group);
+ }
+
+ archive_margin = log_group_get_capacity(group)
+ - (group->file_size - LOG_FILE_HDR_SIZE)
+ - LOG_ARCHIVE_EXTRA_MARGIN;
+
+ if (archive_margin < smallest_archive_margin) {
+
+ smallest_archive_margin = archive_margin;
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ /* Add extra safety */
+ smallest_capacity = smallest_capacity - smallest_capacity / 10;
+
+ /* For each OS thread we must reserve so much free space in the
+ smallest log group that it can accommodate the log entries produced
+ by single query steps: running out of free log space is a serious
+ system error which requires rebooting the database. */
+
+ free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
+ + LOG_CHECKPOINT_EXTRA_FREE;
+ if (free >= smallest_capacity / 2) {
+ success = FALSE;
+
+ goto failure;
+ } else {
+ margin = smallest_capacity - free;
+ }
+
+ margin = ut_min(margin, log_sys->adm_checkpoint_interval);
+
+ margin = margin - margin / 10; /* Add still some extra safety */
+
+ log_sys->log_group_capacity = smallest_capacity;
+
+ log_sys->max_modified_age_async = margin
+ - margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
+ log_sys->max_modified_age_sync = margin
+ - margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
+
+ log_sys->max_checkpoint_age_async = margin - margin
+ / LOG_POOL_CHECKPOINT_RATIO_ASYNC;
+ log_sys->max_checkpoint_age = margin;
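+	/* With the stock LOG_POOL_* ratio constants these limits end up
+	ordered max_modified_age_async < max_modified_age_sync
+	< max_checkpoint_age_async < max_checkpoint_age, so asynchronous
+	action is attempted before the corresponding synchronous limit is
+	reached. */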
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_sys->max_archived_lsn_age = smallest_archive_margin;
+
+ log_sys->max_archived_lsn_age_async = smallest_archive_margin
+ - smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC;
+#endif /* UNIV_LOG_ARCHIVE */
+failure:
+ mutex_exit(&(log_sys->mutex));
+
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Error: ib_logfiles are too small"
+ " for innodb_thread_concurrency %lu.\n"
+ "InnoDB: The combined size of ib_logfiles"
+ " should be bigger than\n"
+ "InnoDB: 200 kB * innodb_thread_concurrency.\n"
+ "InnoDB: To get mysqld to start up, set"
+ " innodb_thread_concurrency in my.cnf\n"
+ "InnoDB: to a lower value, for example, to 8."
+ " After an ERROR-FREE shutdown\n"
+ "InnoDB: of mysqld you can adjust the size of"
+ " ib_logfiles, as explained in\n"
+ "InnoDB: " REFMAN "adding-and-removing.html\n"
+ "InnoDB: Cannot continue operation."
+ " Calling exit(1).\n",
+ (ulong)srv_thread_concurrency);
+
+ exit(1);
+ }
+
+ return(success);
+}
+
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void)
+/*==========*/
+{
+ log_sys = mem_alloc(sizeof(log_t));
+
+ mutex_create(&log_sys->mutex, SYNC_LOG);
+
+ mutex_enter(&(log_sys->mutex));
+
+	/* Start the lsn one log block above zero: this way every
+	log record has a start lsn != zero, a fact which we will use */
+
+ log_sys->lsn = LOG_START_LSN;
+
+ ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
+ ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
+
+ log_sys->buf_ptr = mem_alloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+ log_sys->buf = ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE);
+
+ log_sys->buf_size = LOG_BUFFER_SIZE;
+
+ memset(log_sys->buf, '\0', LOG_BUFFER_SIZE);
+
+ log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
+ - LOG_BUF_FLUSH_MARGIN;
+ log_sys->check_flush_or_checkpoint = TRUE;
+ UT_LIST_INIT(log_sys->log_groups);
+
+ log_sys->n_log_ios = 0;
+
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = time(NULL);
+ /*----------------------------*/
+
+ log_sys->buf_next_to_write = 0;
+
+ log_sys->write_lsn = 0;
+ log_sys->current_flush_lsn = 0;
+ log_sys->flushed_to_disk_lsn = 0;
+
+ log_sys->written_to_some_lsn = log_sys->lsn;
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->n_pending_writes = 0;
+
+ log_sys->no_flush_event = os_event_create(NULL);
+
+ os_event_set(log_sys->no_flush_event);
+
+ log_sys->one_flushed_event = os_event_create(NULL);
+
+ os_event_set(log_sys->one_flushed_event);
+
+ /*----------------------------*/
+ log_sys->adm_checkpoint_interval = ULINT_MAX;
+
+ log_sys->next_checkpoint_no = 0;
+ log_sys->last_checkpoint_lsn = log_sys->lsn;
+ log_sys->n_pending_checkpoint_writes = 0;
+
+ rw_lock_create(&log_sys->checkpoint_lock, SYNC_NO_ORDER_CHECK);
+
+ log_sys->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+ log_sys->checkpoint_buf = ut_align(log_sys->checkpoint_buf_ptr,
+ OS_FILE_LOG_BLOCK_SIZE);
+ memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+ /*----------------------------*/
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Under MySQL, log archiving is always off */
+ log_sys->archiving_state = LOG_ARCH_OFF;
+ log_sys->archived_lsn = log_sys->lsn;
+ log_sys->next_archived_lsn = 0;
+
+ log_sys->n_pending_archive_ios = 0;
+
+ rw_lock_create(&log_sys->archive_lock, SYNC_NO_ORDER_CHECK);
+
+ log_sys->archive_buf = NULL;
+
+ /* ut_align(
+ ut_malloc(LOG_ARCHIVE_BUF_SIZE
+ + OS_FILE_LOG_BLOCK_SIZE),
+ OS_FILE_LOG_BLOCK_SIZE); */
+ log_sys->archive_buf_size = 0;
+
+ /* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */
+
+ log_sys->archiving_on = os_event_create(NULL);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /*----------------------------*/
+
+ log_block_init(log_sys->buf, log_sys->lsn);
+ log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
+
+ log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
+ log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
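+	/* Both buf_free and lsn now point just past the header of the
+	first log block, so the first log record is appended directly
+	after LOG_BLOCK_HDR_SIZE bytes. */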
+
+ mutex_exit(&(log_sys->mutex));
+
+#ifdef UNIV_LOG_DEBUG
+ recv_sys_create();
+ recv_sys_init(buf_pool_get_curr_size());
+
+ recv_sys->parse_start_lsn = log_sys->lsn;
+ recv_sys->scanned_lsn = log_sys->lsn;
+ recv_sys->scanned_checkpoint_no = 0;
+ recv_sys->recovered_lsn = log_sys->lsn;
+ recv_sys->limit_lsn = IB_ULONGLONG_MAX;
+#endif
+}
+
+/******************************************************************//**
+Initializes a log group and adds it to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+ ulint id, /*!< in: group id */
+ ulint n_files, /*!< in: number of log files */
+ ulint file_size, /*!< in: log file size in bytes */
+ ulint space_id, /*!< in: space id of the file space
+ which contains the log files of this
+ group */
+ ulint archive_space_id __attribute__((unused)))
+ /*!< in: space id of the file space
+ which contains some archived log
+					files for this group; currently,
+					this is used only for the first
+					log group */
+{
+ ulint i;
+
+ log_group_t* group;
+
+ group = mem_alloc(sizeof(log_group_t));
+
+ group->id = id;
+ group->n_files = n_files;
+ group->file_size = file_size;
+ group->space_id = space_id;
+ group->state = LOG_GROUP_OK;
+ group->lsn = LOG_START_LSN;
+ group->lsn_offset = LOG_FILE_HDR_SIZE;
+ group->n_pending_writes = 0;
+
+ group->file_header_bufs_ptr = mem_alloc(sizeof(byte*) * n_files);
+ group->file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
+#ifdef UNIV_LOG_ARCHIVE
+ group->archive_file_header_bufs_ptr = mem_alloc(
+ sizeof(byte*) * n_files);
+ group->archive_file_header_bufs = mem_alloc(sizeof(byte*) * n_files);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ for (i = 0; i < n_files; i++) {
+ group->file_header_bufs_ptr[i] = mem_alloc(
+ LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+
+ group->file_header_bufs[i] = ut_align(
+ group->file_header_bufs_ptr[i],
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ memset(*(group->file_header_bufs + i), '\0',
+ LOG_FILE_HDR_SIZE);
+
+#ifdef UNIV_LOG_ARCHIVE
+ group->archive_file_header_bufs_ptr[i] = mem_alloc(
+ LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+
+ group->archive_file_header_bufs[i] = ut_align(
+ group->archive_file_header_bufs_ptr[i],
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ memset(*(group->archive_file_header_bufs + i), '\0',
+ LOG_FILE_HDR_SIZE);
+#endif /* UNIV_LOG_ARCHIVE */
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ group->archive_space_id = archive_space_id;
+
+ group->archived_file_no = 0;
+ group->archived_offset = 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ group->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+ group->checkpoint_buf = ut_align(group->checkpoint_buf_ptr,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ memset(group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+
+ UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, group);
+
+ ut_a(log_calc_max_ages());
+}
+
+/******************************************************************//**
+Does the unlockings needed in flush i/o completion. */
+UNIV_INLINE
+void
+log_flush_do_unlocks(
+/*=================*/
+ ulint code) /*!< in: any ORed combination of LOG_UNLOCK_FLUSH_LOCK
+ and LOG_UNLOCK_NONE_FLUSHED_LOCK */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ /* NOTE that we must own the log mutex when doing the setting of the
+ events: this is because transactions will wait for these events to
+ be set, and at that moment the log flush they were waiting for must
+ have ended. If the log mutex were not reserved here, the i/o-thread
+ calling this function might be preempted for a while, and when it
+ resumed execution, it might be that a new flush had been started, and
+ this function would erroneously signal the NEW flush as completed.
+ Thus, the changes in the state of these events are performed
+ atomically in conjunction with the changes in the state of
+ log_sys->n_pending_writes etc. */
+
+ if (code & LOG_UNLOCK_NONE_FLUSHED_LOCK) {
+ os_event_set(log_sys->one_flushed_event);
+ }
+
+ if (code & LOG_UNLOCK_FLUSH_LOCK) {
+ os_event_set(log_sys->no_flush_event);
+ }
+}
+
+/******************************************************************//**
+Checks if a flush is completed for a log group and does the completion
+routine if yes.
+@return LOG_UNLOCK_NONE_FLUSHED_LOCK or 0 */
+UNIV_INLINE
+ulint
+log_group_check_flush_completion(
+/*=============================*/
+ log_group_t* group) /*!< in: log group */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (!log_sys->one_flushed && group->n_pending_writes == 0) {
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Log flushed first to group %lu\n",
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+ log_sys->written_to_some_lsn = log_sys->write_lsn;
+ log_sys->one_flushed = TRUE;
+
+ return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes && (group->n_pending_writes == 0)) {
+
+ fprintf(stderr, "Log flushed to group %lu\n",
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+ return(0);
+}
+
+/******************************************************//**
+Checks if a flush is completed and does the completion routine if yes.
+@return LOG_UNLOCK_FLUSH_LOCK or 0 */
+static
+ulint
+log_sys_check_flush_completion(void)
+/*================================*/
+{
+ ulint move_start;
+ ulint move_end;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (log_sys->n_pending_writes == 0) {
+
+ log_sys->written_to_all_lsn = log_sys->write_lsn;
+ log_sys->buf_next_to_write = log_sys->write_end_offset;
+
+ if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
+ /* Move the log buffer content to the start of the
+ buffer */
+
+ move_start = ut_calc_align_down(
+ log_sys->write_end_offset,
+ OS_FILE_LOG_BLOCK_SIZE);
+ move_end = ut_calc_align(log_sys->buf_free,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_memmove(log_sys->buf, log_sys->buf + move_start,
+ move_end - move_start);
+ log_sys->buf_free -= move_start;
+
+ log_sys->buf_next_to_write -= move_start;
+ }
+
+ return(LOG_UNLOCK_FLUSH_LOCK);
+ }
+
+ return(0);
+}
+
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+ log_group_t* group) /*!< in: log group or a dummy pointer */
+{
+ ulint unlock;
+
+#ifdef UNIV_LOG_ARCHIVE
+ if ((byte*)group == &log_archive_io) {
+ /* It was an archive write */
+
+ log_io_complete_archive();
+
+ return;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
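+	/* log_group_checkpoint() passes the group pointer to fil_io() with
+	1 added to it, so an odd pointer value here means that this i/o was
+	a checkpoint info write, not an ordinary log write. */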
+ if ((ulint)group & 0x1UL) {
+ /* It was a checkpoint write */
+ group = (log_group_t*)((ulint)group - 1);
+
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
+ fil_flush(group->space_id);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Checkpoint info written to group %lu\n",
+ group->id);
+ }
+#endif /* UNIV_DEBUG */
+ log_io_complete_checkpoint();
+
+ return;
+ }
+
+	ut_error;	/* We currently use synchronous writing of the
+			logs and cannot end up here! */
+
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && srv_flush_log_at_trx_commit != 2) {
+
+ fil_flush(group->space_id);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+ ut_ad(!recv_no_log_write);
+
+ ut_a(group->n_pending_writes > 0);
+ ut_a(log_sys->n_pending_writes > 0);
+
+ group->n_pending_writes--;
+ log_sys->n_pending_writes--;
+
+ unlock = log_group_check_flush_completion(group);
+ unlock = unlock | log_sys_check_flush_completion();
+
+ log_flush_do_unlocks(unlock);
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/******************************************************//**
+Writes a log file header to a log file space. */
+static
+void
+log_group_file_header_flush(
+/*========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint nth_file, /*!< in: header to the nth file in the
+ log file space */
+ ib_uint64_t start_lsn) /*!< in: log file data starts at this
+ lsn */
+{
+ byte* buf;
+ ulint dest_offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(!recv_no_log_write);
+ ut_a(nth_file < group->n_files);
+
+ buf = *(group->file_header_bufs + nth_file);
+
+ mach_write_to_4(buf + LOG_GROUP_ID, group->id);
+ mach_write_ull(buf + LOG_FILE_START_LSN, start_lsn);
+
+ /* Wipe over possible label of ibbackup --restore */
+ memcpy(buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, " ", 4);
+
+ dest_offset = nth_file * group->file_size;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Writing log file header to group %lu file %lu\n",
+ (ulong) group->id, (ulong) nth_file);
+ }
+#endif /* UNIV_DEBUG */
+ if (log_do_write) {
+ log_sys->n_log_ios++;
+
+ srv_os_log_pending_writes++;
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
+ dest_offset / UNIV_PAGE_SIZE,
+ dest_offset % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE,
+ buf, group);
+
+ srv_os_log_pending_writes--;
+ }
+}
+
+/******************************************************//**
+Stores a 4-byte checksum to the trailer checksum field of a log block
+before writing it to a log file. This checksum is used in recovery to
+check the consistency of a log block. */
+static
+void
+log_block_store_checksum(
+/*=====================*/
+ byte* block) /*!< in/out: pointer to a log block */
+{
+ log_block_set_checksum(block, log_block_calc_checksum(block));
+}
+
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ byte* buf, /*!< in: buffer */
+ ulint len, /*!< in: buffer len; must be divisible
+ by OS_FILE_LOG_BLOCK_SIZE */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the buffer; must
+ be divisible by
+ OS_FILE_LOG_BLOCK_SIZE */
+ ulint new_data_offset)/*!< in: start offset of new data in
+ buf: this parameter is used to decide
+ if we have to write a new log file
+ header */
+{
+ ulint write_len;
+ ibool write_header;
+ ulint next_offset;
+ ulint i;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(!recv_no_log_write);
+ ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_a(((ulint) start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ if (new_data_offset == 0) {
+ write_header = TRUE;
+ } else {
+ write_header = FALSE;
+ }
+loop:
+ if (len == 0) {
+
+ return;
+ }
+
+ next_offset = log_group_calc_lsn_offset(start_lsn, group);
+
+ if ((next_offset % group->file_size == LOG_FILE_HDR_SIZE)
+ && write_header) {
+ /* We start to write a new log file instance in the group */
+
+ log_group_file_header_flush(group,
+ next_offset / group->file_size,
+ start_lsn);
+ srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE;
+ srv_log_writes++;
+ }
+
+ if ((next_offset % group->file_size) + len > group->file_size) {
+
+ write_len = group->file_size
+ - (next_offset % group->file_size);
+ } else {
+ write_len = len;
+ }
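+	/* If the write would cross a log file boundary, only the part that
+	fits in the current file is written now; the loop is re-entered at
+	the bottom of this function with write_header set to TRUE, so that
+	the next file also gets its header flushed. */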
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+
+ fprintf(stderr,
+ "Writing log file segment to group %lu"
+ " offset %lu len %lu\n"
+ "start lsn %llu\n"
+ "First block n:o %lu last block n:o %lu\n",
+ (ulong) group->id, (ulong) next_offset,
+ (ulong) write_len,
+ start_lsn,
+ (ulong) log_block_get_hdr_no(buf),
+ (ulong) log_block_get_hdr_no(
+ buf + write_len - OS_FILE_LOG_BLOCK_SIZE));
+ ut_a(log_block_get_hdr_no(buf)
+ == log_block_convert_lsn_to_no(start_lsn));
+
+ for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+
+ ut_a(log_block_get_hdr_no(buf) + i
+ == log_block_get_hdr_no(
+ buf + i * OS_FILE_LOG_BLOCK_SIZE));
+ }
+ }
+#endif /* UNIV_DEBUG */
+ /* Calculate the checksums for each log block and write them to
+ the trailer fields of the log blocks */
+
+ for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+ log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ if (log_do_write) {
+ log_sys->n_log_ios++;
+
+ srv_os_log_pending_writes++;
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
+ next_offset / UNIV_PAGE_SIZE,
+ next_offset % UNIV_PAGE_SIZE, write_len, buf, group);
+
+ srv_os_log_pending_writes--;
+
+ srv_os_log_written+= write_len;
+ srv_log_writes++;
+ }
+
+ if (write_len < len) {
+ start_lsn += write_len;
+ len -= write_len;
+ buf += write_len;
+
+ write_header = TRUE;
+
+ goto loop;
+ }
+}
+
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, it starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+ ib_uint64_t lsn, /*!< in: log sequence number up to which
+ the log should be written,
+ IB_ULONGLONG_MAX if not specified */
+ ulint wait, /*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+ ibool flush_to_disk)
+ /*!< in: TRUE if we want the written log
+ also to be flushed to disk */
+{
+ log_group_t* group;
+ ulint start_offset;
+ ulint end_offset;
+ ulint area_start;
+ ulint area_end;
+#ifdef UNIV_DEBUG
+ ulint loop_count = 0;
+#endif /* UNIV_DEBUG */
+ ulint unlock;
+
+ if (recv_no_ibuf_operations) {
+ /* Recovery is running and no operations on the log files are
+ allowed yet (the variable name .._no_ibuf_.. is misleading) */
+
+ return;
+ }
+
+loop:
+#ifdef UNIV_DEBUG
+ loop_count++;
+
+ ut_ad(loop_count < 5);
+
+# if 0
+ if (loop_count > 2) {
+ fprintf(stderr, "Log loop count %lu\n", loop_count);
+ }
+# endif
+#endif
+
+ mutex_enter(&(log_sys->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (flush_to_disk
+ && log_sys->flushed_to_disk_lsn >= lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ if (!flush_to_disk
+ && (log_sys->written_to_all_lsn >= lsn
+ || (log_sys->written_to_some_lsn >= lsn
+ && wait != LOG_WAIT_ALL_GROUPS))) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ if (log_sys->n_pending_writes > 0) {
+ /* A write (+ possibly flush to disk) is running */
+
+ if (flush_to_disk
+ && log_sys->current_flush_lsn >= lsn) {
+ /* The write + flush will write enough: wait for it to
+ complete */
+
+ goto do_waits;
+ }
+
+ if (!flush_to_disk
+ && log_sys->write_lsn >= lsn) {
+ /* The write will write enough: wait for it to
+ complete */
+
+ goto do_waits;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Wait for the write to complete and try to start a new
+ write */
+
+ os_event_wait(log_sys->no_flush_event);
+
+ goto loop;
+ }
+
+ if (!flush_to_disk
+ && log_sys->buf_free == log_sys->buf_next_to_write) {
+ /* Nothing to write and no flush to disk requested */
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Writing log from %llu up to lsn %llu\n",
+ log_sys->written_to_all_lsn,
+ log_sys->lsn);
+ }
+#endif /* UNIV_DEBUG */
+ log_sys->n_pending_writes++;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	group->n_pending_writes++;	/* We assume here that we have only
+					one log group! */
+
+ os_event_reset(log_sys->no_flush_event);
+ os_event_reset(log_sys->one_flushed_event);
+
+ start_offset = log_sys->buf_next_to_write;
+ end_offset = log_sys->buf_free;
+
+ area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
+ area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_ad(area_end - area_start > 0);
+
+ log_sys->write_lsn = log_sys->lsn;
+
+ if (flush_to_disk) {
+ log_sys->current_flush_lsn = log_sys->lsn;
+ }
+
+ log_sys->one_flushed = FALSE;
+
+ log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
+ log_block_set_checkpoint_no(
+ log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+ log_sys->next_checkpoint_no);
+
+ /* Copy the last, incompletely written, log block a log block length
+ up, so that when the flush operation writes from the log buffer, the
+ segment to write will not be changed by writers to the log */
+
+ ut_memcpy(log_sys->buf + area_end,
+ log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
+ log_sys->write_end_offset = log_sys->buf_free;
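+	/* buf_free was advanced past the copied block; once this write
+	completes, log_sys_check_flush_completion() may move the tail of
+	the buffer back to the start and rewind buf_free accordingly. */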
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ /* Do the write to the log files */
+
+ while (group) {
+ log_group_write_buf(
+ group, log_sys->buf + area_start,
+ area_end - area_start,
+ ut_uint64_align_down(log_sys->written_to_all_lsn,
+ OS_FILE_LOG_BLOCK_SIZE),
+ start_offset - area_start);
+
+ log_group_set_fields(group, log_sys->write_lsn);
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
+ || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+ /* O_DSYNC means the OS did not buffer the log file at all:
+ so we have also flushed to disk what we have written */
+
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+
+ } else if (flush_to_disk) {
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ fil_flush(group->space_id);
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ ut_a(group->n_pending_writes == 1);
+ ut_a(log_sys->n_pending_writes == 1);
+
+ group->n_pending_writes--;
+ log_sys->n_pending_writes--;
+
+ unlock = log_group_check_flush_completion(group);
+ unlock = unlock | log_sys_check_flush_completion();
+
+ log_flush_do_unlocks(unlock);
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+
+do_waits:
+ mutex_exit(&(log_sys->mutex));
+
+ switch (wait) {
+ case LOG_WAIT_ONE_GROUP:
+ os_event_wait(log_sys->one_flushed_event);
+ break;
+ case LOG_WAIT_ALL_GROUPS:
+ os_event_wait(log_sys->no_flush_event);
+ break;
+#ifdef UNIV_DEBUG
+ case LOG_NO_WAIT:
+ break;
+ default:
+ ut_error;
+#endif /* UNIV_DEBUG */
+ }
+}
+
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void)
+/*==========================*/
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+}
+
+/****************************************************************//**
+This function writes the log buffer to the log file and, if 'flush' is
+set, it also forces a flush of the log file. It is meant to be called
+from the background master thread only, as it does not wait for the
+write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+ ibool flush) /*!< in: flush the logs to disk */
+{
+ ib_uint64_t lsn;
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_write_up_to(lsn, LOG_NO_WAIT, flush);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log buffer, such
+that a new log entry can be catenated without an immediate need for a flush. */
+static
+void
+log_flush_margin(void)
+/*==================*/
+{
+ log_t* log = log_sys;
+ ib_uint64_t lsn = 0;
+
+ mutex_enter(&(log->mutex));
+
+ if (log->buf_free > log->max_buf_free) {
+
+ if (log->n_pending_writes > 0) {
+ /* A flush is running: hope that it will provide enough
+ free space */
+ } else {
+ lsn = log->lsn;
+ }
+ }
+
+ mutex_exit(&(log->mutex));
+
+ if (lsn) {
+ log_write_up_to(lsn, LOG_NO_WAIT, FALSE);
+ }
+}
+
+/****************************************************************//**
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool. NOTE: this function may only be called if the calling thread owns
+no synchronization objects!
+@return FALSE if there was a flush batch of the same type running,
+which means that we could not start this flush batch */
+UNIV_INTERN
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+ ib_uint64_t new_oldest, /*!< in: try to advance
+ oldest_modified_lsn at least
+ to this lsn */
+ ibool sync) /*!< in: TRUE if synchronous
+ operation is desired */
+{
+ ulint n_pages;
+
+ if (recv_recovery_on) {
+ /* If the recovery is running, we must first apply all
+ log records to their respective file pages to get the
+ right modify lsn values to these pages: otherwise, there
+ might be pages on disk which are not yet recovered to the
+ current lsn, and even after calling this function, we could
+ not know how up-to-date the disk version of the database is,
+ and we could not make a new checkpoint on the basis of the
+ info on the buffer pool only. */
+
+ recv_apply_hashed_log_recs(TRUE);
+ }
+
+ n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest);
+
+ if (sync) {
+ buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+ }
+
+ if (n_pages == ULINT_UNDEFINED) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/******************************************************//**
+Completes a checkpoint. */
+static
+void
+log_complete_checkpoint(void)
+/*=========================*/
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(log_sys->n_pending_checkpoint_writes == 0);
+
+ log_sys->next_checkpoint_no++;
+
+ log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
+
+ rw_lock_x_unlock_gen(&(log_sys->checkpoint_lock), LOG_CHECKPOINT);
+}
+
+/******************************************************//**
+Completes an asynchronous checkpoint info write i/o to a log file. */
+static
+void
+log_io_complete_checkpoint(void)
+/*============================*/
+{
+ mutex_enter(&(log_sys->mutex));
+
+ ut_ad(log_sys->n_pending_checkpoint_writes > 0);
+
+ log_sys->n_pending_checkpoint_writes--;
+
+ if (log_sys->n_pending_checkpoint_writes == 0) {
+ log_complete_checkpoint();
+ }
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/*******************************************************************//**
+Writes info to a checkpoint about a log group. */
+static
+void
+log_checkpoint_set_nth_group_info(
+/*==============================*/
+ byte* buf, /*!< in: buffer for checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint file_no,/*!< in: archived file number */
+ ulint offset) /*!< in: archived file offset */
+{
+ ut_ad(n < LOG_MAX_N_GROUPS);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO, file_no);
+ mach_write_to_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET, offset);
+}
+
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+ const byte* buf, /*!< in: buffer containing checkpoint info */
+ ulint n, /*!< in: nth slot */
+ ulint* file_no,/*!< out: archived file number */
+ ulint* offset) /*!< out: archived file offset */
+{
+ ut_ad(n < LOG_MAX_N_GROUPS);
+
+ *file_no = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_FILE_NO);
+ *offset = mach_read_from_4(buf + LOG_CHECKPOINT_GROUP_ARRAY
+ + 8 * n + LOG_CHECKPOINT_ARCHIVED_OFFSET);
+}
+
+/******************************************************//**
+Writes the checkpoint info to a log group header. */
+static
+void
+log_group_checkpoint(
+/*=================*/
+ log_group_t* group) /*!< in: log group */
+{
+ log_group_t* group2;
+#ifdef UNIV_LOG_ARCHIVE
+ ib_uint64_t archived_lsn;
+ ib_uint64_t next_archived_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+ ulint write_offset;
+ ulint fold;
+ byte* buf;
+ ulint i;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+#if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
+# error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
+#endif
+
+ buf = group->checkpoint_buf;
+
+ mach_write_ull(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
+ mach_write_ull(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
+ log_group_calc_lsn_offset(
+ log_sys->next_checkpoint_lsn, group));
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
+
+#ifdef UNIV_LOG_ARCHIVE
+#error "UNIV_LOG_ARCHIVE could not be enabled"
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+ archived_lsn = IB_ULONGLONG_MAX;
+ } else {
+ archived_lsn = log_sys->archived_lsn;
+
+ if (archived_lsn != log_sys->next_archived_lsn) {
+ next_archived_lsn = log_sys->next_archived_lsn;
+ /* For debugging only */
+ }
+ }
+
+ mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn);
+#else /* UNIV_LOG_ARCHIVE */
+ mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN,
+ (ib_uint64_t)log_group_calc_lsn_offset(
+ log_sys->next_checkpoint_lsn, group));
+#endif /* UNIV_LOG_ARCHIVE */
+
+ for (i = 0; i < LOG_MAX_N_GROUPS; i++) {
+ log_checkpoint_set_nth_group_info(buf, i, 0, 0);
+ }
+
+ group2 = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group2) {
+ log_checkpoint_set_nth_group_info(buf, group2->id,
+#ifdef UNIV_LOG_ARCHIVE
+ group2->archived_file_no,
+ group2->archived_offset
+#else /* UNIV_LOG_ARCHIVE */
+ 0, 0
+#endif /* UNIV_LOG_ARCHIVE */
+ );
+
+ group2 = UT_LIST_GET_NEXT(log_groups, group2);
+ }
+
+ fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
+
+ fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+ LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
+
+ /* Starting from InnoDB-3.23.50, we also write info on allocated
+ size in the tablespace */
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_FSP_FREE_LIMIT,
+ log_fsp_current_free_limit);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N,
+ LOG_CHECKPOINT_FSP_MAGIC_N_VAL);
+
+ /* We alternate the physical place of the checkpoint info in the first
+ log file */
+
+ if ((log_sys->next_checkpoint_no & 1) == 0) {
+ write_offset = LOG_CHECKPOINT_1;
+ } else {
+ write_offset = LOG_CHECKPOINT_2;
+ }
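+	/* Alternating between the two checkpoint slots means that an
+	interrupted checkpoint write can never destroy the only valid
+	checkpoint: recovery can still pick the valid slot with the larger
+	checkpoint number. */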
+
+ if (log_do_write) {
+ if (log_sys->n_pending_checkpoint_writes == 0) {
+
+ rw_lock_x_lock_gen(&(log_sys->checkpoint_lock),
+ LOG_CHECKPOINT);
+ }
+
+ log_sys->n_pending_checkpoint_writes++;
+
+ log_sys->n_log_ios++;
+
+		/* We send as the last parameter the group machine address
+		with 1 added to it, as we want to distinguish between a
+		normal log file write and a checkpoint field write */
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->space_id, 0,
+ write_offset / UNIV_PAGE_SIZE,
+ write_offset % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE,
+ buf, ((byte*)group + 1));
+
+ ut_ad(((ulint)group & 0x1UL) == 0);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+ byte* hdr_buf,/*!< in: buffer which will be written to the
+ start of the first log file */
+ ib_uint64_t start) /*!< in: lsn of the start of the first log file;
+ we pretend that there is a checkpoint at
+ start + LOG_BLOCK_HDR_SIZE */
+{
+ ulint fold;
+ byte* buf;
+ ib_uint64_t lsn;
+
+ mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0);
+ mach_write_ull(hdr_buf + LOG_FILE_START_LSN, start);
+
+ lsn = start + LOG_BLOCK_HDR_SIZE;
+
+ /* Write the label of ibbackup --restore */
+ strcpy((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ "ibbackup ");
+ ut_sprintf_timestamp((char*) hdr_buf
+ + (LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+ + (sizeof "ibbackup ") - 1));
+ buf = hdr_buf + LOG_CHECKPOINT_1;
+
+ mach_write_ull(buf + LOG_CHECKPOINT_NO, 0);
+ mach_write_ull(buf + LOG_CHECKPOINT_LSN, lsn);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
+ LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
+
+ mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, 2 * 1024 * 1024);
+
+ mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN, IB_ULONGLONG_MAX);
+
+ fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
+
+ fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+ LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+ mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
+
+ /* Starting from InnoDB-3.23.50, we should also write info on
+ allocated size in the tablespace, but unfortunately we do not
+ know it here */
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+ log_group_t* group, /*!< in: log group */
+ ulint field) /*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ log_sys->n_log_ios++;
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id, 0,
+ field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
+}
+
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void)
+/*==================================*/
+{
+ log_group_t* group;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ log_group_checkpoint(group);
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+}
+
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks the lsn of the oldest
+modification in the pool, and writes information about that lsn to
+the log files. Use log_make_checkpoint_at() to also flush the pool.
+@return TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is
+ desired */
+ ibool write_always) /*!< in: the function normally checks if the
+				new checkpoint would have a greater
+ lsn than the previous one: if not, then no
+ physical write is done; by setting this
+ parameter TRUE, a physical write will always be
+ made to log files */
+{
+ ib_uint64_t oldest_lsn;
+
+ if (recv_recovery_is_on()) {
+ recv_apply_hashed_log_recs(TRUE);
+ }
+
+ if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+ ut_ad(!recv_no_log_write);
+ oldest_lsn = log_buf_pool_get_oldest_modification();
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Because log also contains headers and dummy log records,
+ if the buffer pool contains no dirty buffers, oldest_lsn
+ gets the value log_sys->lsn from the previous function,
+ and we must make sure that the log is flushed up to that
+ lsn. If there are dirty buffers in the buffer pool, then our
+ write-ahead-logging algorithm ensures that the log has been flushed
+ up to oldest_lsn. */
+
+ log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (!write_always
+ && log_sys->last_checkpoint_lsn >= oldest_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(TRUE);
+ }
+
+ ut_ad(log_sys->flushed_to_disk_lsn >= oldest_lsn);
+
+ if (log_sys->n_pending_checkpoint_writes > 0) {
+ /* A checkpoint write is running */
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ /* Wait for the checkpoint write to complete */
+ rw_lock_s_lock(&(log_sys->checkpoint_lock));
+ rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+ }
+
+ return(FALSE);
+ }
+
+ log_sys->next_checkpoint_lsn = oldest_lsn;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr, "Making checkpoint no %lu at lsn %llu\n",
+ (ulong) log_sys->next_checkpoint_no,
+ oldest_lsn);
+ }
+#endif /* UNIV_DEBUG */
+
+ log_groups_write_checkpoint_info();
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ /* Wait for the checkpoint write to complete */
+ rw_lock_s_lock(&(log_sys->checkpoint_lock));
+ rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+ }
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+ ib_uint64_t lsn, /*!< in: make a checkpoint at this or a
+ later lsn, if IB_ULONGLONG_MAX, makes
+ a checkpoint at the latest lsn */
+ ibool write_always) /*!< in: the function normally checks if
+ the new checkpoint would have a
+ greater lsn than the previous one: if
+ not, then no physical write is done;
+ by setting this parameter TRUE, a
+ physical write will always be made to
+ log files */
+{
+ /* Preflush pages synchronously */
+
+ while (!log_preflush_pool_modified_pages(lsn, TRUE));
+
+ while (!log_checkpoint(TRUE, write_always));
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint. NOTE: this function may only be called if the calling thread
+owns no synchronization objects! */
+static
+void
+log_checkpoint_margin(void)
+/*=======================*/
+{
+ log_t* log = log_sys;
+ ib_uint64_t age;
+ ib_uint64_t checkpoint_age;
+ ib_uint64_t advance;
+ ib_uint64_t oldest_lsn;
+ ibool sync;
+ ibool checkpoint_sync;
+ ibool do_checkpoint;
+ ibool success;
+loop:
+ sync = FALSE;
+ checkpoint_sync = FALSE;
+ do_checkpoint = FALSE;
+
+ mutex_enter(&(log->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (log->check_flush_or_checkpoint == FALSE) {
+ mutex_exit(&(log->mutex));
+
+ return;
+ }
+
+ oldest_lsn = log_buf_pool_get_oldest_modification();
+
+ age = log->lsn - oldest_lsn;
+
+ if (age > log->max_modified_age_sync) {
+
+ /* A flush is urgent: we have to do a synchronous preflush */
+
+ sync = TRUE;
+ advance = 2 * (age - log->max_modified_age_sync);
+ } else if (age > log_max_modified_age_async()) {
+
+ /* A flush is not urgent: we do an asynchronous preflush */
+ advance = age - log_max_modified_age_async();
+ } else {
+ advance = 0;
+ }
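+	/* advance is the amount by which the oldest modification lsn should
+	be pushed forward by preflushing dirty pages; zero means that no
+	preflush is needed at this point. */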
+
+ checkpoint_age = log->lsn - log->last_checkpoint_lsn;
+
+ if (checkpoint_age > log->max_checkpoint_age) {
+ /* A checkpoint is urgent: we do it synchronously */
+
+ checkpoint_sync = TRUE;
+
+ do_checkpoint = TRUE;
+
+ } else if (checkpoint_age > log_max_checkpoint_age_async()) {
+ /* A checkpoint is not urgent: do it asynchronously */
+
+ do_checkpoint = TRUE;
+
+ log->check_flush_or_checkpoint = FALSE;
+ } else {
+ log->check_flush_or_checkpoint = FALSE;
+ }
+
+ mutex_exit(&(log->mutex));
+
+ if (advance) {
+ ib_uint64_t new_oldest = oldest_lsn + advance;
+
+ success = log_preflush_pool_modified_pages(new_oldest, sync);
+
+ /* If the flush succeeded, this thread has done its part
+ and can proceed. If it did not succeed, there was another
+ thread doing a flush at the same time. If sync was FALSE,
+ the flush was not urgent, and we let this thread proceed.
+ Otherwise, we let it start from the beginning again. */
+
+ if (sync && !success) {
+ mutex_enter(&(log->mutex));
+
+ log->check_flush_or_checkpoint = TRUE;
+
+ mutex_exit(&(log->mutex));
+ goto loop;
+ }
+ }
+
+ if (do_checkpoint) {
+ log_checkpoint(checkpoint_sync, FALSE);
+
+ if (checkpoint_sync) {
+
+ goto loop;
+ }
+ }
+}
+
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+ ulint type, /*!< in: LOG_ARCHIVE or LOG_RECOVER */
+ byte* buf, /*!< in: buffer where to read */
+ log_group_t* group, /*!< in: log group */
+ ib_uint64_t start_lsn, /*!< in: read area start */
+ ib_uint64_t end_lsn) /*!< in: read area end */
+{
+ ulint len;
+ ulint source_offset;
+ ibool sync;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ sync = (type == LOG_RECOVER);
+loop:
+ source_offset = log_group_calc_lsn_offset(start_lsn, group);
+
+ len = (ulint) (end_lsn - start_lsn);
+
+ ut_ad(len != 0);
+
+ if ((source_offset % group->file_size) + len > group->file_size) {
+
+ len = group->file_size - (source_offset % group->file_size);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (type == LOG_ARCHIVE) {
+
+ log_sys->n_pending_archive_ios++;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ log_sys->n_log_ios++;
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
+ source_offset / UNIV_PAGE_SIZE, source_offset % UNIV_PAGE_SIZE,
+ len, buf, NULL);
+
+ start_lsn += len;
+ buf += len;
+
+ if (start_lsn != end_lsn) {
+
+ goto loop;
+ }
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+ char* buf, /*!< in: buffer where to write */
+ ulint id __attribute__((unused)),
+ /*!< in: group id;
+ currently we only archive the first group */
+ ulint file_no)/*!< in: file number */
+{
+ sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, (ulong) file_no);
+}
+
+/******************************************************//**
+Writes a log file header to a log file space. */
+static
+void
+log_group_archive_file_header_write(
+/*================================*/
+ log_group_t* group, /*!< in: log group */
+ ulint nth_file, /*!< in: header to the nth file in the
+ archive log file space */
+ ulint file_no, /*!< in: archived file number */
+ ib_uint64_t start_lsn) /*!< in: log file data starts at this
+ lsn */
+{
+ byte* buf;
+ ulint dest_offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ ut_a(nth_file < group->n_files);
+
+ buf = *(group->archive_file_header_bufs + nth_file);
+
+ mach_write_to_4(buf + LOG_GROUP_ID, group->id);
+ mach_write_ull(buf + LOG_FILE_START_LSN, start_lsn);
+ mach_write_to_4(buf + LOG_FILE_NO, file_no);
+
+ mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, FALSE);
+
+ dest_offset = nth_file * group->file_size;
+
+ log_sys->n_log_ios++;
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
+ dest_offset / UNIV_PAGE_SIZE,
+ dest_offset % UNIV_PAGE_SIZE,
+ 2 * OS_FILE_LOG_BLOCK_SIZE,
+ buf, &log_archive_io);
+}
+
+/******************************************************//**
+Writes a log file header to a completed archived log file. */
+static
+void
+log_group_archive_completed_header_write(
+/*=====================================*/
+ log_group_t* group, /*!< in: log group */
+ ulint nth_file, /*!< in: header to the nth file in the
+ archive log file space */
+ ib_uint64_t end_lsn) /*!< in: end lsn of the file */
+{
+ byte* buf;
+ ulint dest_offset;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_a(nth_file < group->n_files);
+
+ buf = *(group->archive_file_header_bufs + nth_file);
+
+ mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, TRUE);
+ mach_write_ull(buf + LOG_FILE_END_LSN, end_lsn);
+
+ dest_offset = nth_file * group->file_size + LOG_FILE_ARCH_COMPLETED;
+
+ log_sys->n_log_ios++;
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id,
+ dest_offset / UNIV_PAGE_SIZE,
+ dest_offset % UNIV_PAGE_SIZE,
+ OS_FILE_LOG_BLOCK_SIZE,
+ buf + LOG_FILE_ARCH_COMPLETED,
+ &log_archive_io);
+}
+
+/******************************************************//**
+Does the archive writes for a single log group. */
+static
+void
+log_group_archive(
+/*==============*/
+ log_group_t* group) /*!< in: log group */
+{
+ os_file_t file_handle;
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ char name[1024];
+ byte* buf;
+ ulint len;
+ ibool ret;
+ ulint next_offset;
+ ulint n_files;
+ ulint open_mode;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ start_lsn = log_sys->archived_lsn;
+
+ ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ end_lsn = log_sys->next_archived_lsn;
+
+ ut_a(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+ buf = log_sys->archive_buf;
+
+ n_files = 0;
+
+ next_offset = group->archived_offset;
+loop:
+ if ((next_offset % group->file_size == 0)
+ || (fil_space_get_size(group->archive_space_id) == 0)) {
+
+ /* Add the file to the archive file space; create or open the
+ file */
+
+ if (next_offset % group->file_size == 0) {
+ open_mode = OS_FILE_CREATE;
+ } else {
+ open_mode = OS_FILE_OPEN;
+ }
+
+ log_archived_file_name_gen(name, group->id,
+ group->archived_file_no + n_files);
+
+ file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
+ OS_DATA_FILE, &ret);
+
+ if (!ret && (open_mode == OS_FILE_CREATE)) {
+ file_handle = os_file_create(
+ name, OS_FILE_OPEN, OS_FILE_AIO,
+ OS_DATA_FILE, &ret);
+ }
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Cannot create or open"
+ " archive log file %s.\n"
+ "InnoDB: Cannot continue operation.\n"
+ "InnoDB: Check that the log archive"
+ " directory exists,\n"
+ "InnoDB: you have access rights to it, and\n"
+ "InnoDB: there is space available.\n", name);
+ exit(1);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr, "Created archive file %s\n", name);
+ }
+#endif /* UNIV_DEBUG */
+
+ ret = os_file_close(file_handle);
+
+ ut_a(ret);
+
+ /* Add the archive file as a node to the space */
+
+ fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
+ group->archive_space_id, FALSE);
+
+ if (next_offset % group->file_size == 0) {
+ log_group_archive_file_header_write(
+ group, n_files,
+ group->archived_file_no + n_files,
+ start_lsn);
+
+ next_offset += LOG_FILE_HDR_SIZE;
+ }
+ }
+
+ len = end_lsn - start_lsn;
+
+ if (group->file_size < (next_offset % group->file_size) + len) {
+
+ len = group->file_size - (next_offset % group->file_size);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Archiving starting at lsn %llu, len %lu"
+ " to group %lu\n",
+ start_lsn,
+ (ulong) len, (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ log_sys->n_pending_archive_ios++;
+
+ log_sys->n_log_ios++;
+
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->archive_space_id,
+ next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE,
+ ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
+ &log_archive_io);
+
+ start_lsn += len;
+ next_offset += len;
+ buf += len;
+
+ if (next_offset % group->file_size == 0) {
+ n_files++;
+ }
+
+ if (end_lsn != start_lsn) {
+
+ goto loop;
+ }
+
+ group->next_archived_file_no = group->archived_file_no + n_files;
+ group->next_archived_offset = next_offset % group->file_size;
+
+ ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+}
+
+/*****************************************************//**
+Writes to the archive of each log group. Currently, only the first
+group is archived. */
+static
+void
+log_archive_groups(void)
+/*====================*/
+{
+ log_group_t* group;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ log_group_archive(group);
+}
+
+/*****************************************************//**
+Completes the archiving write phase for each log group; currently, only
+the first log group is archived. */
+static
+void
+log_archive_write_complete_groups(void)
+/*===================================*/
+{
+ log_group_t* group;
+ ulint end_offset;
+ ulint trunc_files;
+ ulint n_files;
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ ulint i;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ group->archived_file_no = group->next_archived_file_no;
+ group->archived_offset = group->next_archived_offset;
+
+ /* Truncate from the archive file space all but the last
+ file, or if it has been written full, all files */
+
+ n_files = (UNIV_PAGE_SIZE
+ * fil_space_get_size(group->archive_space_id))
+ / group->file_size;
+ ut_ad(n_files > 0);
+
+ end_offset = group->archived_offset;
+
+ if (end_offset % group->file_size == 0) {
+
+ trunc_files = n_files;
+ } else {
+ trunc_files = n_files - 1;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes && trunc_files) {
+ fprintf(stderr,
+ "Complete file(s) archived to group %lu\n",
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Calculate the archive file space start lsn */
+ start_lsn = log_sys->next_archived_lsn
+ - (end_offset - LOG_FILE_HDR_SIZE + trunc_files
+ * (group->file_size - LOG_FILE_HDR_SIZE));
+ end_lsn = start_lsn;
+
+ for (i = 0; i < trunc_files; i++) {
+
+ end_lsn += group->file_size - LOG_FILE_HDR_SIZE;
+
+ /* Write a notice to the headers of archived log
+ files that the file write has been completed */
+
+ log_group_archive_completed_header_write(group, i, end_lsn);
+ }
+
+ fil_space_truncate_start(group->archive_space_id,
+ trunc_files * group->file_size);
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fputs("Archiving writes completed\n", stderr);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Completes an archiving i/o. */
+static
+void
+log_archive_check_completion_low(void)
+/*==================================*/
+{
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (log_sys->n_pending_archive_ios == 0
+ && log_sys->archiving_phase == LOG_ARCHIVE_READ) {
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fputs("Archiving read completed\n", stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Archive buffer has now been read in: start archive writes */
+
+ log_sys->archiving_phase = LOG_ARCHIVE_WRITE;
+
+ log_archive_groups();
+ }
+
+ if (log_sys->n_pending_archive_ios == 0
+ && log_sys->archiving_phase == LOG_ARCHIVE_WRITE) {
+
+ log_archive_write_complete_groups();
+
+ log_sys->archived_lsn = log_sys->next_archived_lsn;
+
+ rw_lock_x_unlock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
+ }
+}
+
+/******************************************************//**
+Completes an archiving i/o. */
+static
+void
+log_io_complete_archive(void)
+/*=========================*/
+{
+ log_group_t* group;
+
+ mutex_enter(&(log_sys->mutex));
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ mutex_exit(&(log_sys->mutex));
+
+ fil_flush(group->archive_space_id);
+
+ mutex_enter(&(log_sys->mutex));
+
+ ut_ad(log_sys->n_pending_archive_ios > 0);
+
+ log_sys->n_pending_archive_ios--;
+
+ log_archive_check_completion_low();
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/********************************************************************//**
+Starts an archiving operation.
+@return TRUE if succeeded, FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+ ibool sync, /*!< in: TRUE if synchronous operation is desired */
+ ulint* n_bytes)/*!< out: archive log buffer size, 0 if nothing to
+ archive */
+{
+ ibool calc_new_limit;
+ ib_uint64_t start_lsn;
+ ib_uint64_t limit_lsn;
+
+ calc_new_limit = TRUE;
+loop:
+ mutex_enter(&(log_sys->mutex));
+
+ switch (log_sys->archiving_state) {
+ case LOG_ARCH_OFF:
+arch_none:
+ mutex_exit(&(log_sys->mutex));
+
+ *n_bytes = 0;
+
+ return(TRUE);
+ case LOG_ARCH_STOPPED:
+ case LOG_ARCH_STOPPING2:
+ mutex_exit(&(log_sys->mutex));
+
+ os_event_wait(log_sys->archiving_on);
+
+ goto loop;
+ }
+
+ start_lsn = log_sys->archived_lsn;
+
+ if (calc_new_limit) {
+ ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
+ limit_lsn = start_lsn + log_sys->archive_buf_size;
+
+ *n_bytes = log_sys->archive_buf_size;
+
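+		/* Never archive beyond the current end of the log: align
+		the limit down to a complete log block */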
+ if (limit_lsn >= log_sys->lsn) {
+
+ limit_lsn = ut_uint64_align_down(
+ log_sys->lsn, OS_FILE_LOG_BLOCK_SIZE);
+ }
+ }
+
+ if (log_sys->archived_lsn >= limit_lsn) {
+
+ goto arch_none;
+ }
+
+ if (log_sys->written_to_all_lsn < limit_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+ calc_new_limit = FALSE;
+
+ goto loop;
+ }
+
+ if (log_sys->n_pending_archive_ios > 0) {
+ /* An archiving operation is running */
+
+ mutex_exit(&(log_sys->mutex));
+
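+		/* The archive_lock is held in x-mode for the duration of an
+		archiving operation, so taking and releasing the s-lock
+		waits until the running operation completes */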
+ if (sync) {
+ rw_lock_s_lock(&(log_sys->archive_lock));
+ rw_lock_s_unlock(&(log_sys->archive_lock));
+ }
+
+ *n_bytes = log_sys->archive_buf_size;
+
+ return(FALSE);
+ }
+
+ rw_lock_x_lock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
+
+ log_sys->archiving_phase = LOG_ARCHIVE_READ;
+
+ log_sys->next_archived_lsn = limit_lsn;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Archiving from lsn %llu to lsn %llu\n",
+ log_sys->archived_lsn, limit_lsn);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Read the log segment to the archive buffer */
+
+ log_group_read_log_seg(LOG_ARCHIVE, log_sys->archive_buf,
+ UT_LIST_GET_FIRST(log_sys->log_groups),
+ start_lsn, limit_lsn);
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (sync) {
+ rw_lock_s_lock(&(log_sys->archive_lock));
+ rw_lock_s_unlock(&(log_sys->archive_lock));
+ }
+
+ *n_bytes = log_sys->archive_buf_size;
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Writes the log contents to the archive at least up to the lsn when this
+function was called. */
+static
+void
+log_archive_all(void)
+/*=================*/
+{
+ ib_uint64_t present_lsn;
+ ulint dummy;
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ present_lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_pad_current_log_block();
+
+ for (;;) {
+ mutex_enter(&(log_sys->mutex));
+
+ if (present_lsn <= log_sys->archived_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_archive_do(TRUE, &dummy);
+ }
+}
+
+/*****************************************************//**
+Closes the possibly open archive log file of each log group (currently, only
+the first group), and if it was open, increments the group file count by 2,
+if desired. */
+static
+void
+log_archive_close_groups(
+/*=====================*/
+ ibool increment_file_count) /*!< in: TRUE if we want to increment
+ the file count */
+{
+ log_group_t* group;
+ ulint trunc_len;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ return;
+ }
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ trunc_len = UNIV_PAGE_SIZE
+ * fil_space_get_size(group->archive_space_id);
+ if (trunc_len > 0) {
+ ut_a(trunc_len == group->file_size);
+
+ /* Write a notice to the headers of archived log
+ files that the file write has been completed */
+
+ log_group_archive_completed_header_write(
+ group, 0, log_sys->archived_lsn);
+
+ fil_space_truncate_start(group->archive_space_id,
+ trunc_len);
+ if (increment_file_count) {
+ group->archived_offset = 0;
+ group->archived_file_no += 2;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "Incrementing arch file no to %lu"
+ " in log group %lu\n",
+ (ulong) group->archived_file_no + 2,
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+ }
+}
+
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from 2 higher, so that the archiving will not write
+again to the archived log files which exist when this function returns.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void)
+/*==================*/
+{
+ ibool success;
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state != LOG_ARCH_ON) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_ERROR);
+ }
+
+ log_sys->archiving_state = LOG_ARCH_STOPPING;
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_archive_all();
+
+ mutex_enter(&(log_sys->mutex));
+
+ log_sys->archiving_state = LOG_ARCH_STOPPING2;
+ os_event_reset(log_sys->archiving_on);
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Wait for a possible archiving operation to end */
+
+ rw_lock_s_lock(&(log_sys->archive_lock));
+ rw_lock_s_unlock(&(log_sys->archive_lock));
+
+ mutex_enter(&(log_sys->mutex));
+
+ /* Close all archived log files, incrementing the file count by 2,
+ if appropriate */
+
+ log_archive_close_groups(TRUE);
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Make a checkpoint, so that if recovery is needed, the file numbers
+ of new archived log files will start from the right value */
+
+ success = FALSE;
+
+ while (!success) {
+ success = log_checkpoint(TRUE, TRUE);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+ log_sys->archiving_state = LOG_ARCH_STOPPED;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Starts again archiving which has been stopped.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void)
+/*===================*/
+{
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state != LOG_ARCH_STOPPED) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_ERROR);
+ }
+
+ log_sys->archiving_state = LOG_ARCH_ON;
+
+ os_event_set(log_sys->archiving_on);
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void)
+/*==========================*/
+{
+loop:
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state == LOG_ARCH_STOPPED
+ || log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ log_sys->archiving_state = LOG_ARCH_OFF;
+
+ os_event_set(log_sys->archiving_on);
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ log_archive_stop();
+
+ os_thread_sleep(500000);
+
+ goto loop;
+}
+
+/****************************************************************//**
+Start archiving the log so that a gap may occur in the archived log files.
+@return DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void)
+/*========================*/
+{
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+ log_sys->archiving_state = LOG_ARCH_ON;
+
+ log_sys->archived_lsn
+ = ut_uint64_align_down(log_sys->lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_ERROR);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for
+archiving. */
+static
+void
+log_archive_margin(void)
+/*====================*/
+{
+ log_t* log = log_sys;
+ ulint age;
+ ibool sync;
+ ulint dummy;
+loop:
+ mutex_enter(&(log->mutex));
+
+ if (log->archiving_state == LOG_ARCH_OFF) {
+ mutex_exit(&(log->mutex));
+
+ return;
+ }
+
+ age = log->lsn - log->archived_lsn;
+
+ if (age > log->max_archived_lsn_age) {
+
+ /* An archiving is urgent: we have to do synchronous i/o */
+
+ sync = TRUE;
+
+ } else if (age > log->max_archived_lsn_age_async) {
+
+ /* An archiving is not urgent: we do asynchronous i/o */
+
+ sync = FALSE;
+ } else {
+ /* No archiving required yet */
+
+ mutex_exit(&(log->mutex));
+
+ return;
+ }
+
+ mutex_exit(&(log->mutex));
+
+ log_archive_do(sync, &dummy);
+
+ if (sync == TRUE) {
+ /* Check again that enough was written to the archive */
+
+ goto loop;
+ }
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void)
+/*===================*/
+{
+loop:
+ log_flush_margin();
+
+ log_checkpoint_margin();
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_archive_margin();
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_enter(&(log_sys->mutex));
+ ut_ad(!recv_no_log_write);
+
+ if (log_sys->check_flush_or_checkpoint) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ goto loop;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to the first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all the log in the log files to the
+log archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void)
+/*=======================================*/
+{
+ ib_uint64_t lsn;
+ ulint arch_log_no;
+
+ if (srv_print_verbose_log) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Starting shutdown...\n");
+ }
+ /* Wait until the master thread and all other operations are idle: our
+ algorithm only works if the server is idle at shutdown */
+
+ srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
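+	/* Wake up a possibly waiting purge thread so that it can observe
+	the changed shutdown state */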
+ os_event_set(srv_purge_thread_event);
+loop:
+ os_thread_sleep(100000);
+
+ mutex_enter(&kernel_mutex);
+
+ /* We need the monitor threads to stop before we proceed with a
+ normal shutdown. In case of very fast shutdown, however, we can
+ proceed without waiting for monitor threads. */
+
+ if (srv_fast_shutdown < 2
+ && (srv_error_monitor_active
+ || srv_lock_timeout_active || srv_monitor_active)) {
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+
+	/* Check that there are no longer any transactions. We need this wait
+	even for the 'very fast' shutdown, because the InnoDB layer may have
+	committed or prepared transactions and we don't want to lose them. */
+
+ if (trx_n_mysql_transactions > 0
+ || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+
+ if (srv_fast_shutdown == 2) {
+ /* In this fastest shutdown we do not flush the buffer pool:
+ it is essentially a 'crash' of the InnoDB server. Make sure
+ that the log is all flushed to disk, so that we can recover
+ all committed transactions in a crash recovery. We must not
+ write the lsn stamps to the data files, since at a startup
+ InnoDB deduces from the stamps if the previous shutdown was
+ clean. */
+
+ log_buffer_flush_to_disk();
+
+ return; /* We SKIP ALL THE REST !! */
+ }
+
+ /* Check that the master thread is suspended */
+
+ if (srv_n_threads_active[SRV_MASTER] != 0) {
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+
+ /* Check that the purge threads ended */
+ if (srv_use_purge_thread
+ && (srv_n_threads_active[SRV_PURGE] != 0
+ || srv_n_threads_active[SRV_PURGE_WORKER] != 0)) {
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ mutex_enter(&(log_sys->mutex));
+
+ if (log_sys->n_pending_checkpoint_writes
+#ifdef UNIV_LOG_ARCHIVE
+ || log_sys->n_pending_archive_ios
+#endif /* UNIV_LOG_ARCHIVE */
+ || log_sys->n_pending_writes) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ goto loop;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (!buf_pool_check_no_pending_io()) {
+
+ goto loop;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_archive_all();
+#endif /* UNIV_LOG_ARCHIVE */
+
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ mutex_enter(&(log_sys->mutex));
+
+ lsn = log_sys->lsn;
+
+ if (lsn != log_sys->last_checkpoint_lsn
+#ifdef UNIV_LOG_ARCHIVE
+ || (srv_log_archive_on
+ && lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE)
+#endif /* UNIV_LOG_ARCHIVE */
+ ) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ goto loop;
+ }
+
+ arch_log_no = 0;
+
+#ifdef UNIV_LOG_ARCHIVE
+	arch_log_no
+		= UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no;
+
+ if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) {
+
+ arch_log_no--;
+ }
+
+ log_archive_close_groups(TRUE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_exit(&(log_sys->mutex));
+
+ mutex_enter(&kernel_mutex);
+ /* Check that the master thread has stayed suspended */
+ if (srv_n_threads_active[SRV_MASTER] != 0) {
+ fprintf(stderr,
+ "InnoDB: Warning: the master thread woke up"
+ " during shutdown\n");
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+ fil_flush_file_spaces(FIL_LOG);
+
+	/* The call fil_write_flushed_lsn_to_data_files() will bypass the
+	buffer pool: therefore it is essential that the buffer pool has been
+ completely flushed to disk! (We do not call fil_write... if the
+ 'very fast' shutdown is enabled.) */
+
+ if (!buf_all_freed()) {
+
+ goto loop;
+ }
+
+ srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
+ /* Make some checks that the server really is quiet */
+ ut_a(srv_n_threads_active[SRV_MASTER] == 0);
+ ut_a(buf_all_freed());
+ ut_a(lsn == log_sys->lsn);
+
+ if (lsn < srv_start_lsn) {
+ fprintf(stderr,
+ "InnoDB: Error: log sequence number"
+ " at shutdown %llu\n"
+ "InnoDB: is lower than at startup %llu!\n",
+ lsn, srv_start_lsn);
+ }
+
+ srv_shutdown_lsn = lsn;
+
+ fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+ fil_close_all_files();
+
+ /* Make some checks that the server really is quiet */
+ ut_a(srv_n_threads_active[SRV_MASTER] == 0);
+ ut_a(buf_all_freed());
+ ut_a(lsn == log_sys->lsn);
+}
+
+#ifdef UNIV_LOG_DEBUG
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+ const byte* buf, /*!< in: pointer to the start of
+ the log segment in the
+ log_sys->buf log buffer */
+ ulint len, /*!< in: segment length in bytes */
+ ib_uint64_t buf_start_lsn) /*!< in: buffer start lsn */
+{
+ ib_uint64_t contiguous_lsn;
+ ib_uint64_t scanned_lsn;
+ const byte* start;
+ const byte* end;
+ byte* buf1;
+ byte* scan_buf;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ if (len == 0) {
+
+ return(TRUE);
+ }
+
+ start = ut_align_down(buf, OS_FILE_LOG_BLOCK_SIZE);
+ end = ut_align(buf + len, OS_FILE_LOG_BLOCK_SIZE);
+
+ buf1 = mem_alloc((end - start) + OS_FILE_LOG_BLOCK_SIZE);
+ scan_buf = ut_align(buf1, OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_memcpy(scan_buf, start, end - start);
+
+ recv_scan_log_recs((buf_pool->curr_size
+ - recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
+ FALSE, scan_buf, end - start,
+ ut_uint64_align_down(buf_start_lsn,
+ OS_FILE_LOG_BLOCK_SIZE),
+ &contiguous_lsn, &scanned_lsn);
+
+ ut_a(scanned_lsn == buf_start_lsn + len);
+ ut_a(recv_sys->recovered_lsn == scanned_lsn);
+
+ mem_free(buf1);
+
+ return(TRUE);
+}
+#endif /* UNIV_LOG_DEBUG */
+
+/******************************************************//**
+Peeks the current lsn.
+@return TRUE on success, FALSE if the log system mutex could not be acquired */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+ ib_uint64_t* lsn) /*!< out: if returns TRUE, current lsn is here */
+{
+ if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
+ *lsn = log_sys->lsn;
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/******************************************************//**
+Prints info of the log. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+ FILE* file) /*!< in: file where to print */
+{
+ double time_elapsed;
+ time_t current_time;
+
+ mutex_enter(&(log_sys->mutex));
+
+ fprintf(file,
+ "Log sequence number %llu\n"
+ "Log flushed up to %llu\n"
+ "Last checkpoint at %llu\n",
+ log_sys->lsn,
+ log_sys->flushed_to_disk_lsn,
+ log_sys->last_checkpoint_lsn);
+
+ fprintf(file,
+ "Max checkpoint age %lu\n"
+ "Checkpoint age target %lu\n"
+ "Modified age %lu\n"
+ "Checkpoint age %lu\n",
+ (ulong) log_sys->max_checkpoint_age,
+ (ulong) log_max_checkpoint_age_async(),
+ (ulong) (log_sys->lsn -
+ log_buf_pool_get_oldest_modification()),
+ (ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
+
+ current_time = time(NULL);
+
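+	/* Add a small constant so that time_elapsed can never be zero */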
+ time_elapsed = 0.001 + difftime(current_time,
+ log_sys->last_printout_time);
+ fprintf(file,
+ "%lu pending log writes, %lu pending chkp writes\n"
+ "%lu log i/o's done, %.2f log i/o's/second\n",
+ (ulong) log_sys->n_pending_writes,
+ (ulong) log_sys->n_pending_checkpoint_writes,
+ (ulong) log_sys->n_log_ios,
+ ((log_sys->n_log_ios - log_sys->n_log_ios_old)
+ / time_elapsed));
+
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = current_time;
+
+ mutex_exit(&(log_sys->mutex));
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void)
+/*===================*/
+{
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = time(NULL);
+}
+
+/**********************************************************************
+Closes a log group. */
+static
+void
+log_group_close(
+/*===========*/
+ log_group_t* group) /* in,own: log group to close */
+{
+ ulint i;
+
+ for (i = 0; i < group->n_files; i++) {
+ mem_free(group->file_header_bufs_ptr[i]);
+#ifdef UNIV_LOG_ARCHIVE
+ mem_free(group->archive_file_header_bufs_ptr[i]);
+#endif /* UNIV_LOG_ARCHIVE */
+ }
+
+ mem_free(group->file_header_bufs_ptr);
+ mem_free(group->file_header_bufs);
+
+#ifdef UNIV_LOG_ARCHIVE
+ mem_free(group->archive_file_header_bufs_ptr);
+ mem_free(group->archive_file_header_bufs);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mem_free(group->checkpoint_buf_ptr);
+
+ mem_free(group);
+}
+
+/**********************************************************
+Shutdown the log system but do not release all the memory. */
+UNIV_INTERN
+void
+log_shutdown(void)
+/*==============*/
+{
+ log_group_t* group;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
+ log_group_t* prev_group = group;
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ UT_LIST_REMOVE(log_groups, log_sys->log_groups, prev_group);
+
+ log_group_close(prev_group);
+ }
+
+ mem_free(log_sys->buf_ptr);
+ log_sys->buf_ptr = NULL;
+ log_sys->buf = NULL;
+ mem_free(log_sys->checkpoint_buf_ptr);
+ log_sys->checkpoint_buf_ptr = NULL;
+ log_sys->checkpoint_buf = NULL;
+
+ os_event_free(log_sys->no_flush_event);
+ os_event_free(log_sys->one_flushed_event);
+
+ rw_lock_free(&log_sys->checkpoint_lock);
+
+ mutex_free(&log_sys->mutex);
+
+#ifdef UNIV_LOG_ARCHIVE
+ rw_lock_free(&log_sys->archive_lock);
+	os_event_free(log_sys->archiving_on);
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifdef UNIV_LOG_DEBUG
+ recv_sys_debug_free();
+#endif
+
+ recv_sys_close();
+}
+
+/**********************************************************
+Free the log system data structures. */
+UNIV_INTERN
+void
+log_mem_free(void)
+/*==============*/
+{
+ if (log_sys != NULL) {
+ recv_sys_mem_free();
+ mem_free(log_sys);
+
+ log_sys = NULL;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c
new file mode 100644
index 00000000000..200b3b088a7
--- /dev/null
+++ b/storage/xtradb/log/log0recv.c
@@ -0,0 +1,3955 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0recv.c
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "log0recv.h"
+
+#ifdef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0rea.h"
+# include "srv0srv.h"
+# include "srv0start.h"
+# include "trx0roll.h"
+# include "row0merge.h"
+# include "sync0sync.h"
+#else /* !UNIV_HOTBACKUP */
+
+/** This is set to FALSE if the backup was originally taken with the
+ibbackup --include regexp option: then we do not want to create tables in
+directories which were not included */
+UNIV_INTERN ibool recv_replay_file_ops = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log records are stored in the hash table in chunks at most of this size;
+this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */
+#define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t))
+
+/** Read-ahead area in applying log records to file pages */
+#define RECV_READ_AHEAD_AREA 32
+
+/** The recovery system */
+UNIV_INTERN recv_sys_t* recv_sys = NULL;
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise. Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+UNIV_INTERN ibool recv_recovery_on;
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+UNIV_INTERN ibool recv_recovery_from_backup_on;
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_HOTBACKUP
+/** TRUE when recv_init_crash_recovery() has been called. */
+UNIV_INTERN ibool recv_needed_recovery;
+# ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys->mutex. */
+UNIV_INTERN ibool recv_no_log_write = FALSE;
+# endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+UNIV_INTERN ibool recv_lsn_checks_on;
+
+/** There are two conditions under which we scan the logs: the first
+is normal startup and the second is when we do a recovery from an
+archive.
+This flag is set if we are doing a scan from the last checkpoint during
+startup. If we find log entries that were written after the last checkpoint
+we know that the server was not cleanly shut down. We must then initialize
+the crash recovery environment before attempting to store these entries in
+the log hash table. */
+static ibool recv_log_scan_is_startup_type;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+UNIV_INTERN ibool recv_no_ibuf_operations;
+/** TRUE when the redo log is being backed up */
+# define recv_is_making_a_backup FALSE
+/** TRUE when recovering from a backed up redo log file */
+# define recv_is_from_backup FALSE
+#else /* !UNIV_HOTBACKUP */
+# define recv_needed_recovery FALSE
+/** TRUE when the redo log is being backed up */
+UNIV_INTERN ibool recv_is_making_a_backup = FALSE;
+/** TRUE when recovering from a backed up redo log file */
+UNIV_INTERN ibool recv_is_from_backup = FALSE;
+# define buf_pool_get_curr_size() (5 * 1024 * 1024)
+#endif /* !UNIV_HOTBACKUP */
+/** The following counter is used to decide when to print info on
+log scan */
+static ulint recv_scan_print_counter;
+
+/** The type of the previous parsed redo log record */
+static ulint recv_previous_parsed_rec_type;
+/** The offset of the previous parsed redo log record */
+static ulint recv_previous_parsed_rec_offset;
+/** The 'multi' flag of the previous parsed redo log record */
+static ulint recv_previous_parsed_rec_is_multi;
+
+/** Maximum page number encountered in the redo log */
+UNIV_INTERN ulint recv_max_parsed_page_no;
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database.
+This is the default value. If the actual size of the buffer pool is
+larger than 10 MB we'll set this value to 512. */
+UNIV_INTERN ulint recv_n_pool_free_frames;
+
+/** The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt. */
+UNIV_INTERN ib_uint64_t recv_max_page_lsn;
+
+/* prototypes */
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************//**
+Initialize crash recovery environment. Can be called iff
+recv_needed_recovery == FALSE. */
+static
+void
+recv_init_crash_recovery(void);
+/*===========================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Creates the recovery system. */
+UNIV_INTERN
+void
+recv_sys_create(void)
+/*=================*/
+{
+ if (recv_sys != NULL) {
+
+ return;
+ }
+
+ recv_sys = mem_alloc(sizeof(*recv_sys));
+ memset(recv_sys, 0x0, sizeof(*recv_sys));
+
+ mutex_create(&recv_sys->mutex, SYNC_RECV);
+
+ recv_sys->heap = NULL;
+ recv_sys->addr_hash = NULL;
+
+ recv_sys->stats_recv_start_time = time(NULL);
+ recv_sys->stats_oldest_modified_lsn = IB_ULONGLONG_MAX;
+}
+
+/********************************************************//**
+Release recovery system mutexes. */
+UNIV_INTERN
+void
+recv_sys_close(void)
+/*================*/
+{
+ if (recv_sys != NULL) {
+ if (recv_sys->addr_hash != NULL) {
+ hash_table_free(recv_sys->addr_hash);
+ }
+
+ if (recv_sys->heap != NULL) {
+ mem_heap_free(recv_sys->heap);
+ }
+
+ if (recv_sys->buf != NULL) {
+ ut_free(recv_sys->buf);
+ }
+
+ if (recv_sys->last_block_buf_start != NULL) {
+ mem_free(recv_sys->last_block_buf_start);
+ }
+
+ mutex_free(&recv_sys->mutex);
+
+ mem_free(recv_sys);
+ recv_sys = NULL;
+ }
+}
+
+/********************************************************//**
+Frees the recovery system memory. */
+UNIV_INTERN
+void
+recv_sys_mem_free(void)
+/*===================*/
+{
+ if (recv_sys != NULL) {
+ if (recv_sys->addr_hash != NULL) {
+ hash_table_free(recv_sys->addr_hash);
+ }
+
+ if (recv_sys->heap != NULL) {
+ mem_heap_free(recv_sys->heap);
+ }
+
+ if (recv_sys->buf != NULL) {
+ ut_free(recv_sys->buf);
+ }
+
+ if (recv_sys->last_block_buf_start != NULL) {
+ mem_free(recv_sys->last_block_buf_start);
+ }
+
+ mem_free(recv_sys);
+ recv_sys = NULL;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************
+Reset the state of the recovery system variables. */
+UNIV_INTERN
+void
+recv_sys_var_init(void)
+/*===================*/
+{
+ recv_lsn_checks_on = FALSE;
+
+ recv_n_pool_free_frames = 256;
+
+ recv_recovery_on = FALSE;
+
+#ifdef UNIV_LOG_ARCHIVE
+ recv_recovery_from_backup_on = FALSE;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ recv_needed_recovery = FALSE;
+
+ recv_lsn_checks_on = FALSE;
+
+ recv_log_scan_is_startup_type = FALSE;
+
+ recv_no_ibuf_operations = FALSE;
+
+ recv_scan_print_counter = 0;
+
+ recv_previous_parsed_rec_type = 999999;
+
+ recv_previous_parsed_rec_offset = 0;
+
+ recv_previous_parsed_rec_is_multi = 0;
+
+ recv_max_parsed_page_no = 0;
+
+ recv_n_pool_free_frames = 256;
+
+ recv_max_page_lsn = 0;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************
+Inits the recovery system for a recovery operation. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+ ulint available_memory) /*!< in: available memory in bytes */
+{
+ if (recv_sys->heap != NULL) {
+
+ return;
+ }
+
+ /* Initialize red-black tree for fast insertions into the
+ flush_list during recovery process.
+ As this initialization is done while holding the buffer pool
+ mutex we perform it before acquiring recv_sys->mutex. */
+#ifndef UNIV_HOTBACKUP
+ buf_flush_init_flush_rbt();
+
+ mutex_enter(&(recv_sys->mutex));
+
+ recv_sys->heap = mem_heap_create_in_buffer(256);
+#else /* !UNIV_HOTBACKUP */
+ recv_sys->heap = mem_heap_create(256);
+ recv_is_from_backup = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Set appropriate value of recv_n_pool_free_frames. */
+ if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
+ /* Buffer pool of size greater than 10 MB. */
+ recv_n_pool_free_frames = 512;
+ }
+
+ if (buf_pool_get_curr_size() >= (32 * 1024 * 1024)) {
+ /* Buffer pool of size greater than 32 MB. */
+ recv_n_pool_free_frames = 1024;
+ }
+
+ recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
+ recv_sys->len = 0;
+ recv_sys->recovered_offset = 0;
+
+ recv_sys->addr_hash = hash_create(available_memory / 512);
+ recv_sys->n_addrs = 0;
+
+ recv_sys->apply_log_recs = FALSE;
+ recv_sys->apply_batch_on = FALSE;
+
+ recv_sys->last_block_buf_start = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+
+ recv_sys->last_block = ut_align(recv_sys->last_block_buf_start,
+ OS_FILE_LOG_BLOCK_SIZE);
+ recv_sys->found_corrupt_log = FALSE;
+
+ recv_max_page_lsn = 0;
+
+ mutex_exit(&(recv_sys->mutex));
+}
+
+/********************************************************//**
+Empties the hash table when it has been fully processed. */
+static
+void
+recv_sys_empty_hash(void)
+/*=====================*/
+{
+ ut_ad(mutex_own(&(recv_sys->mutex)));
+
+ if (recv_sys->n_addrs != 0) {
+ fprintf(stderr,
+ "InnoDB: Error: %lu pages with log records"
+ " were left unprocessed!\n"
+ "InnoDB: Maximum page number with"
+ " log records on it %lu\n",
+ (ulong) recv_sys->n_addrs,
+ (ulong) recv_max_parsed_page_no);
+ ut_error;
+ }
+
+ hash_table_free(recv_sys->addr_hash);
+ mem_heap_empty(recv_sys->heap);
+
+ recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512);
+}
+
+#ifndef UNIV_HOTBACKUP
+# ifndef UNIV_LOG_DEBUG
+/********************************************************//**
+Frees the recovery system. */
+static
+void
+recv_sys_debug_free(void)
+/*=====================*/
+{
+ mutex_enter(&(recv_sys->mutex));
+
+ hash_table_free(recv_sys->addr_hash);
+ mem_heap_free(recv_sys->heap);
+ ut_free(recv_sys->buf);
+ mem_free(recv_sys->last_block_buf_start);
+
+ recv_sys->buf = NULL;
+ recv_sys->heap = NULL;
+ recv_sys->addr_hash = NULL;
+ recv_sys->last_block_buf_start = NULL;
+
+ mutex_exit(&(recv_sys->mutex));
+
+ /* Free up the flush_rbt. */
+ buf_flush_free_flush_rbt();
+}
+# endif /* UNIV_LOG_DEBUG */
+
+/********************************************************//**
+Truncates possibly corrupted or extra records from a log group. */
+static
+void
+recv_truncate_group(
+/*================*/
+ log_group_t* group, /*!< in: log group */
+ ib_uint64_t recovered_lsn, /*!< in: recovery succeeded up to this
+ lsn */
+ ib_uint64_t limit_lsn, /*!< in: this was the limit for
+ recovery */
+ ib_uint64_t checkpoint_lsn, /*!< in: recovery was started from this
+ checkpoint */
+ ib_uint64_t archived_lsn) /*!< in: the log has been archived up to
+ this lsn */
+{
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ ib_uint64_t finish_lsn1;
+ ib_uint64_t finish_lsn2;
+ ib_uint64_t finish_lsn;
+ ulint len;
+ ulint i;
+
+ if (archived_lsn == IB_ULONGLONG_MAX) {
+ /* Checkpoint was taken in the NOARCHIVELOG mode */
+ archived_lsn = checkpoint_lsn;
+ }
+
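+	/* Two candidate end points for the erase: finish_lsn1 is one log
+	group capacity past the aligned archived lsn; finish_lsn2 is the
+	aligned recovered lsn plus the size of the last log buffer */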
+ finish_lsn1 = ut_uint64_align_down(archived_lsn,
+ OS_FILE_LOG_BLOCK_SIZE)
+ + log_group_get_capacity(group);
+
+ finish_lsn2 = ut_uint64_align_up(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE)
+ + recv_sys->last_log_buf_size;
+
+ if (limit_lsn != IB_ULONGLONG_MAX) {
+ /* We do not know how far we should erase log records: erase
+ as much as possible */
+
+ finish_lsn = finish_lsn1;
+ } else {
+ /* It is enough to erase the length of the log buffer */
+ finish_lsn = finish_lsn1 < finish_lsn2
+ ? finish_lsn1 : finish_lsn2;
+ }
+
+ ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+ /* Write the log buffer full of zeros */
+ for (i = 0; i < RECV_SCAN_SIZE; i++) {
+
+ *(log_sys->buf + i) = '\0';
+ }
+
+ start_lsn = ut_uint64_align_down(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+
+ if (start_lsn != recovered_lsn) {
+ /* Copy the last incomplete log block to the log buffer and
+ edit its data length: */
+
+ ut_memcpy(log_sys->buf, recv_sys->last_block,
+ OS_FILE_LOG_BLOCK_SIZE);
+ log_block_set_data_len(log_sys->buf,
+ (ulint) (recovered_lsn - start_lsn));
+ }
+
+ if (start_lsn >= finish_lsn) {
+
+ return;
+ }
+
+ for (;;) {
+ end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+ if (end_lsn > finish_lsn) {
+
+ end_lsn = finish_lsn;
+ }
+
+ len = (ulint) (end_lsn - start_lsn);
+
+ log_group_write_buf(group, log_sys->buf, len, start_lsn, 0);
+ if (end_lsn >= finish_lsn) {
+
+ return;
+ }
+
+ /* Write the log buffer full of zeros */
+ for (i = 0; i < RECV_SCAN_SIZE; i++) {
+
+ *(log_sys->buf + i) = '\0';
+ }
+
+ start_lsn = end_lsn;
+ }
+}
+
+/********************************************************//**
+Copies the log segment between group->scanned_lsn and recovered_lsn from the
+most up-to-date log group to group, so that it contains the latest log data. */
+static
+void
+recv_copy_group(
+/*============*/
+ log_group_t* up_to_date_group, /*!< in: the most up-to-date log
+ group */
+ log_group_t* group, /*!< in: copy to this log
+ group */
+ ib_uint64_t recovered_lsn) /*!< in: recovery succeeded up
+ to this lsn */
+{
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ ulint len;
+
+ if (group->scanned_lsn >= recovered_lsn) {
+
+ return;
+ }
+
+ ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+ start_lsn = ut_uint64_align_down(group->scanned_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ for (;;) {
+ end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+ if (end_lsn > recovered_lsn) {
+ end_lsn = ut_uint64_align_up(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ log_group_read_log_seg(LOG_RECOVER, log_sys->buf,
+ up_to_date_group, start_lsn, end_lsn);
+
+ len = (ulint) (end_lsn - start_lsn);
+
+ log_group_write_buf(group, log_sys->buf, len, start_lsn, 0);
+
+ if (end_lsn >= recovered_lsn) {
+
+ return;
+ }
+
+ start_lsn = end_lsn;
+ }
+}
+
+/********************************************************//**
+Copies a log segment from the most up-to-date log group to the other log
+groups, so that they all contain the latest log data. Also writes the info
+about the latest checkpoint to the groups, and inits the fields in the group
+memory structs to up-to-date values. */
+static
+void
+recv_synchronize_groups(
+/*====================*/
+ log_group_t* up_to_date_group) /*!< in: the most up-to-date
+ log group */
+{
+ log_group_t* group;
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ ib_uint64_t recovered_lsn;
+ ib_uint64_t limit_lsn;
+
+ recovered_lsn = recv_sys->recovered_lsn;
+ limit_lsn = recv_sys->limit_lsn;
+
+ /* Read the last recovered log block to the recovery system buffer:
+ the block is always incomplete */
+
+ start_lsn = ut_uint64_align_down(recovered_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ end_lsn = ut_uint64_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE);
+
+ ut_a(start_lsn != end_lsn);
+
+ log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block,
+ up_to_date_group, start_lsn, end_lsn);
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ if (group != up_to_date_group) {
+
+ /* Copy log data if needed */
+
+			recv_copy_group(up_to_date_group, group,
+ recovered_lsn);
+ }
+
+ /* Update the fields in the group struct to correspond to
+ recovered_lsn */
+
+ log_group_set_fields(group, recovered_lsn);
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ /* Copy the checkpoint info to the groups; remember that we have
+ incremented checkpoint_no by one, and the info will not be written
+ over the max checkpoint info, thus making the preservation of max
+ checkpoint info on disk certain */
+
+ log_groups_write_checkpoint_info();
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Wait for the checkpoint write to complete */
+ rw_lock_s_lock(&(log_sys->checkpoint_lock));
+ rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+
+ mutex_enter(&(log_sys->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Checks the consistency of the checkpoint info
+@return TRUE if ok */
+static
+ibool
+recv_check_cp_is_consistent(
+/*========================*/
+ const byte* buf) /*!< in: buffer containing checkpoint info */
+{
+ ulint fold;
+
+ fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+
+ if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(
+ buf + LOG_CHECKPOINT_CHECKSUM_1)) {
+ return(FALSE);
+ }
+
+ fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+ LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+
+ if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(
+ buf + LOG_CHECKPOINT_CHECKSUM_2)) {
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Looks for the maximum consistent checkpoint from the log groups.
+@return error code or DB_SUCCESS */
+static
+ulint
+recv_find_max_checkpoint(
+/*=====================*/
+ log_group_t** max_group, /*!< out: max group */
+ ulint* max_field) /*!< out: LOG_CHECKPOINT_1 or
+ LOG_CHECKPOINT_2 */
+{
+ log_group_t* group;
+ ib_uint64_t max_no;
+ ib_uint64_t checkpoint_no;
+ ulint field;
+ byte* buf;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ max_no = 0;
+ *max_group = NULL;
+ *max_field = 0;
+
+ buf = log_sys->checkpoint_buf;
+
+ while (group) {
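+		/* Assume the group is corrupted until a consistent
+		checkpoint is found in it */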
+ group->state = LOG_GROUP_CORRUPTED;
+
+ for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
+ field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
+
+ log_group_read_checkpoint_info(group, field);
+
+ if (!recv_check_cp_is_consistent(buf)) {
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Checkpoint in group"
+ " %lu at %lu invalid, %lu\n",
+ (ulong) group->id,
+ (ulong) field,
+ (ulong) mach_read_from_4(
+ buf
+ + LOG_CHECKPOINT_CHECKSUM_1));
+
+ }
+#endif /* UNIV_DEBUG */
+ goto not_consistent;
+ }
+
+ group->state = LOG_GROUP_OK;
+
+ group->lsn = mach_read_ull(
+ buf + LOG_CHECKPOINT_LSN);
+
+#ifdef UNIV_LOG_ARCHIVE
+#error "UNIV_LOG_ARCHIVE could not be enabled"
+#endif
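+			/* Note: this block appears to be an XtraDB change
+			that reuses the (otherwise unused) archived-lsn
+			checkpoint field as a 64-bit checkpoint offset on
+			64-bit builds; on 32-bit builds, or when the field
+			is unset, the 4-byte offset field is used */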
+ {
+ ib_uint64_t tmp_lsn_offset = mach_read_ull(
+ buf + LOG_CHECKPOINT_ARCHIVED_LSN);
+ if (sizeof(ulint) != 4
+ && tmp_lsn_offset != IB_ULONGLONG_MAX) {
+ group->lsn_offset = (ulint) tmp_lsn_offset;
+ } else {
+ group->lsn_offset = mach_read_from_4(
+ buf + LOG_CHECKPOINT_OFFSET);
+ }
+ }
+
+ checkpoint_no = mach_read_ull(
+ buf + LOG_CHECKPOINT_NO);
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Checkpoint number %lu"
+ " found in group %lu\n",
+ (ulong) checkpoint_no,
+ (ulong) group->id);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (checkpoint_no >= max_no) {
+ *max_group = group;
+ *max_field = field;
+ max_no = checkpoint_no;
+ }
+
+not_consistent:
+ ;
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ if (*max_group == NULL) {
+
+ fprintf(stderr,
+ "InnoDB: No valid checkpoint found.\n"
+ "InnoDB: If this error appears when you are"
+ " creating an InnoDB database,\n"
+ "InnoDB: the problem may be that during"
+ " an earlier attempt you managed\n"
+ "InnoDB: to create the InnoDB data files,"
+ " but log file creation failed.\n"
+ "InnoDB: If that is the case, please refer to\n"
+ "InnoDB: " REFMAN "error-creating-innodb.html\n");
+ return(DB_ERROR);
+ }
+
+ return(DB_SUCCESS);
+}
+#else /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+recv_read_cp_info_for_backup(
+/*=========================*/
+ const byte* hdr, /*!< in: buffer containing the log group
+ header */
+ ib_uint64_t* lsn, /*!< out: checkpoint lsn */
+ ulint* offset, /*!< out: checkpoint offset in the log group */
+ ulint* fsp_limit,/*!< out: fsp limit of space 0,
+ 1000000000 if the database is running
+ with < version 3.23.50 of InnoDB */
+ ib_uint64_t* cp_no, /*!< out: checkpoint number */
+ ib_uint64_t* first_header_lsn)
+					/*!< out: lsn of the start of the
+ first log file */
+{
+ ulint max_cp = 0;
+ ib_uint64_t max_cp_no = 0;
+ const byte* cp_buf;
+
+ cp_buf = hdr + LOG_CHECKPOINT_1;
+
+ if (recv_check_cp_is_consistent(cp_buf)) {
+ max_cp_no = mach_read_ull(cp_buf + LOG_CHECKPOINT_NO);
+ max_cp = LOG_CHECKPOINT_1;
+ }
+
+ cp_buf = hdr + LOG_CHECKPOINT_2;
+
+ if (recv_check_cp_is_consistent(cp_buf)) {
+ if (mach_read_ull(cp_buf + LOG_CHECKPOINT_NO) > max_cp_no) {
+ max_cp = LOG_CHECKPOINT_2;
+ }
+ }
+
+ if (max_cp == 0) {
+ return(FALSE);
+ }
+
+ cp_buf = hdr + max_cp;
+
+ *lsn = mach_read_ull(cp_buf + LOG_CHECKPOINT_LSN);
+ *offset = mach_read_from_4(cp_buf + LOG_CHECKPOINT_OFFSET);
+
+ /* If the user is running a pre-3.23.50 version of InnoDB, its
+ checkpoint data does not contain the fsp limit info */
+ if (mach_read_from_4(cp_buf + LOG_CHECKPOINT_FSP_MAGIC_N)
+ == LOG_CHECKPOINT_FSP_MAGIC_N_VAL) {
+
+ *fsp_limit = mach_read_from_4(
+ cp_buf + LOG_CHECKPOINT_FSP_FREE_LIMIT);
+
+ if (*fsp_limit == 0) {
+ *fsp_limit = 1000000000;
+ }
+ } else {
+ *fsp_limit = 1000000000;
+ }
+
+ /* fprintf(stderr, "fsp limit %lu MB\n", *fsp_limit); */
+
+ *cp_no = mach_read_ull(cp_buf + LOG_CHECKPOINT_NO);
+
+ *first_header_lsn = mach_read_ull(hdr + LOG_FILE_START_LSN);
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+Checks the 4-byte checksum against the trailer checksum field of a log
+block. We also accept a log block in the old format before
+InnoDB-3.23.52 where the checksum field contains the log block number.
+@return TRUE if ok, or if the log block may be in the format of InnoDB
+version predating 3.23.52 */
+static
+ibool
+log_block_checksum_is_ok_or_old_format(
+/*===================================*/
+ const byte* block) /*!< in: pointer to a log block */
+{
+#ifdef UNIV_LOG_DEBUG
+ return(TRUE);
+#endif /* UNIV_LOG_DEBUG */
+ if (log_block_calc_checksum(block) == log_block_get_checksum(block)) {
+
+ return(TRUE);
+ }
+
+ if (log_block_get_hdr_no(block) == log_block_get_checksum(block)) {
+
+ /* We assume the log block is in the format of
+ InnoDB version < 3.23.52 and the block is ok */
+#if 0
+ fprintf(stderr,
+ "InnoDB: Scanned old format < InnoDB-3.23.52"
+ " log block number %lu\n",
+ log_block_get_hdr_no(block));
+#endif
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Scans the log segment and sets n_bytes_scanned to the length of the valid
+log data scanned. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+ byte* buf, /*!< in: buffer containing log data */
+ ulint buf_len, /*!< in: data length in that buffer */
+ ib_uint64_t* scanned_lsn, /*!< in/out: lsn of buffer start,
+ we return scanned lsn */
+ ulint* scanned_checkpoint_no,
+ /*!< in/out: 4 lowest bytes of the
+ highest scanned checkpoint number so
+ far */
+ ulint* n_bytes_scanned)/*!< out: how much we were able to
+ scan, smaller than buf_len if log
+ data ended here */
+{
+ ulint data_len;
+ byte* log_block;
+ ulint no;
+
+ *n_bytes_scanned = 0;
+
+ for (log_block = buf; log_block < buf + buf_len;
+ log_block += OS_FILE_LOG_BLOCK_SIZE) {
+
+ no = log_block_get_hdr_no(log_block);
+
+#if 0
+ fprintf(stderr, "Log block header no %lu\n", no);
+#endif
+
+ if (no != log_block_convert_lsn_to_no(*scanned_lsn)
+ || !log_block_checksum_is_ok_or_old_format(log_block)) {
+#if 0
+ fprintf(stderr,
+ "Log block n:o %lu, scanned lsn n:o %lu\n",
+ no, log_block_convert_lsn_to_no(*scanned_lsn));
+#endif
+ /* Garbage or an incompletely written log block */
+
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+#if 0
+ fprintf(stderr,
+ "Next log block n:o %lu\n",
+ log_block_get_hdr_no(log_block));
+#endif
+ break;
+ }
+
+ if (*scanned_checkpoint_no > 0
+ && log_block_get_checkpoint_no(log_block)
+ < *scanned_checkpoint_no
+ && *scanned_checkpoint_no
+ - log_block_get_checkpoint_no(log_block)
+ > 0x80000000UL) {
+
+ /* Garbage from a log buffer flush which was made
+ before the most recent database recovery */
+#if 0
+ fprintf(stderr,
+ "Scanned cp n:o %lu, block cp n:o %lu\n",
+ *scanned_checkpoint_no,
+ log_block_get_checkpoint_no(log_block));
+#endif
+ break;
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ *scanned_checkpoint_no
+ = log_block_get_checkpoint_no(log_block);
+ *scanned_lsn += data_len;
+
+ *n_bytes_scanned += data_len;
+
+ if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+ /* Log data ends here */
+
+#if 0
+ fprintf(stderr, "Log block data len %lu\n",
+ data_len);
+#endif
+ break;
+ }
+ }
+}
+#endif /* UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Tries to parse a single log record body and also applies it to a page if
+specified. File ops are parsed, but not applied in this function.
+@return log record end, NULL if not a complete record */
+static
+byte*
+recv_parse_or_apply_log_rec_body(
+/*=============================*/
+ byte type, /*!< in: type */
+ byte* ptr, /*!< in: pointer to a buffer */
+ byte* end_ptr,/*!< in: pointer to the buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL; if
+ not NULL, then the log record is
+ applied to the page, and the log
+ record should be complete then */
+ mtr_t* mtr) /*!< in: mtr or NULL; should be non-NULL
+ if and only if block is non-NULL */
+{
+ dict_index_t* index = NULL;
+ page_t* page;
+ page_zip_des_t* page_zip;
+#ifdef UNIV_DEBUG
+ ulint page_type;
+#endif /* UNIV_DEBUG */
+
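+	/* Either both block and mtr are given (the record is applied to
+	the page) or neither (parse-only mode) */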
+ ut_ad(!block == !mtr);
+
+ if (block) {
+ page = block->frame;
+ page_zip = buf_block_get_page_zip(block);
+ ut_d(page_type = fil_page_get_type(page));
+ } else {
+ page = NULL;
+ page_zip = NULL;
+ ut_d(page_type = FIL_PAGE_TYPE_ALLOCATED);
+ }
+
+ switch (type) {
+#ifdef UNIV_LOG_LSN_DEBUG
+ case MLOG_LSN:
+ /* The LSN is checked in recv_parse_log_rec(). */
+ break;
+#endif /* UNIV_LOG_LSN_DEBUG */
+ case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES:
+#ifdef UNIV_DEBUG
+ if (page && page_type == FIL_PAGE_TYPE_ALLOCATED
+ && end_ptr >= ptr + 2) {
+ /* It is OK to set FIL_PAGE_TYPE and certain
+ list node fields on an empty page. Any other
+ write is not OK. */
+
+ /* NOTE: There may be bogus assertion failures for
+ dict_hdr_create(), trx_rseg_header_create(),
+ trx_sys_create_doublewrite_buf(), and
+ trx_sysf_create().
+ These are only called during database creation. */
+ ulint offs = mach_read_from_2(ptr);
+
+ switch (type) {
+ default:
+ ut_error;
+ case MLOG_2BYTES:
+ /* Note that this can fail when the
+				redo log has been written with something
+ older than InnoDB Plugin 1.0.4. */
+ ut_ad(offs == FIL_PAGE_TYPE
+ || offs == IBUF_TREE_SEG_HEADER
+ + IBUF_HEADER + FSEG_HDR_OFFSET
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ + FIL_ADDR_SIZE
+ || offs == PAGE_BTR_SEG_LEAF
+ + PAGE_HEADER + FSEG_HDR_OFFSET
+ || offs == PAGE_BTR_SEG_TOP
+ + PAGE_HEADER + FSEG_HDR_OFFSET
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ + 0 /*FLST_PREV*/
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_BYTE
+ + FIL_ADDR_SIZE /*FLST_NEXT*/);
+ break;
+ case MLOG_4BYTES:
+ /* Note that this can fail when the
+				redo log has been written with something
+ older than InnoDB Plugin 1.0.4. */
+ ut_ad(0
+ || offs == IBUF_TREE_SEG_HEADER
+ + IBUF_HEADER + FSEG_HDR_SPACE
+ || offs == IBUF_TREE_SEG_HEADER
+ + IBUF_HEADER + FSEG_HDR_PAGE_NO
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER/* flst_init */
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ || offs == PAGE_BTR_IBUF_FREE_LIST
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ + FIL_ADDR_SIZE
+ || offs == PAGE_BTR_SEG_LEAF
+ + PAGE_HEADER + FSEG_HDR_PAGE_NO
+ || offs == PAGE_BTR_SEG_LEAF
+ + PAGE_HEADER + FSEG_HDR_SPACE
+ || offs == PAGE_BTR_SEG_TOP
+ + PAGE_HEADER + FSEG_HDR_PAGE_NO
+ || offs == PAGE_BTR_SEG_TOP
+ + PAGE_HEADER + FSEG_HDR_SPACE
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ + 0 /*FLST_PREV*/
+ || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+ + PAGE_HEADER + FIL_ADDR_PAGE
+ + FIL_ADDR_SIZE /*FLST_NEXT*/);
+ break;
+ }
+ }
+#endif /* UNIV_DEBUG */
+ ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip);
+ break;
+ case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_INSERT,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr,
+ block, index, mtr);
+ }
+ break;
+ case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_CLUST_DELETE_MARK,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = btr_cur_parse_del_mark_set_clust_rec(
+ ptr, end_ptr, page, page_zip, index);
+ }
+ break;
+ case MLOG_COMP_REC_SEC_DELETE_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ /* This log record type is obsolete, but we process it for
+ backward compatibility with MySQL 5.0.3 and 5.0.4. */
+ ut_a(!page || page_is_comp(page));
+ ut_a(!page_zip);
+ ptr = mlog_parse_index(ptr, end_ptr, TRUE, &index);
+ if (!ptr) {
+ break;
+ }
+ /* Fall through */
+ case MLOG_REC_SEC_DELETE_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_UPDATE_IN_PLACE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page,
+ page_zip, index);
+ }
+ break;
+ case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE:
+ case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_START_DELETE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_parse_delete_rec_list(type, ptr, end_ptr,
+ block, index, mtr);
+ }
+ break;
+ case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_LIST_END_COPY_CREATED,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_parse_copy_rec_list_to_created_page(
+ ptr, end_ptr, block, index, mtr);
+ }
+ break;
+ case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_PAGE_REORGANIZE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = btr_parse_page_reorganize(ptr, end_ptr, index,
+ block, mtr);
+ }
+ break;
+ case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE:
+ /* Allow anything in page_type when creating a page. */
+ ut_a(!page_zip);
+ ptr = page_parse_create(ptr, end_ptr,
+ type == MLOG_COMP_PAGE_CREATE,
+ block, mtr);
+ break;
+ case MLOG_UNDO_INSERT:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page);
+ break;
+ case MLOG_UNDO_ERASE_END:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr);
+ break;
+ case MLOG_UNDO_INIT:
+ /* Allow anything in page_type when creating a page. */
+ ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr);
+ break;
+ case MLOG_UNDO_HDR_DISCARD:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr);
+ break;
+ case MLOG_UNDO_HDR_CREATE:
+ case MLOG_UNDO_HDR_REUSE:
+ ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+ ptr = trx_undo_parse_page_header(type, ptr, end_ptr,
+ page, mtr);
+ break;
+ case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ /* On a compressed page, MLOG_COMP_REC_MIN_MARK
+ will be followed by MLOG_COMP_REC_DELETE
+ or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL)
+ in the same mini-transaction. */
+ ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip);
+ ptr = btr_parse_set_min_rec_mark(
+ ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK,
+ page, mtr);
+ break;
+ case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+ if (NULL != (ptr = mlog_parse_index(
+ ptr, end_ptr,
+ type == MLOG_COMP_REC_DELETE,
+ &index))) {
+ ut_a(!page
+ || (ibool)!!page_is_comp(page)
+ == dict_table_is_comp(index->table));
+ ptr = page_cur_parse_delete_rec(ptr, end_ptr,
+ block, index, mtr);
+ }
+ break;
+ case MLOG_IBUF_BITMAP_INIT:
+ /* Allow anything in page_type when creating a page. */
+ ptr = ibuf_parse_bitmap_init(ptr, end_ptr, block, mtr);
+ break;
+ case MLOG_INIT_FILE_PAGE:
+ /* Allow anything in page_type when creating a page. */
+ ptr = fsp_parse_init_file_page(ptr, end_ptr, block);
+ break;
+ case MLOG_WRITE_STRING:
+ ut_ad(!page || page_type != FIL_PAGE_TYPE_ALLOCATED);
+ ptr = mlog_parse_string(ptr, end_ptr, page, page_zip);
+ break;
+ case MLOG_FILE_CREATE:
+ case MLOG_FILE_RENAME:
+ case MLOG_FILE_DELETE:
+ case MLOG_FILE_CREATE2:
+ ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, 0, 0);
+ break;
+ case MLOG_ZIP_WRITE_NODE_PTR:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = page_zip_parse_write_node_ptr(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_WRITE_BLOB_PTR:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_WRITE_HEADER:
+ ut_ad(!page || page_type == FIL_PAGE_INDEX);
+ ptr = page_zip_parse_write_header(ptr, end_ptr,
+ page, page_zip);
+ break;
+ case MLOG_ZIP_PAGE_COMPRESS:
+ /* Allow anything in page_type when creating a page. */
+ ptr = page_zip_parse_compress(ptr, end_ptr,
+ page, page_zip);
+ break;
+ default:
+ ptr = NULL;
+ recv_sys->found_corrupt_log = TRUE;
+ }
+
+ if (index) {
+ dict_table_t* table = index->table;
+
+ dict_mem_index_free(index);
+ dict_mem_table_free(table);
+ }
+
+ return(ptr);
+}
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a log record in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+recv_fold(
+/*======*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(ut_fold_ulint_pair(space, page_no));
+}
+
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a log record in the hash table.
+@return folded value */
+UNIV_INLINE
+ulint
+recv_hash(
+/*======*/
+ ulint space, /*!< in: space */
+ ulint page_no)/*!< in: page number */
+{
+ return(hash_calc_hash(recv_fold(space, page_no), recv_sys->addr_hash));
+}
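+
+/* A minimal standalone sketch of the idea above: fold a (space, page_no)
+pair into a single value and reduce it to a bucket index. The mixing
+constants and the table size are illustrative only; they are not the ones
+used by ut_fold_ulint_pair() or hash_calc_hash(). */
+#include <stdint.h>
+#include <stdio.h>
+
+#define N_BUCKETS	1024	/* illustrative hash table size */
+
+/* Combines two values into one fold value (illustrative mixer). */
+static uintptr_t
+sketch_fold_pair(uintptr_t space, uintptr_t page_no)
+{
+	uintptr_t	fold = space * 0x9E3779B1u; /* arbitrary odd constant */
+
+	fold ^= page_no + (fold << 6) + (fold >> 2);
+
+	return(fold);
+}
+
+/* Reduces the fold value to a bucket index. */
+static uintptr_t
+sketch_hash_pair(uintptr_t space, uintptr_t page_no)
+{
+	return(sketch_fold_pair(space, page_no) % N_BUCKETS);
+}
+
+int
+main(void)
+{
+	/* Records for the same page always land in the same bucket. */
+	printf("bucket = %lu\n", (unsigned long) sketch_hash_pair(0, 5));
+
+	return(0);
+}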
+
+/*********************************************************************//**
+Gets the hashed file address struct for a page.
+@return file address struct, NULL if not found from the hash table */
+static
+recv_addr_t*
+recv_get_fil_addr_struct(
+/*=====================*/
+ ulint space, /*!< in: space id */
+ ulint page_no)/*!< in: page number */
+{
+ recv_addr_t* recv_addr;
+
+ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+ recv_hash(space, page_no));
+ while (recv_addr) {
+ if ((recv_addr->space == space)
+ && (recv_addr->page_no == page_no)) {
+
+ break;
+ }
+
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ return(recv_addr);
+}
+
+/*******************************************************************//**
+Adds a new log record to the hash table of log records. */
+static
+void
+recv_add_to_hash_table(
+/*===================*/
+ byte type, /*!< in: log record type */
+ ulint space, /*!< in: space id */
+ ulint page_no, /*!< in: page number */
+ byte* body, /*!< in: log record body */
+ byte* rec_end, /*!< in: log record end */
+ ib_uint64_t start_lsn, /*!< in: start lsn of the mtr */
+ ib_uint64_t end_lsn) /*!< in: end lsn of the mtr */
+{
+ recv_t* recv;
+ ulint len;
+ recv_data_t* recv_data;
+ recv_data_t** prev_field;
+ recv_addr_t* recv_addr;
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) {
+ /* The tablespace does not exist any more: do not store the
+ log record */
+
+ return;
+ }
+
+ len = rec_end - body;
+
+ if (srv_recovery_stats) {
+ recv_sys->stats_log_recs++;
+ recv_sys->stats_log_len_sum += len;
+ }
+
+ recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t));
+ recv->type = type;
+ recv->len = rec_end - body;
+ recv->start_lsn = start_lsn;
+ recv->end_lsn = end_lsn;
+
+ recv_addr = recv_get_fil_addr_struct(space, page_no);
+
+ if (recv_addr == NULL) {
+ recv_addr = mem_heap_alloc(recv_sys->heap,
+ sizeof(recv_addr_t));
+ recv_addr->space = space;
+ recv_addr->page_no = page_no;
+ recv_addr->state = RECV_NOT_PROCESSED;
+
+ UT_LIST_INIT(recv_addr->rec_list);
+
+ HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash,
+ recv_fold(space, page_no), recv_addr);
+ recv_sys->n_addrs++;
+#if 0
+ fprintf(stderr, "Inserting log rec for space %lu, page %lu\n",
+ space, page_no);
+#endif
+ }
+
+ UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv);
+
+ prev_field = &(recv->data);
+
+ /* Store the log record body in chunks of less than UNIV_PAGE_SIZE:
+ recv_sys->heap grows into the buffer pool, and bigger chunks could not
+ be allocated */
+
+ while (rec_end > body) {
+
+ len = rec_end - body;
+
+ if (len > RECV_DATA_BLOCK_SIZE) {
+ len = RECV_DATA_BLOCK_SIZE;
+ }
+
+ recv_data = mem_heap_alloc(recv_sys->heap,
+ sizeof(recv_data_t) + len);
+ *prev_field = recv_data;
+
+ memcpy(recv_data + 1, body, len);
+
+ prev_field = &(recv_data->next);
+
+ body += len;
+ }
+
+ *prev_field = NULL;
+}
+
+/*********************************************************************//**
+Copies the log record body from recv to buf. */
+static
+void
+recv_data_copy_to_buf(
+/*==================*/
+ byte* buf, /*!< in: buffer of length at least recv->len */
+ recv_t* recv) /*!< in: log record */
+{
+ recv_data_t* recv_data;
+ ulint part_len;
+ ulint len;
+
+ len = recv->len;
+ recv_data = recv->data;
+
+ while (len > 0) {
+ if (len > RECV_DATA_BLOCK_SIZE) {
+ part_len = RECV_DATA_BLOCK_SIZE;
+ } else {
+ part_len = len;
+ }
+
+ ut_memcpy(buf, ((byte*)recv_data) + sizeof(recv_data_t),
+ part_len);
+ buf += part_len;
+ len -= part_len;
+
+ recv_data = recv_data->next;
+ }
+}
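+
+/* A self-contained sketch of the chunked storage scheme used by
+recv_add_to_hash_table() and recv_data_copy_to_buf() above: the record
+body is split into fixed-size chunks chained through a "next" pointer,
+and later copied back into one contiguous buffer. The chunk size and the
+use of malloc() are illustrative; the real code allocates from
+recv_sys->heap and uses RECV_DATA_BLOCK_SIZE. */
+#include <stdlib.h>
+#include <string.h>
+
+#define SKETCH_CHUNK_SIZE	256	/* illustrative chunk size */
+
+typedef struct sketch_chunk_struct	sketch_chunk_t;
+struct sketch_chunk_struct {
+	sketch_chunk_t*	next;	/* next chunk, or NULL */
+	/* the payload bytes follow the struct, as in recv_data_t */
+};
+
+/* Splits body[0..len) into a chain of chunks; returns the list head.
+Allocation failures are not handled in this sketch. */
+static sketch_chunk_t*
+sketch_store(const unsigned char* body, size_t len)
+{
+	sketch_chunk_t*		head = NULL;
+	sketch_chunk_t**	prev = &head;
+
+	while (len > 0) {
+		size_t		part = len > SKETCH_CHUNK_SIZE
+			? SKETCH_CHUNK_SIZE : len;
+		sketch_chunk_t*	chunk = malloc(sizeof(sketch_chunk_t) + part);
+
+		memcpy(chunk + 1, body, part);
+		*prev = chunk;
+		prev = &chunk->next;
+
+		body += part;
+		len -= part;
+	}
+
+	*prev = NULL;
+
+	return(head);
+}
+
+/* Copies a chunk chain back into buf, which must hold len bytes. */
+static void
+sketch_copy_to_buf(unsigned char* buf, const sketch_chunk_t* chunk,
+		   size_t len)
+{
+	while (len > 0) {
+		size_t	part = len > SKETCH_CHUNK_SIZE
+			? SKETCH_CHUNK_SIZE : len;
+
+		memcpy(buf, (const unsigned char*) (chunk + 1), part);
+
+		buf += part;
+		len -= part;
+		chunk = chunk->next;
+	}
+}
+
+int
+main(void)
+{
+	unsigned char	src[1000];
+	unsigned char	dst[1000];
+	size_t		i;
+
+	for (i = 0; i < sizeof(src); i++) {
+		src[i] = (unsigned char) i;
+	}
+
+	sketch_copy_to_buf(dst, sketch_store(src, sizeof(src)), sizeof(src));
+
+	return(memcmp(src, dst, sizeof(src)) != 0);
+}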
+
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+ ibool just_read_in,
+ /*!< in: TRUE if the i/o handler calls
+ this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+ buf_block_t* block) /*!< in/out: buffer block */
+{
+ page_t* page;
+ page_zip_des_t* page_zip;
+ recv_addr_t* recv_addr;
+ recv_t* recv;
+ byte* buf;
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+ ib_uint64_t page_lsn;
+ ib_uint64_t page_lsn_orig;
+ ib_uint64_t page_newest_lsn;
+ ibool modification_to_page;
+#ifndef UNIV_HOTBACKUP
+ ibool success;
+#endif /* !UNIV_HOTBACKUP */
+ mtr_t mtr;
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_log_recs == FALSE) {
+
+ /* Log records should not be applied now */
+
+ mutex_exit(&(recv_sys->mutex));
+
+ return;
+ }
+
+ recv_addr = recv_get_fil_addr_struct(buf_block_get_space(block),
+ buf_block_get_page_no(block));
+
+ if ((recv_addr == NULL)
+ /* bugfix: http://bugs.mysql.com/bug.php?id=44140 */
+ || (recv_addr->state == RECV_BEING_READ && !just_read_in)
+ || (recv_addr->state == RECV_BEING_PROCESSED)
+ || (recv_addr->state == RECV_PROCESSED)) {
+
+ mutex_exit(&(recv_sys->mutex));
+
+ return;
+ }
+
+#if 0
+ fprintf(stderr, "Recovering space %lu, page %lu\n",
+ buf_block_get_space(block), buf_block_get_page_no(block));
+#endif
+
+ recv_addr->state = RECV_BEING_PROCESSED;
+
+ if (srv_recovery_stats) {
+ if (just_read_in) {
+ recv_sys->stats_recover_pages_with_read++;
+ } else {
+ recv_sys->stats_recover_pages_without_read++;
+ }
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+
+ mtr_start(&mtr);
+ mtr_set_log_mode(&mtr, MTR_LOG_NONE);
+
+ page = block->frame;
+ page_zip = buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+ if (just_read_in) {
+ /* Move the ownership of the x-latch on the page to
+ this OS thread, so that we can acquire a second
+ x-latch on it. This is needed for the operations to
+ the page to pass the debug checks. */
+
+ rw_lock_x_lock_move_ownership(&block->lock);
+ }
+
+ success = buf_page_get_known_nowait(RW_X_LATCH, block,
+ BUF_KEEP_OLD,
+ __FILE__, __LINE__,
+ &mtr);
+ ut_a(success);
+
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Read the newest modification lsn from the page */
+ page_lsn = mach_read_ull(page + FIL_PAGE_LSN);
+ page_lsn_orig = page_lsn;
+
+#ifndef UNIV_HOTBACKUP
+ /* It may be that the page has been modified in the buffer
+ pool: read the newest modification lsn there */
+
+ page_newest_lsn = buf_page_get_newest_modification(&block->page);
+
+ if (page_newest_lsn) {
+
+ page_lsn = page_newest_lsn;
+ }
+#else /* !UNIV_HOTBACKUP */
+ /* In recovery from a backup we do not really use the buffer pool */
+ page_newest_lsn = 0;
+#endif /* !UNIV_HOTBACKUP */
+
+ modification_to_page = FALSE;
+ start_lsn = end_lsn = 0;
+
+ if (srv_recovery_stats) {
+ mutex_enter(&(recv_sys->mutex));
+ if (page_lsn_orig && recv_sys->stats_oldest_modified_lsn > page_lsn_orig) {
+ recv_sys->stats_oldest_modified_lsn = page_lsn_orig;
+ }
+ if (page_lsn_orig && recv_sys->stats_newest_modified_lsn < page_lsn_orig) {
+ recv_sys->stats_newest_modified_lsn = page_lsn_orig;
+ }
+ if (UT_LIST_GET_LAST(recv_addr->rec_list)->start_lsn
+ < page_lsn_orig) {
+ recv_sys->stats_pages_already_new++;
+ }
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ recv = UT_LIST_GET_FIRST(recv_addr->rec_list);
+
+ while (recv) {
+ end_lsn = recv->end_lsn;
+
+ if (recv->len > RECV_DATA_BLOCK_SIZE) {
+ /* We have to copy the record body to a separate
+ buffer */
+
+ buf = mem_alloc(recv->len);
+
+ recv_data_copy_to_buf(buf, recv);
+ } else {
+ buf = ((byte*)(recv->data)) + sizeof(recv_data_t);
+ }
+
+ if (recv->type == MLOG_INIT_FILE_PAGE) {
+ page_lsn = page_newest_lsn;
+
+ memset(FIL_PAGE_LSN + page, 0, 8);
+ memset(UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM
+ + page, 0, 8);
+
+ if (page_zip) {
+ memset(FIL_PAGE_LSN + page_zip->data, 0, 8);
+ }
+ }
+
+ if (recv->start_lsn >= page_lsn) {
+
+ ib_uint64_t end_lsn;
+
+ if (!modification_to_page) {
+
+ modification_to_page = TRUE;
+ start_lsn = recv->start_lsn;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Applying log rec"
+ " type %lu len %lu"
+ " to space %lu page no %lu\n",
+ (ulong) recv->type, (ulong) recv->len,
+ (ulong) recv_addr->space,
+ (ulong) recv_addr->page_no);
+ }
+#endif /* UNIV_DEBUG */
+
+ recv_parse_or_apply_log_rec_body(recv->type, buf,
+ buf + recv->len,
+ block, &mtr);
+
+ if (srv_recovery_stats) {
+ mutex_enter(&(recv_sys->mutex));
+ recv_sys->stats_applied_log_recs++;
+ recv_sys->stats_applied_log_len_sum += recv->len;
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ end_lsn = recv->start_lsn + recv->len;
+ mach_write_ull(FIL_PAGE_LSN + page, end_lsn);
+ mach_write_ull(UNIV_PAGE_SIZE
+ - FIL_PAGE_END_LSN_OLD_CHKSUM
+ + page, end_lsn);
+
+ if (page_zip) {
+ mach_write_ull(FIL_PAGE_LSN
+ + page_zip->data, end_lsn);
+ }
+ }
+
+ if (recv->len > RECV_DATA_BLOCK_SIZE) {
+ mem_free(buf);
+ }
+
+ recv = UT_LIST_GET_NEXT(rec_list, recv);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ if (page_zip) {
+ ut_a(page_zip_validate_low(page_zip, page, FALSE));
+ }
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_max_page_lsn < page_lsn) {
+ recv_max_page_lsn = page_lsn;
+ }
+
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+
+ mutex_exit(&(recv_sys->mutex));
+
+#ifndef UNIV_HOTBACKUP
+ if (modification_to_page) {
+ ut_a(block);
+
+ buf_flush_recv_note_modification(block, start_lsn, end_lsn);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Make sure that committing mtr does not change the modification
+ lsn values of page */
+
+ mtr.modifications = FALSE;
+
+ mtr_commit(&mtr);
+}
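+
+/* A minimal sketch of the LSN filter applied by recv_recover_page_func()
+above: a stored record is replayed only if its start lsn is not older
+than the page lsn, and the page lsn field is then advanced to the end of
+that record. The types and the apply() callback are illustrative
+placeholders, not the real InnoDB structures. */
+#include <stdint.h>
+#include <stdio.h>
+
+typedef struct sketch_rec_struct	sketch_rec_t;
+struct sketch_rec_struct {
+	uint64_t	start_lsn;	/* start lsn of the mtr */
+	uint64_t	len;		/* record length */
+	sketch_rec_t*	next;		/* next record for this page */
+};
+
+/* Replays the records that are not older than the current page lsn and
+returns the lsn to stamp on the page; apply() stands in for
+recv_parse_or_apply_log_rec_body(). */
+static uint64_t
+sketch_recover_page(uint64_t page_lsn, const sketch_rec_t* rec,
+		    void (*apply)(const sketch_rec_t*))
+{
+	uint64_t	new_lsn = page_lsn;
+
+	while (rec != NULL) {
+		if (rec->start_lsn >= page_lsn) {
+			apply(rec);
+
+			/* The real code stamps FIL_PAGE_LSN with the lsn
+			reached by this record. */
+			new_lsn = rec->start_lsn + rec->len;
+		}
+
+		rec = rec->next;
+	}
+
+	return(new_lsn);
+}
+
+static void
+sketch_apply(const sketch_rec_t* rec)
+{
+	printf("applying record at lsn %llu\n",
+	       (unsigned long long) rec->start_lsn);
+}
+
+int
+main(void)
+{
+	sketch_rec_t	r2 = { 200, 40, NULL };
+	sketch_rec_t	r1 = { 100, 50, &r2 };
+
+	/* With a page lsn of 120, only the second record is replayed. */
+	printf("new page lsn %llu\n", (unsigned long long)
+	       sketch_recover_page(120, &r1, sketch_apply));
+
+	return(0);
+}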
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Reads in pages which have hashed log records, from an area around a given
+page number.
+@return number of pages found */
+static
+ulint
+recv_read_in_area(
+/*==============*/
+ ulint space, /*!< in: space */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ ulint page_no)/*!< in: page number */
+{
+ recv_addr_t* recv_addr;
+ ulint page_nos[RECV_READ_AHEAD_AREA];
+ ulint low_limit;
+ ulint n;
+
+ low_limit = page_no - (page_no % RECV_READ_AHEAD_AREA);
+
+ n = 0;
+
+ for (page_no = low_limit; page_no < low_limit + RECV_READ_AHEAD_AREA;
+ page_no++) {
+ recv_addr = recv_get_fil_addr_struct(space, page_no);
+
+ if (recv_addr && !buf_page_peek(space, page_no)) {
+
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_addr->state == RECV_NOT_PROCESSED) {
+ recv_addr->state = RECV_BEING_READ;
+
+ page_nos[n] = page_no;
+
+ n++;
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+ }
+ }
+
+ if (srv_recovery_stats && n) {
+ mutex_enter(&(recv_sys->mutex));
+ recv_sys->stats_read_requested_pages += n;
+ recv_sys->stats_read_in_area[n - 1]++;
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ buf_read_recv_pages(FALSE, space, zip_size, page_nos, n);
+ /*
+ fprintf(stderr, "Recv pages at %lu n %lu\n", page_nos[0], n);
+ */
+ return(n);
+}
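+
+/* A small sketch of the read-ahead window computed by recv_read_in_area()
+above: the requested page number is rounded down to an area boundary and
+every page in that aligned window becomes a read-ahead candidate. The
+area size is illustrative, not necessarily RECV_READ_AHEAD_AREA. */
+#include <stdio.h>
+
+#define SKETCH_AREA	32	/* illustrative read-ahead area, in pages */
+
+int
+main(void)
+{
+	unsigned long	page_no = 1000037;
+	unsigned long	low = page_no - (page_no % SKETCH_AREA);
+	unsigned long	p;
+
+	/* Pages low .. low + SKETCH_AREA - 1 form the read-ahead area. */
+	for (p = low; p < low + SKETCH_AREA; p++) {
+		printf("candidate page %lu\n", p);
+	}
+
+	return(0);
+}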
+
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+ ibool allow_ibuf) /*!< in: if TRUE, also ibuf operations are
+ allowed during the application; if FALSE,
+ no ibuf operations are allowed, and after
+ the application all file pages are flushed to
+ disk and invalidated in buffer pool: this
+ alternative means that no new log records
+ can be generated during the application;
+ the caller must in this case own the log
+ mutex */
+{
+ recv_addr_t* recv_addr;
+ ulint i;
+ ulint n_pages;
+ ibool has_printed = FALSE;
+ mtr_t mtr;
+loop:
+ mutex_enter(&(recv_sys->mutex));
+
+ if (recv_sys->apply_batch_on) {
+
+ mutex_exit(&(recv_sys->mutex));
+
+ os_thread_sleep(500000);
+
+ goto loop;
+ }
+
+ ut_ad(!allow_ibuf == mutex_own(&log_sys->mutex));
+
+ if (!allow_ibuf) {
+ recv_no_ibuf_operations = TRUE;
+ }
+
+ recv_sys->apply_log_recs = TRUE;
+ recv_sys->apply_batch_on = TRUE;
+
+ for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) {
+
+ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, i);
+
+ while (recv_addr) {
+ ulint space = recv_addr->space;
+ ulint zip_size = fil_space_get_zip_size(space);
+ ulint page_no = recv_addr->page_no;
+
+ if (recv_addr->state == RECV_NOT_PROCESSED) {
+ if (!has_printed) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Starting an"
+ " apply batch of log records"
+ " to the database...\n"
+ "InnoDB: Progress in percents: ",
+ stderr);
+ has_printed = TRUE;
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+
+ if (buf_page_peek(space, page_no)) {
+ buf_block_t* block;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ space, zip_size, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(
+ block, SYNC_NO_ORDER_CHECK);
+
+ recv_recover_page(FALSE, block);
+ mtr_commit(&mtr);
+ } else {
+ recv_read_in_area(space, zip_size,
+ page_no);
+ }
+
+ mutex_enter(&(recv_sys->mutex));
+ }
+
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ if (has_printed
+ && (i * 100) / hash_get_n_cells(recv_sys->addr_hash)
+ != ((i + 1) * 100)
+ / hash_get_n_cells(recv_sys->addr_hash)) {
+
+ fprintf(stderr, "%lu ", (ulong)
+ ((i * 100)
+ / hash_get_n_cells(recv_sys->addr_hash)));
+ }
+ }
+
+ /* Wait until all the pages have been processed */
+
+ while (recv_sys->n_addrs != 0) {
+
+ mutex_exit(&(recv_sys->mutex));
+
+ os_thread_sleep(500000);
+
+ mutex_enter(&(recv_sys->mutex));
+ }
+
+ if (has_printed) {
+
+ fprintf(stderr, "\n");
+ }
+
+ if (!allow_ibuf) {
+ /* Flush all the file pages to disk and invalidate them in
+ the buffer pool */
+
+ ut_d(recv_no_log_write = TRUE);
+ mutex_exit(&(recv_sys->mutex));
+ mutex_exit(&(log_sys->mutex));
+
+ n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX,
+ IB_ULONGLONG_MAX);
+ ut_a(n_pages != ULINT_UNDEFINED);
+
+ buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+
+ buf_pool_invalidate();
+
+ mutex_enter(&(log_sys->mutex));
+ mutex_enter(&(recv_sys->mutex));
+ ut_d(recv_no_log_write = FALSE);
+
+ recv_no_ibuf_operations = FALSE;
+ }
+
+ recv_sys->apply_log_recs = FALSE;
+ recv_sys->apply_batch_on = FALSE;
+
+ recv_sys_empty_hash();
+
+ if (has_printed) {
+ fprintf(stderr, "InnoDB: Apply batch completed\n");
+
+ if (srv_recovery_stats) {
+ recv_sys->stats_recv_turns++;
+ }
+ }
+
+ mutex_exit(&(recv_sys->mutex));
+}
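+
+/* A tiny sketch of the progress printing used in
+recv_apply_hashed_log_recs() above: a percentage is printed only when the
+integer percentage changes between cell i and cell i + 1, so each value
+appears at most once no matter how many cells the hash table has. */
+#include <stdio.h>
+
+int
+main(void)
+{
+	unsigned long	n_cells = 7;	/* illustrative cell count */
+	unsigned long	i;
+
+	for (i = 0; i < n_cells; i++) {
+		if ((i * 100) / n_cells != ((i + 1) * 100) / n_cells) {
+			printf("%lu ", (i * 100) / n_cells);
+		}
+	}
+
+	putchar('\n');
+
+	return(0);
+}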
+#else /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Applies log records in the hash table to a backup. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void)
+/*================================*/
+{
+ recv_addr_t* recv_addr;
+ ulint n_hash_cells;
+ buf_block_t* block;
+ ulint actual_size;
+ ibool success;
+ ulint error;
+ ulint i;
+
+ recv_sys->apply_log_recs = TRUE;
+ recv_sys->apply_batch_on = TRUE;
+
+ block = back_block1;
+
+ fputs("InnoDB: Starting an apply batch of log records"
+ " to the database...\n"
+ "InnoDB: Progress in percents: ", stderr);
+
+ n_hash_cells = hash_get_n_cells(recv_sys->addr_hash);
+
+ for (i = 0; i < n_hash_cells; i++) {
+ /* The address hash table is externally chained */
+ recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node;
+
+ while (recv_addr != NULL) {
+
+ ulint zip_size
+ = fil_space_get_zip_size(recv_addr->space);
+
+ if (zip_size == ULINT_UNDEFINED) {
+#if 0
+ fprintf(stderr,
+ "InnoDB: Warning: cannot apply"
+ " log record to"
+ " tablespace %lu page %lu,\n"
+ "InnoDB: because tablespace with"
+ " that id does not exist.\n",
+ recv_addr->space, recv_addr->page_no);
+#endif
+ recv_addr->state = RECV_PROCESSED;
+
+ ut_a(recv_sys->n_addrs);
+ recv_sys->n_addrs--;
+
+ goto skip_this_recv_addr;
+ }
+
+ /* We simulate a page read made by the buffer pool, to
+ make sure the recovery apparatus works ok. We must init
+ the block. */
+
+ buf_page_init_for_backup_restore(
+ recv_addr->space, recv_addr->page_no,
+ zip_size, block);
+
+ /* Extend the tablespace's last file if the page_no
+ does not fall inside its bounds; we assume the last
+ file is auto-extending, and ibbackup copied the file
+ when it still was smaller */
+
+ success = fil_extend_space_to_desired_size(
+ &actual_size,
+ recv_addr->space, recv_addr->page_no + 1);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot extend"
+ " tablespace %lu to hold %lu pages\n",
+ recv_addr->space, recv_addr->page_no);
+
+ exit(1);
+ }
+
+ /* Read the page from the tablespace file using the
+ fil0fil.c routines */
+
+ if (zip_size) {
+ error = fil_io(OS_FILE_READ, TRUE,
+ recv_addr->space, zip_size,
+ recv_addr->page_no, 0, zip_size,
+ block->page.zip.data, NULL);
+ if (error == DB_SUCCESS
+ && !buf_zip_decompress(block, TRUE)) {
+ exit(1);
+ }
+ } else {
+ error = fil_io(OS_FILE_READ, TRUE,
+ recv_addr->space, 0,
+ recv_addr->page_no, 0,
+ UNIV_PAGE_SIZE,
+ block->frame, NULL);
+ }
+
+ if (error != DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot read"
+ " from tablespace"
+ " %lu page number %lu\n",
+ (ulong) recv_addr->space,
+ (ulong) recv_addr->page_no);
+
+ exit(1);
+ }
+
+ /* Apply the log records to this page */
+ recv_recover_page(FALSE, block);
+
+ /* Write the page back to the tablespace file using the
+ fil0fil.c routines */
+
+ buf_flush_init_for_writing(
+ block->frame, buf_block_get_page_zip(block),
+ mach_read_ull(block->frame + FIL_PAGE_LSN));
+
+ if (zip_size) {
+ error = fil_io(OS_FILE_WRITE, TRUE,
+ recv_addr->space, zip_size,
+ recv_addr->page_no, 0,
+ zip_size,
+ block->page.zip.data, NULL);
+ } else {
+ error = fil_io(OS_FILE_WRITE, TRUE,
+ recv_addr->space, 0,
+ recv_addr->page_no, 0,
+ UNIV_PAGE_SIZE,
+ block->frame, NULL);
+ }
+skip_this_recv_addr:
+ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+ }
+
+ if ((100 * i) / n_hash_cells
+ != (100 * (i + 1)) / n_hash_cells) {
+ fprintf(stderr, "%lu ",
+ (ulong) ((100 * i) / n_hash_cells));
+ fflush(stderr);
+ }
+ }
+
+ recv_sys_empty_hash();
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Tries to parse a single log record and returns its length.
+@return length of the record, or 0 if the record was not complete */
+static
+ulint
+recv_parse_log_rec(
+/*===============*/
+ byte* ptr, /*!< in: pointer to a buffer */
+ byte* end_ptr,/*!< in: pointer to the buffer end */
+ byte* type, /*!< out: type */
+ ulint* space, /*!< out: space id */
+ ulint* page_no,/*!< out: page number */
+ byte** body) /*!< out: log record body start */
+{
+ byte* new_ptr;
+
+ *body = NULL;
+
+ if (ptr == end_ptr) {
+
+ return(0);
+ }
+
+ if (*ptr == MLOG_MULTI_REC_END) {
+
+ *type = *ptr;
+
+ return(1);
+ }
+
+ if (*ptr == MLOG_DUMMY_RECORD) {
+ *type = *ptr;
+
+ *space = ULINT_UNDEFINED - 1; /* For debugging */
+
+ return(1);
+ }
+
+ new_ptr = mlog_parse_initial_log_record(ptr, end_ptr, type, space,
+ page_no);
+ *body = new_ptr;
+
+ if (UNIV_UNLIKELY(!new_ptr)) {
+
+ return(0);
+ }
+
+#ifdef UNIV_LOG_LSN_DEBUG
+ if (*type == MLOG_LSN) {
+ ib_uint64_t lsn = (ib_uint64_t) *space << 32 | *page_no;
+# ifdef UNIV_LOG_DEBUG
+ ut_a(lsn == log_sys->old_lsn);
+# else /* UNIV_LOG_DEBUG */
+ ut_a(lsn == recv_sys->recovered_lsn);
+# endif /* UNIV_LOG_DEBUG */
+ }
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+ new_ptr = recv_parse_or_apply_log_rec_body(*type, new_ptr, end_ptr,
+ NULL, NULL);
+ if (UNIV_UNLIKELY(new_ptr == NULL)) {
+
+ return(0);
+ }
+
+ if (*page_no > recv_max_parsed_page_no) {
+ recv_max_parsed_page_no = *page_no;
+ }
+
+ return(new_ptr - ptr);
+}
+
+/*******************************************************//**
+Calculates the new value for lsn when more data is added to the log. */
+static
+ib_uint64_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+ ib_uint64_t lsn, /*!< in: old lsn */
+ ib_uint64_t len) /*!< in: this many bytes of data is
+ added, log block headers not included */
+{
+ ulint frag_len;
+ ulint lsn_len;
+
+ frag_len = (((ulint) lsn) % OS_FILE_LOG_BLOCK_SIZE)
+ - LOG_BLOCK_HDR_SIZE;
+ ut_ad(frag_len < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
+ - LOG_BLOCK_TRL_SIZE);
+ lsn_len = (ulint) len;
+ lsn_len += (lsn_len + frag_len)
+ / (OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE
+ - LOG_BLOCK_TRL_SIZE)
+ * (LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE);
+
+ return(lsn + lsn_len);
+}
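+
+/* A standalone sketch of the arithmetic in recv_calc_lsn_on_data_add()
+above, assuming illustrative sizes (512-byte log blocks with a 12-byte
+header and a 4-byte trailer); the real code uses OS_FILE_LOG_BLOCK_SIZE,
+LOG_BLOCK_HDR_SIZE and LOG_BLOCK_TRL_SIZE. Every time the added data
+spills over a block's payload area, the lsn must additionally skip the
+trailer of that block and the header of the next one. */
+#include <stdint.h>
+#include <stdio.h>
+
+#define SK_BLOCK_SIZE	512	/* illustrative */
+#define SK_HDR_SIZE	12	/* illustrative */
+#define SK_TRL_SIZE	4	/* illustrative */
+#define SK_PAYLOAD	(SK_BLOCK_SIZE - SK_HDR_SIZE - SK_TRL_SIZE)
+
+static uint64_t
+sketch_lsn_on_data_add(uint64_t lsn, uint64_t len)
+{
+	/* Bytes of payload already used in the current block; the lsn is
+	assumed to point past the block header. */
+	uint64_t	frag_len = (lsn % SK_BLOCK_SIZE) - SK_HDR_SIZE;
+	uint64_t	lsn_len = len;
+
+	/* Each payload boundary crossed costs an extra header + trailer. */
+	lsn_len += (lsn_len + frag_len) / SK_PAYLOAD
+		* (SK_HDR_SIZE + SK_TRL_SIZE);
+
+	return(lsn + lsn_len);
+}
+
+int
+main(void)
+{
+	/* Start just after the header of the second block and add one full
+	payload of data: the lsn also jumps over that block's trailer and
+	the next block's header, landing at the start of the next payload. */
+	printf("%llu\n", (unsigned long long)
+	       sketch_lsn_on_data_add(SK_BLOCK_SIZE + SK_HDR_SIZE,
+				      SK_PAYLOAD));
+
+	return(0);
+}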
+
+#ifdef UNIV_LOG_DEBUG
+/*******************************************************//**
+Checks that the parser recognizes incomplete initial segments of a log
+record as incomplete. */
+static
+void
+recv_check_incomplete_log_recs(
+/*===========================*/
+ byte* ptr, /*!< in: pointer to a complete log record */
+ ulint len) /*!< in: length of the log record */
+{
+ ulint i;
+ byte type;
+ ulint space;
+ ulint page_no;
+ byte* body;
+
+ for (i = 0; i < len; i++) {
+ ut_a(0 == recv_parse_log_rec(ptr, ptr + i, &type, &space,
+ &page_no, &body));
+ }
+}
+#endif /* UNIV_LOG_DEBUG */
+
+/*******************************************************//**
+Prints diagnostic info of corrupt log. */
+static
+void
+recv_report_corrupt_log(
+/*====================*/
+ byte* ptr, /*!< in: pointer to corrupt log record */
+ byte type, /*!< in: type of the record */
+ ulint space, /*!< in: space id, this may also be garbage */
+ ulint page_no)/*!< in: page number, this may also be garbage */
+{
+ fprintf(stderr,
+ "InnoDB: ############### CORRUPT LOG RECORD FOUND\n"
+ "InnoDB: Log record type %lu, space id %lu, page number %lu\n"
+ "InnoDB: Log parsing proceeded successfully up to %llu\n"
+ "InnoDB: Previous log record type %lu, is multi %lu\n"
+ "InnoDB: Recv offset %lu, prev %lu\n",
+ (ulong) type, (ulong) space, (ulong) page_no,
+ recv_sys->recovered_lsn,
+ (ulong) recv_previous_parsed_rec_type,
+ (ulong) recv_previous_parsed_rec_is_multi,
+ (ulong) (ptr - recv_sys->buf),
+ (ulong) recv_previous_parsed_rec_offset);
+
+ if ((ulint)(ptr - recv_sys->buf + 100)
+ > recv_previous_parsed_rec_offset
+ && (ulint)(ptr - recv_sys->buf + 100
+ - recv_previous_parsed_rec_offset)
+ < 200000) {
+ fputs("InnoDB: Hex dump of corrupt log starting"
+ " 100 bytes before the start\n"
+ "InnoDB: of the previous log rec,\n"
+ "InnoDB: and ending 100 bytes after the start"
+ " of the corrupt rec:\n",
+ stderr);
+
+ ut_print_buf(stderr,
+ recv_sys->buf
+ + recv_previous_parsed_rec_offset - 100,
+ ptr - recv_sys->buf + 200
+ - recv_previous_parsed_rec_offset);
+ putc('\n', stderr);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set innodb_force_recovery"
+ " to ignore this error.\n", stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ fputs("InnoDB: WARNING: the log file may have been corrupt and it\n"
+ "InnoDB: is possible that the log scan did not proceed\n"
+ "InnoDB: far enough in recovery! Please run CHECK TABLE\n"
+ "InnoDB: on your InnoDB tables to check that they are ok!\n"
+ "InnoDB: If mysqld crashes after this recovery, look at\n"
+ "InnoDB: " REFMAN "forcing-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+
+ fflush(stderr);
+}
+
+/*******************************************************//**
+Parses log records from a buffer and stores them in a hash table to await
+merging to file pages.
+@return currently always returns FALSE */
+static
+ibool
+recv_parse_log_recs(
+/*================*/
+ ibool store_to_hash) /*!< in: TRUE if the records should be stored
+ to the hash table; this is set to FALSE if just
+ debug checking is needed */
+{
+ byte* ptr;
+ byte* end_ptr;
+ ulint single_rec;
+ ulint len;
+ ulint total_len;
+ ib_uint64_t new_recovered_lsn;
+ ib_uint64_t old_lsn;
+ byte type;
+ ulint space;
+ ulint page_no;
+ byte* body;
+ ulint n_recs;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+ ut_ad(recv_sys->parse_start_lsn != 0);
+loop:
+ ptr = recv_sys->buf + recv_sys->recovered_offset;
+
+ end_ptr = recv_sys->buf + recv_sys->len;
+
+ if (ptr == end_ptr) {
+
+ return(FALSE);
+ }
+
+ single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG;
+
+ if (single_rec || *ptr == MLOG_DUMMY_RECORD) {
+ /* The mtr only modified a single page, or this is a file op */
+
+ old_lsn = recv_sys->recovered_lsn;
+
+ /* Try to parse a log record, fetching its type, space id,
+ page no, and a pointer to the body of the log record */
+
+ len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+ &page_no, &body);
+
+ if (len == 0 || recv_sys->found_corrupt_log) {
+ if (recv_sys->found_corrupt_log) {
+
+ recv_report_corrupt_log(ptr,
+ type, space, page_no);
+ }
+
+ return(FALSE);
+ }
+
+ new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len);
+
+ if (new_recovered_lsn > recv_sys->scanned_lsn) {
+			/* The log record filled a log block, and we require
+			that the next log block has also been scanned in */
+
+ return(FALSE);
+ }
+
+ recv_previous_parsed_rec_type = (ulint)type;
+ recv_previous_parsed_rec_offset = recv_sys->recovered_offset;
+ recv_previous_parsed_rec_is_multi = 0;
+
+ recv_sys->recovered_offset += len;
+ recv_sys->recovered_lsn = new_recovered_lsn;
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Parsed a single log rec"
+ " type %lu len %lu space %lu page no %lu\n",
+ (ulong) type, (ulong) len, (ulong) space,
+ (ulong) page_no);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (type == MLOG_DUMMY_RECORD) {
+ /* Do nothing */
+
+ } else if (!store_to_hash) {
+			/* In debug checking, update a replica of the page
+			according to the log record, and check that it
+			becomes identical to the original page */
+#ifdef UNIV_LOG_DEBUG
+ recv_check_incomplete_log_recs(ptr, len);
+#endif/* UNIV_LOG_DEBUG */
+
+ } else if (type == MLOG_FILE_CREATE
+ || type == MLOG_FILE_CREATE2
+ || type == MLOG_FILE_RENAME
+ || type == MLOG_FILE_DELETE) {
+ ut_a(space);
+#ifdef UNIV_HOTBACKUP
+ if (recv_replay_file_ops) {
+
+ /* In ibbackup --apply-log, replay an .ibd file
+ operation, if possible; note that
+ fil_path_to_mysql_datadir is set in ibbackup to
+ point to the datadir we should use there */
+
+ if (NULL == fil_op_log_parse_or_replay(
+ body, end_ptr, type,
+ space, page_no)) {
+ fprintf(stderr,
+ "InnoDB: Error: file op"
+ " log record of type %lu"
+ " space %lu not complete in\n"
+ "InnoDB: the replay phase."
+ " Path %s\n",
+ (ulint)type, space,
+ (char*)(body + 2));
+
+ ut_error;
+ }
+ }
+#endif
+ /* In normal mysqld crash recovery we do not try to
+ replay file operations */
+#ifdef UNIV_LOG_LSN_DEBUG
+ } else if (type == MLOG_LSN) {
+ /* Do not add these records to the hash table.
+ The page number and space id fields are misused
+ for something else. */
+#endif /* UNIV_LOG_LSN_DEBUG */
+ } else {
+ recv_add_to_hash_table(type, space, page_no, body,
+ ptr + len, old_lsn,
+ recv_sys->recovered_lsn);
+ }
+ } else {
+ /* Check that all the records associated with the single mtr
+ are included within the buffer */
+
+ total_len = 0;
+ n_recs = 0;
+
+ for (;;) {
+ len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+ &page_no, &body);
+ if (len == 0 || recv_sys->found_corrupt_log) {
+
+ if (recv_sys->found_corrupt_log) {
+
+ recv_report_corrupt_log(
+ ptr, type, space, page_no);
+ }
+
+ return(FALSE);
+ }
+
+ recv_previous_parsed_rec_type = (ulint)type;
+ recv_previous_parsed_rec_offset
+ = recv_sys->recovered_offset + total_len;
+ recv_previous_parsed_rec_is_multi = 1;
+
+#ifdef UNIV_LOG_DEBUG
+ if ((!store_to_hash) && (type != MLOG_MULTI_REC_END)) {
+ recv_check_incomplete_log_recs(ptr, len);
+ }
+#endif /* UNIV_LOG_DEBUG */
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Parsed a multi log rec"
+ " type %lu len %lu"
+ " space %lu page no %lu\n",
+ (ulong) type, (ulong) len,
+ (ulong) space, (ulong) page_no);
+ }
+#endif /* UNIV_DEBUG */
+
+ total_len += len;
+ n_recs++;
+
+ ptr += len;
+
+ if (type == MLOG_MULTI_REC_END) {
+
+ /* Found the end mark for the records */
+
+ break;
+ }
+ }
+
+ new_recovered_lsn = recv_calc_lsn_on_data_add(
+ recv_sys->recovered_lsn, total_len);
+
+ if (new_recovered_lsn > recv_sys->scanned_lsn) {
+			/* The log record filled a log block, and we require
+			that the next log block has also been scanned in */
+
+ return(FALSE);
+ }
+
+ /* Add all the records to the hash table */
+
+ ptr = recv_sys->buf + recv_sys->recovered_offset;
+
+ for (;;) {
+ old_lsn = recv_sys->recovered_lsn;
+ len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+ &page_no, &body);
+ if (recv_sys->found_corrupt_log) {
+
+ recv_report_corrupt_log(ptr,
+ type, space, page_no);
+ }
+
+ ut_a(len != 0);
+ ut_a(0 == ((ulint)*ptr & MLOG_SINGLE_REC_FLAG));
+
+ recv_sys->recovered_offset += len;
+ recv_sys->recovered_lsn
+ = recv_calc_lsn_on_data_add(old_lsn, len);
+ if (type == MLOG_MULTI_REC_END) {
+
+ /* Found the end mark for the records */
+
+ break;
+ }
+
+ if (store_to_hash
+#ifdef UNIV_LOG_LSN_DEBUG
+ && type != MLOG_LSN
+#endif /* UNIV_LOG_LSN_DEBUG */
+ ) {
+ recv_add_to_hash_table(type, space, page_no,
+ body, ptr + len,
+ old_lsn,
+ new_recovered_lsn);
+ }
+
+ ptr += len;
+ }
+ }
+
+ goto loop;
+}
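+
+/* A simplified sketch of the control flow in recv_parse_log_recs() above,
+using a toy record format that is not the real redo log format: one type
+byte whose high bit plays the role of MLOG_SINGLE_REC_FLAG, one body
+length byte, then the body; a 0x00 byte plays the role of
+MLOG_MULTI_REC_END. A single-record mtr is stored immediately; a
+multi-record group is first checked to be complete in the buffer and only
+then stored, so a torn group at the end of the scanned log is never
+half-applied. */
+#include <stddef.h>
+
+#define TOY_SINGLE_FLAG	0x80
+#define TOY_END_MARK	0x00
+
+/* Returns the total record length, or 0 if the record is incomplete. */
+static size_t
+toy_parse_rec(const unsigned char* ptr, const unsigned char* end)
+{
+	if (ptr == end) {
+		return(0);
+	}
+
+	if (*ptr == TOY_END_MARK) {
+		return(1);
+	}
+
+	if (end - ptr < 2 || end - ptr < 2 + ptr[1]) {
+		return(0);	/* header or body not fully in the buffer */
+	}
+
+	return(2 + (size_t) ptr[1]);
+}
+
+/* Parses as many complete mtr:s as possible; store() stands in for
+recv_add_to_hash_table(). Returns the number of bytes consumed. */
+static size_t
+toy_parse_recs(const unsigned char* buf, size_t len,
+	       void (*store)(const unsigned char* rec, size_t rec_len))
+{
+	const unsigned char*	ptr = buf;
+	const unsigned char*	end = buf + len;
+
+	for (;;) {
+		size_t	rec_len = toy_parse_rec(ptr, end);
+
+		if (rec_len == 0) {
+			break;
+		}
+
+		if (*ptr & TOY_SINGLE_FLAG) {
+			/* The mtr modified a single page: store it now. */
+			store(ptr, rec_len);
+			ptr += rec_len;
+			continue;
+		}
+
+		{
+			/* Multi-record group: first make sure the whole
+			group, including its end mark, is in the buffer. */
+			const unsigned char*	p = ptr;
+			size_t			l;
+
+			do {
+				l = toy_parse_rec(p, end);
+
+				if (l == 0) {
+					/* Torn group: keep it unconsumed
+					and wait for more scanned data. */
+					return(ptr - buf);
+				}
+
+				p += l;
+			} while (*(p - l) != TOY_END_MARK);
+
+			/* Second pass: store every record of the group,
+			skipping the end mark itself. */
+			while (ptr != p) {
+				l = toy_parse_rec(ptr, end);
+
+				if (*ptr != TOY_END_MARK) {
+					store(ptr, l);
+				}
+
+				ptr += l;
+			}
+		}
+	}
+
+	return(ptr - buf);
+}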
+
+/*******************************************************//**
+Adds data from a new log block to the parsing buffer of recv_sys if
+recv_sys->parse_start_lsn is non-zero.
+@return TRUE if more data added */
+static
+ibool
+recv_sys_add_to_parsing_buf(
+/*========================*/
+ const byte* log_block, /*!< in: log block */
+ ib_uint64_t scanned_lsn) /*!< in: lsn of how far we were able
+ to find data in this log block */
+{
+ ulint more_len;
+ ulint data_len;
+ ulint start_offset;
+ ulint end_offset;
+
+ ut_ad(scanned_lsn >= recv_sys->scanned_lsn);
+
+ if (!recv_sys->parse_start_lsn) {
+		/* Cannot start parsing yet because no start point has
+		been found for it */
+
+ return(FALSE);
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ if (recv_sys->parse_start_lsn >= scanned_lsn) {
+
+ return(FALSE);
+
+ } else if (recv_sys->scanned_lsn >= scanned_lsn) {
+
+ return(FALSE);
+
+ } else if (recv_sys->parse_start_lsn > recv_sys->scanned_lsn) {
+ more_len = (ulint) (scanned_lsn - recv_sys->parse_start_lsn);
+ } else {
+ more_len = (ulint) (scanned_lsn - recv_sys->scanned_lsn);
+ }
+
+ if (more_len == 0) {
+
+ return(FALSE);
+ }
+
+ ut_ad(data_len >= more_len);
+
+ start_offset = data_len - more_len;
+
+ if (start_offset < LOG_BLOCK_HDR_SIZE) {
+ start_offset = LOG_BLOCK_HDR_SIZE;
+ }
+
+ end_offset = data_len;
+
+ if (end_offset > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+ end_offset = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+ }
+
+ ut_ad(start_offset <= end_offset);
+
+ if (start_offset < end_offset) {
+ ut_memcpy(recv_sys->buf + recv_sys->len,
+ log_block + start_offset, end_offset - start_offset);
+
+ recv_sys->len += end_offset - start_offset;
+
+ ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE);
+ }
+
+ return(TRUE);
+}
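+
+/* A compact sketch of the offset computation in
+recv_sys_add_to_parsing_buf() above: of a freshly scanned log block, only
+the bytes that are past the block header, before the block trailer and
+not already in the parsing buffer are appended. The block layout
+constants and the missing capacity check are illustrative
+simplifications. */
+#include <string.h>
+
+#define PB_BLOCK_SIZE	512	/* illustrative */
+#define PB_HDR_SIZE	12	/* illustrative */
+#define PB_TRL_SIZE	4	/* illustrative */
+
+/* Appends the new payload of log_block to buf + *buf_len; data_len is
+the block's data length field and more_len the number of newly scanned
+bytes at the block's tail. */
+static void
+sketch_add_to_parsing_buf(unsigned char* buf, size_t* buf_len,
+			  const unsigned char* log_block,
+			  size_t data_len, size_t more_len)
+{
+	size_t	start_offset = data_len - more_len;
+	size_t	end_offset = data_len;
+
+	if (start_offset < PB_HDR_SIZE) {
+		start_offset = PB_HDR_SIZE;
+	}
+
+	if (end_offset > PB_BLOCK_SIZE - PB_TRL_SIZE) {
+		end_offset = PB_BLOCK_SIZE - PB_TRL_SIZE;
+	}
+
+	if (start_offset < end_offset) {
+		memcpy(buf + *buf_len, log_block + start_offset,
+		       end_offset - start_offset);
+
+		*buf_len += end_offset - start_offset;
+	}
+}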
+
+/*******************************************************//**
+Moves the parsing buffer data left to the buffer start. */
+static
+void
+recv_sys_justify_left_parsing_buf(void)
+/*===================================*/
+{
+ ut_memmove(recv_sys->buf, recv_sys->buf + recv_sys->recovered_offset,
+ recv_sys->len - recv_sys->recovered_offset);
+
+ recv_sys->len -= recv_sys->recovered_offset;
+
+ recv_sys->recovered_offset = 0;
+}
+
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data is found. Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or not able to scan any
+more in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+ ulint available_memory,/*!< in: we let the hash table of recs
+ to grow to this size, at the maximum */
+ ibool store_to_hash, /*!< in: TRUE if the records should be
+ stored to the hash table; this is set
+ to FALSE if just debug checking is
+ needed */
+ const byte* buf, /*!< in: buffer containing a log
+ segment or garbage */
+ ulint len, /*!< in: buffer length */
+ ib_uint64_t start_lsn, /*!< in: buffer start lsn */
+ ib_uint64_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ ib_uint64_t* group_scanned_lsn)/*!< out: scanning succeeded up to
+ this lsn */
+{
+ const byte* log_block;
+ ulint no;
+ ib_uint64_t scanned_lsn;
+ ibool finished;
+ ulint data_len;
+ ibool more_data;
+
+ ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE);
+ ut_a(store_to_hash <= TRUE);
+
+ finished = FALSE;
+
+ log_block = buf;
+ scanned_lsn = start_lsn;
+ more_data = FALSE;
+
+ do {
+ no = log_block_get_hdr_no(log_block);
+ /*
+ fprintf(stderr, "Log block header no %lu\n", no);
+
+ fprintf(stderr, "Scanned lsn no %lu\n",
+ log_block_convert_lsn_to_no(scanned_lsn));
+ */
+ if (no != log_block_convert_lsn_to_no(scanned_lsn)
+ || !log_block_checksum_is_ok_or_old_format(log_block)) {
+
+ if (no == log_block_convert_lsn_to_no(scanned_lsn)
+ && !log_block_checksum_is_ok_or_old_format(
+ log_block)) {
+ fprintf(stderr,
+ "InnoDB: Log block no %lu at"
+ " lsn %llu has\n"
+ "InnoDB: ok header, but checksum field"
+ " contains %lu, should be %lu\n",
+ (ulong) no,
+ scanned_lsn,
+ (ulong) log_block_get_checksum(
+ log_block),
+ (ulong) log_block_calc_checksum(
+ log_block));
+ }
+
+ /* Garbage or an incompletely written log block */
+
+ finished = TRUE;
+
+ break;
+ }
+
+ if (log_block_get_flush_bit(log_block)) {
+ /* This block was a start of a log flush operation:
+ we know that the previous flush operation must have
+ been completed for all log groups before this block
+ can have been flushed to any of the groups. Therefore,
+ we know that log data is contiguous up to scanned_lsn
+ in all non-corrupt log groups. */
+
+ if (scanned_lsn > *contiguous_lsn) {
+ *contiguous_lsn = scanned_lsn;
+ }
+ }
+
+ data_len = log_block_get_data_len(log_block);
+
+ if ((store_to_hash || (data_len == OS_FILE_LOG_BLOCK_SIZE))
+ && scanned_lsn + data_len > recv_sys->scanned_lsn
+ && (recv_sys->scanned_checkpoint_no > 0)
+ && (log_block_get_checkpoint_no(log_block)
+ < recv_sys->scanned_checkpoint_no)
+ && (recv_sys->scanned_checkpoint_no
+ - log_block_get_checkpoint_no(log_block)
+ > 0x80000000UL)) {
+
+ /* Garbage from a log buffer flush which was made
+ before the most recent database recovery */
+
+ finished = TRUE;
+#ifdef UNIV_LOG_DEBUG
+ /* This is not really an error, but currently
+ we stop here in the debug version: */
+
+ ut_error;
+#endif
+ break;
+ }
+
+ if (!recv_sys->parse_start_lsn
+ && (log_block_get_first_rec_group(log_block) > 0)) {
+
+ /* We found a point from which to start the parsing
+ of log records */
+
+ recv_sys->parse_start_lsn = scanned_lsn
+ + log_block_get_first_rec_group(log_block);
+ recv_sys->scanned_lsn = recv_sys->parse_start_lsn;
+ recv_sys->recovered_lsn = recv_sys->parse_start_lsn;
+ }
+
+ scanned_lsn += data_len;
+
+ if (scanned_lsn > recv_sys->scanned_lsn) {
+
+ /* We have found more entries. If this scan is
+ of startup type, we must initiate crash recovery
+ environment before parsing these log records. */
+
+#ifndef UNIV_HOTBACKUP
+ if (recv_log_scan_is_startup_type
+ && !recv_needed_recovery) {
+
+ fprintf(stderr,
+ "InnoDB: Log scan progressed"
+ " past the checkpoint lsn %llu\n",
+ recv_sys->scanned_lsn);
+ recv_init_crash_recovery();
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* We were able to find more log data: add it to the
+ parsing buffer if parse_start_lsn is already
+ non-zero */
+
+ if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE
+ >= RECV_PARSING_BUF_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: log parsing"
+ " buffer overflow."
+ " Recovery may have failed!\n");
+
+ recv_sys->found_corrupt_log = TRUE;
+
+#ifndef UNIV_HOTBACKUP
+ if (!srv_force_recovery) {
+ fputs("InnoDB: Set"
+ " innodb_force_recovery"
+ " to ignore this error.\n",
+ stderr);
+ ut_error;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ } else if (!recv_sys->found_corrupt_log) {
+ more_data = recv_sys_add_to_parsing_buf(
+ log_block, scanned_lsn);
+ }
+
+ recv_sys->scanned_lsn = scanned_lsn;
+ recv_sys->scanned_checkpoint_no
+ = log_block_get_checkpoint_no(log_block);
+ }
+
+ if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+ /* Log data for this group ends here */
+
+ finished = TRUE;
+ break;
+ } else {
+ log_block += OS_FILE_LOG_BLOCK_SIZE;
+ }
+ } while (log_block < buf + len && !finished);
+
+ *group_scanned_lsn = scanned_lsn;
+
+ if (recv_needed_recovery
+ || (recv_is_from_backup && !recv_is_making_a_backup)) {
+ recv_scan_print_counter++;
+
+ if (finished || (recv_scan_print_counter % 80 == 0)) {
+
+ fprintf(stderr,
+ "InnoDB: Doing recovery: scanned up to"
+ " log sequence number %llu\n",
+ *group_scanned_lsn);
+ }
+ }
+
+ if (more_data && !recv_sys->found_corrupt_log) {
+ /* Try to parse more log records */
+
+ recv_parse_log_recs(store_to_hash);
+
+#ifndef UNIV_HOTBACKUP
+ if (store_to_hash && mem_heap_get_size(recv_sys->heap)
+ > available_memory) {
+
+ /* Hash table of log records has grown too big:
+ empty it; FALSE means no ibuf operations
+ allowed, as we cannot add new records to the
+ log yet: they would be produced by ibuf
+ operations */
+
+ recv_apply_hashed_log_recs(FALSE);
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ if (recv_sys->recovered_offset > RECV_PARSING_BUF_SIZE / 4) {
+ /* Move parsing buffer data to the buffer start */
+
+ recv_sys_justify_left_parsing_buf();
+ }
+ }
+
+ return(finished);
+}
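+
+/* A small sketch of the checkpoint-number part of the staleness test in
+recv_scan_log_recs() above: a block whose stored checkpoint number is
+behind the most recently seen one by more than half the 32-bit range is
+treated as garbage left over from an earlier use of the log, while a
+small backwards difference is still accepted. */
+#include <stdio.h>
+
+static int
+sketch_block_is_stale(unsigned long latest_cp_no, unsigned long block_cp_no)
+{
+	return(latest_cp_no > 0
+	       && block_cp_no < latest_cp_no
+	       && latest_cp_no - block_cp_no > 0x80000000UL);
+}
+
+int
+main(void)
+{
+	/* A small backwards difference: the block is still accepted. */
+	printf("%d\n", sketch_block_is_stale(10, 8));
+
+	/* A difference of more than half the range: rejected. */
+	printf("%d\n", sketch_block_is_stale(0xF0000000UL, 5));
+
+	return(0);
+}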
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer. Parses
+and hashes the log records if new data is found. */
+static
+void
+recv_group_scan_log_recs(
+/*=====================*/
+ log_group_t* group, /*!< in: log group */
+ ib_uint64_t* contiguous_lsn, /*!< in/out: it is known that all log
+ groups contain contiguous log data up
+ to this lsn */
+ ib_uint64_t* group_scanned_lsn)/*!< out: scanning succeeded up to
+ this lsn */
+{
+ ibool finished;
+ ib_uint64_t start_lsn;
+ ib_uint64_t end_lsn;
+
+ finished = FALSE;
+
+ start_lsn = *contiguous_lsn;
+
+ while (!finished) {
+ end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+ log_group_read_log_seg(LOG_RECOVER, log_sys->buf,
+ group, start_lsn, end_lsn);
+
+ finished = recv_scan_log_recs(
+ (buf_pool->curr_size - recv_n_pool_free_frames)
+ * UNIV_PAGE_SIZE, TRUE, log_sys->buf, RECV_SCAN_SIZE,
+ start_lsn, contiguous_lsn, group_scanned_lsn);
+ start_lsn = end_lsn;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Scanned group %lu up to"
+ " log sequence number %llu\n",
+ (ulong) group->id,
+ *group_scanned_lsn);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/*******************************************************//**
+Initialize crash recovery environment. Can be called iff
+recv_needed_recovery == FALSE. */
+static
+void
+recv_init_crash_recovery(void)
+/*==========================*/
+{
+ ut_a(!recv_needed_recovery);
+ ut_a(!srv_buffer_pool_shm_is_reused);
+
+ recv_needed_recovery = TRUE;
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Database was not"
+ " shut down normally!\n"
+ "InnoDB: Starting crash recovery.\n");
+
+ fprintf(stderr,
+ "InnoDB: Reading tablespace information"
+ " from the .ibd files...\n");
+
+ fil_load_single_table_tablespaces();
+
+ /* If we are using the doublewrite method, we will
+ check if there are half-written pages in data files,
+ and restore them from the doublewrite buffer if
+ possible */
+
+ if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+ fprintf(stderr,
+ "InnoDB: Restoring possible"
+ " half-written data pages from"
+ " the doublewrite\n"
+ "InnoDB: buffer...\n");
+ trx_sys_doublewrite_init_or_restore_pages(TRUE);
+ }
+}
+
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+ ulint type, /*!< in: LOG_CHECKPOINT or
+ LOG_ARCHIVE */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn
+ if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn from
+ data files */
+ ib_uint64_t max_flushed_lsn)/*!< in: max flushed lsn from
+ data files */
+{
+ log_group_t* group;
+ log_group_t* max_cp_group;
+ log_group_t* up_to_date_group;
+ ulint max_cp_field;
+ ib_uint64_t checkpoint_lsn;
+ ib_uint64_t checkpoint_no;
+ ib_uint64_t old_scanned_lsn;
+ ib_uint64_t group_scanned_lsn;
+ ib_uint64_t contiguous_lsn;
+ ib_uint64_t archived_lsn;
+ byte* buf;
+ byte* log_hdr_buf;
+ byte log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE];
+ ulint err;
+
+ log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
+
+#ifdef UNIV_LOG_ARCHIVE
+ ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
+/** TRUE when recovering from a checkpoint */
+# define TYPE_CHECKPOINT (type == LOG_CHECKPOINT)
+/** Recover up to this log sequence number */
+# define LIMIT_LSN limit_lsn
+#else /* UNIV_LOG_ARCHIVE */
+/** TRUE when recovering from a checkpoint */
+# define TYPE_CHECKPOINT 1
+/** Recover up to this log sequence number */
+# define LIMIT_LSN IB_ULONGLONG_MAX
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (TYPE_CHECKPOINT) {
+ recv_sys_create();
+ recv_sys_init(buf_pool_get_curr_size());
+ }
+
+ if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
+ fprintf(stderr,
+ "InnoDB: The user has set SRV_FORCE_NO_LOG_REDO on\n");
+ fprintf(stderr,
+ "InnoDB: Skipping log redo\n");
+
+ return(DB_SUCCESS);
+ }
+
+ recv_recovery_on = TRUE;
+
+ recv_sys->limit_lsn = LIMIT_LSN;
+
+ mutex_enter(&(log_sys->mutex));
+
+ /* Look for the latest checkpoint from any of the log groups */
+
+ err = recv_find_max_checkpoint(&max_cp_group, &max_cp_field);
+
+ if (err != DB_SUCCESS) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(err);
+ }
+
+ log_group_read_checkpoint_info(max_cp_group, max_cp_field);
+
+ buf = log_sys->checkpoint_buf;
+
+ checkpoint_lsn = mach_read_ull(buf + LOG_CHECKPOINT_LSN);
+ checkpoint_no = mach_read_ull(buf + LOG_CHECKPOINT_NO);
+ archived_lsn = mach_read_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN);
+
+ /* Read the first log file header to print a note if this is
+ a recovery from a restored InnoDB Hot Backup */
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0,
+ 0, 0, LOG_FILE_HDR_SIZE,
+ log_hdr_buf, max_cp_group);
+
+ if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ (byte*)"ibbackup", (sizeof "ibbackup") - 1)) {
+ /* This log file was created by ibbackup --restore: print
+ a note to the user about it */
+
+ fprintf(stderr,
+ "InnoDB: The log file was created by"
+ " ibbackup --apply-log at\n"
+ "InnoDB: %s\n",
+ log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP);
+ fprintf(stderr,
+ "InnoDB: NOTE: the following crash recovery"
+ " is part of a normal restore.\n");
+
+ /* Wipe over the label now */
+
+ memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+ ' ', 4);
+ /* Write to the log file to wipe over the label */
+ fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE,
+ max_cp_group->space_id, 0,
+ 0, 0, OS_FILE_LOG_BLOCK_SIZE,
+ log_hdr_buf, max_cp_group);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ log_checkpoint_get_nth_group_info(buf, group->id,
+ &(group->archived_file_no),
+ &(group->archived_offset));
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (TYPE_CHECKPOINT) {
+ /* Start reading the log groups from the checkpoint lsn up. The
+ variable contiguous_lsn contains an lsn up to which the log is
+ known to be contiguously written to all log groups. */
+
+ recv_sys->parse_start_lsn = checkpoint_lsn;
+ recv_sys->scanned_lsn = checkpoint_lsn;
+ recv_sys->scanned_checkpoint_no = 0;
+ recv_sys->recovered_lsn = checkpoint_lsn;
+
+ srv_start_lsn = checkpoint_lsn;
+ }
+
+ contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn,
+ OS_FILE_LOG_BLOCK_SIZE);
+ if (TYPE_CHECKPOINT) {
+ up_to_date_group = max_cp_group;
+#ifdef UNIV_LOG_ARCHIVE
+ } else {
+ ulint capacity;
+
+ /* Try to recover the remaining part from logs: first from
+ the logs of the archived group */
+
+ group = recv_sys->archive_group;
+ capacity = log_group_get_capacity(group);
+
+ if (recv_sys->scanned_lsn > checkpoint_lsn + capacity
+ || checkpoint_lsn > recv_sys->scanned_lsn + capacity) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* The group does not contain enough log: probably
+ an archived log file was missing or corrupt */
+
+ return(DB_ERROR);
+ }
+
+ recv_group_scan_log_recs(group, &contiguous_lsn,
+ &group_scanned_lsn);
+ if (recv_sys->scanned_lsn < checkpoint_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* The group did not contain enough log: an archived
+ log file was missing or invalid, or the log group
+ was corrupt */
+
+ return(DB_ERROR);
+ }
+
+ group->scanned_lsn = group_scanned_lsn;
+ up_to_date_group = group;
+#endif /* UNIV_LOG_ARCHIVE */
+ }
+
+ ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+#ifdef UNIV_LOG_ARCHIVE
+ if ((type == LOG_ARCHIVE) && (group == recv_sys->archive_group)) {
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* Set the flag to publish that we are doing startup scan. */
+ recv_log_scan_is_startup_type = TYPE_CHECKPOINT;
+ while (group) {
+ old_scanned_lsn = recv_sys->scanned_lsn;
+
+ recv_group_scan_log_recs(group, &contiguous_lsn,
+ &group_scanned_lsn);
+ group->scanned_lsn = group_scanned_lsn;
+
+ if (old_scanned_lsn < group_scanned_lsn) {
+ /* We found a more up-to-date group */
+
+ up_to_date_group = group;
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if ((type == LOG_ARCHIVE)
+ && (group == recv_sys->archive_group)) {
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ /* Done with startup scan. Clear the flag. */
+ recv_log_scan_is_startup_type = FALSE;
+ if (TYPE_CHECKPOINT) {
+		/* NOTE: we always do a 'recovery' at startup, but only if
+		something is wrong do we print a message to the user
+		about recovery: */
+
+ if (checkpoint_lsn != max_flushed_lsn
+ || checkpoint_lsn != min_flushed_lsn) {
+
+ if (checkpoint_lsn < max_flushed_lsn) {
+ fprintf(stderr,
+ "InnoDB: #########################"
+ "#################################\n"
+ "InnoDB: "
+ "WARNING!\n"
+ "InnoDB: The log sequence number"
+ " in ibdata files is higher\n"
+ "InnoDB: than the log sequence number"
+ " in the ib_logfiles! Are you sure\n"
+ "InnoDB: you are using the right"
+ " ib_logfiles to start up"
+ " the database?\n"
+ "InnoDB: Log sequence number in"
+ " ib_logfiles is %llu, log\n"
+ "InnoDB: sequence numbers stamped"
+ " to ibdata file headers are between\n"
+ "InnoDB: %llu and %llu.\n"
+ "InnoDB: #########################"
+ "#################################\n",
+ checkpoint_lsn,
+ min_flushed_lsn,
+ max_flushed_lsn);
+ }
+
+ if (!recv_needed_recovery) {
+ fprintf(stderr,
+ "InnoDB: The log sequence number"
+ " in ibdata files does not match\n"
+ "InnoDB: the log sequence number"
+ " in the ib_logfiles!\n");
+ recv_init_crash_recovery();
+ }
+ }
+
+ if (!recv_needed_recovery) {
+ /* Init the doublewrite buffer memory structure */
+ trx_sys_doublewrite_init_or_restore_pages(FALSE);
+ }
+ }
+
+ /* We currently have only one log group */
+ if (group_scanned_lsn < checkpoint_lsn) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: We were only able to scan the log"
+ " up to\n"
+ "InnoDB: %llu, but a checkpoint was at %llu.\n"
+ "InnoDB: It is possible that"
+ " the database is now corrupt!\n",
+ group_scanned_lsn,
+ checkpoint_lsn);
+ }
+
+ if (group_scanned_lsn < recv_max_page_lsn) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: ERROR: We were only able to scan the log"
+ " up to %llu\n"
+			"InnoDB: but a database page had an lsn %llu."
+ " It is possible that the\n"
+ "InnoDB: database is now corrupt!\n",
+ group_scanned_lsn,
+ recv_max_page_lsn);
+ }
+
+ if (recv_sys->recovered_lsn < checkpoint_lsn) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (recv_sys->recovered_lsn >= LIMIT_LSN) {
+
+ return(DB_SUCCESS);
+ }
+
+ ut_error;
+
+ return(DB_ERROR);
+ }
+
+ /* Synchronize the uncorrupted log groups to the most up-to-date log
+ group; we also copy checkpoint info to groups */
+
+ log_sys->next_checkpoint_lsn = checkpoint_lsn;
+ log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_sys->archived_lsn = archived_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ recv_synchronize_groups(up_to_date_group);
+
+ if (!recv_needed_recovery) {
+ ut_a(checkpoint_lsn == recv_sys->recovered_lsn);
+ } else {
+ srv_start_lsn = recv_sys->recovered_lsn;
+ }
+
+ log_sys->lsn = recv_sys->recovered_lsn;
+
+ ut_memcpy(log_sys->buf, recv_sys->last_block, OS_FILE_LOG_BLOCK_SIZE);
+
+ log_sys->buf_free = (ulint) log_sys->lsn % OS_FILE_LOG_BLOCK_SIZE;
+ log_sys->buf_next_to_write = log_sys->buf_free;
+ log_sys->written_to_some_lsn = log_sys->lsn;
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->last_checkpoint_lsn = checkpoint_lsn;
+
+ log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (archived_lsn == IB_ULONGLONG_MAX) {
+
+ log_sys->archiving_state = LOG_ARCH_OFF;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_enter(&(recv_sys->mutex));
+
+ recv_sys->apply_log_recs = TRUE;
+
+ mutex_exit(&(recv_sys->mutex));
+
+ mutex_exit(&(log_sys->mutex));
+
+ recv_lsn_checks_on = TRUE;
+
+ /* The database is now ready to start almost normal processing of user
+ transactions: transaction rollbacks and the application of the log
+ records in the hash table can be run in background. */
+
+ return(DB_SUCCESS);
+
+#undef TYPE_CHECKPOINT
+#undef LIMIT_LSN
+}
+
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void)
+/*======================================*/
+{
+ /* Apply the hashed log records to the respective file pages */
+
+ if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+ recv_apply_hashed_log_recs(TRUE);
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Log records applied to the database\n");
+ }
+#endif /* UNIV_DEBUG */
+
+ if (recv_needed_recovery && srv_recovery_stats) {
+ ulint i;
+
+ fprintf(stderr,
+			"InnoDB: Applying log records has been completed. Recovery statistics follow.\n");
+
+ fprintf(stderr,
+ "============================================================\n"
+ "-------------------\n"
+ "RECOVERY STATISTICS\n"
+ "-------------------\n");
+ fprintf(stderr,
+ "Recovery time: %g sec. (%lu turns)\n",
+ difftime(time(NULL), recv_sys->stats_recv_start_time),
+ recv_sys->stats_recv_turns);
+
+ fprintf(stderr,
+ "\n"
+ "Data page IO statistics\n"
+ " Requested pages: %lu\n"
+ " Read pages: %lu\n"
+ " Written pages: %lu\n"
+ " (Dirty blocks): %lu\n",
+ recv_sys->stats_read_requested_pages,
+ recv_sys->stats_read_io_pages,
+ recv_sys->stats_write_io_pages,
+ UT_LIST_GET_LEN(buf_pool->flush_list));
+
+ fprintf(stderr,
+ " Grouping IO [times]:\n"
+ "\tnumber of pages,\n"
+			"\t\tread request neighbors (in chunks of %d pages),\n"
+ "\t\t\tcombined read IO,\n"
+ "\t\t\t\tcombined write IO\n",
+ RECV_READ_AHEAD_AREA);
+ for (i = 0; i < ut_max(RECV_READ_AHEAD_AREA,
+ OS_AIO_MERGE_N_CONSECUTIVE); i++) {
+ fprintf(stderr,
+ "\t%3lu,\t%lu,\t%lu,\t%lu\n", i + 1,
+ (i < RECV_READ_AHEAD_AREA) ?
+ recv_sys->stats_read_in_area[i] : 0,
+ (i < OS_AIO_MERGE_N_CONSECUTIVE) ?
+ recv_sys->stats_read_io_consecutive[i] : 0,
+ (i < OS_AIO_MERGE_N_CONSECUTIVE) ?
+ recv_sys->stats_write_io_consecutive[i] : 0);
+ }
+
+ fprintf(stderr,
+ "\n"
+ "Recovery process statistics\n"
+ " Checked pages by doublewrite buffer: %lu\n"
+ " Overwritten pages from doublewrite: %lu\n"
+ " Recovered pages by io_thread: %lu\n"
+ " Recovered pages by main thread: %lu\n"
+ " Parsed log records to apply: %lu\n"
+ " Sum of the length: %lu\n"
+ " Applied log records: %lu\n"
+ " Sum of the length: %lu\n"
+			"  Pages which are already new enough: %lu (may be inaccurate if turns > 1)\n"
+ " Oldest page's LSN: %llu\n"
+ " Newest page's LSN: %llu\n",
+ recv_sys->stats_doublewrite_check_pages,
+ recv_sys->stats_doublewrite_overwrite_pages,
+ recv_sys->stats_recover_pages_with_read,
+ recv_sys->stats_recover_pages_without_read,
+ recv_sys->stats_log_recs,
+ recv_sys->stats_log_len_sum,
+ recv_sys->stats_applied_log_recs,
+ recv_sys->stats_applied_log_len_sum,
+ recv_sys->stats_pages_already_new,
+ recv_sys->stats_oldest_modified_lsn,
+ recv_sys->stats_newest_modified_lsn);
+
+ fprintf(stderr,
+ "============================================================\n");
+ }
+
+ if (recv_needed_recovery) {
+ trx_sys_print_mysql_master_log_pos();
+ trx_sys_print_mysql_binlog_offset();
+ }
+
+ if (recv_sys->found_corrupt_log) {
+
+ fprintf(stderr,
+ "InnoDB: WARNING: the log file may have been"
+ " corrupt and it\n"
+ "InnoDB: is possible that the log scan or parsing"
+ " did not proceed\n"
+ "InnoDB: far enough in recovery. Please run"
+ " CHECK TABLE\n"
+ "InnoDB: on your InnoDB tables to check that"
+ " they are ok!\n"
+ "InnoDB: It may be safest to recover your"
+ " InnoDB database from\n"
+ "InnoDB: a backup!\n");
+ }
+
+ /* Free the resources of the recovery system */
+
+ recv_recovery_on = FALSE;
+
+#ifndef UNIV_LOG_DEBUG
+ recv_sys_debug_free();
+#endif
+ /* Roll back any recovered data dictionary transactions, so
+ that the data dictionary tables will be free of any locks.
+ The data dictionary latch should guarantee that there is at
+ most one data dictionary transaction active at a time. */
+ trx_rollback_or_clean_recovered(FALSE);
+}
+
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void)
+/*===============================*/
+{
+ int i;
+
+#ifdef UNIV_SYNC_DEBUG
+ /* Wait for a while so that created threads have time to suspend
+ themselves before we switch the latching order checks on */
+ os_thread_sleep(1000000);
+
+ /* Switch latching order checks on in sync0sync.c */
+ sync_order_checks_on = TRUE;
+#endif
+ /* Drop partially created indexes. */
+ row_merge_drop_temp_indexes();
+ /* Drop temporary tables. */
+ row_mysql_drop_temp_tables();
+
+ if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+ /* Rollback the uncommitted transactions which have no user
+ session */
+
+ os_thread_create(trx_rollback_or_clean_all_recovered,
+ (void *)&i, NULL);
+ }
+}
+
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+ ib_uint64_t lsn, /*!< in: reset to this lsn
+ rounded up to be divisible by
+ OS_FILE_LOG_BLOCK_SIZE, after
+ which we add
+ LOG_BLOCK_HDR_SIZE */
+#ifdef UNIV_LOG_ARCHIVE
+ ulint arch_log_no, /*!< in: next archived log file number */
+#endif /* UNIV_LOG_ARCHIVE */
+ ibool new_logs_created)/*!< in: TRUE if resetting logs
+ is done at the log creation;
+ FALSE if it is done after
+ archive recovery */
+{
+ log_group_t* group;
+
+ ut_ad(mutex_own(&(log_sys->mutex)));
+
+ log_sys->lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE);
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ group->lsn = log_sys->lsn;
+ group->lsn_offset = LOG_FILE_HDR_SIZE;
+#ifdef UNIV_LOG_ARCHIVE
+ group->archived_file_no = arch_log_no;
+ group->archived_offset = 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (!new_logs_created) {
+ recv_truncate_group(group, group->lsn, group->lsn,
+ group->lsn, group->lsn);
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ log_sys->buf_next_to_write = 0;
+ log_sys->written_to_some_lsn = log_sys->lsn;
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->next_checkpoint_no = 0;
+ log_sys->last_checkpoint_lsn = 0;
+
+#ifdef UNIV_LOG_ARCHIVE
+ log_sys->archived_lsn = log_sys->lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+ log_block_init(log_sys->buf, log_sys->lsn);
+ log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
+
+ log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
+ log_sys->lsn += LOG_BLOCK_HDR_SIZE;
+
+ mutex_exit(&(log_sys->mutex));
+
+ /* Reset the checkpoint fields in logs */
+
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ mutex_enter(&(log_sys->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+ const char* log_dir, /*!< in: log file directory path */
+ ulint n_log_files, /*!< in: number of log files */
+ ulint log_file_size, /*!< in: log file size */
+ ib_uint64_t lsn) /*!< in: new start lsn, must be
+ divisible by OS_FILE_LOG_BLOCK_SIZE */
+{
+ os_file_t log_file;
+ ibool success;
+ byte* buf;
+ ulint i;
+ ulint log_dir_len;
+ char name[5000];
+ static const char ib_logfile_basename[] = "ib_logfile";
+
+ log_dir_len = strlen(log_dir);
+ /* The full path name of an ib_logfile consists of the log dir path
+ + basename + number. This must fit in the name buffer. */
+ ut_a(log_dir_len + strlen(ib_logfile_basename) + 11 < sizeof(name));
+
+ buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+ memset(buf, '\0', LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+
+ for (i = 0; i < n_log_files; i++) {
+
+ sprintf(name, "%s%s%lu", log_dir,
+ ib_logfile_basename, (ulong)i);
+
+ log_file = os_file_create_simple(name, OS_FILE_CREATE,
+ OS_FILE_READ_WRITE, &success);
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Cannot create %s. Check that"
+ " the file does not exist yet.\n", name);
+
+ exit(1);
+ }
+
+ fprintf(stderr,
+ "Setting log file size to %lu %lu\n",
+ (ulong) ut_get_high32(log_file_size),
+ (ulong) log_file_size & 0xFFFFFFFFUL);
+
+ success = os_file_set_size(name, log_file,
+ log_file_size & 0xFFFFFFFFUL,
+ ut_get_high32(log_file_size));
+
+ if (!success) {
+ fprintf(stderr,
+ "InnoDB: Cannot set %s size to %lu %lu\n",
+ name, (ulong) ut_get_high32(log_file_size),
+ (ulong) (log_file_size & 0xFFFFFFFFUL));
+ exit(1);
+ }
+
+ os_file_flush(log_file);
+ os_file_close(log_file);
+ }
+
+ /* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */
+
+ log_reset_first_header_and_checkpoint(buf, lsn);
+
+ log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn);
+ log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE,
+ LOG_BLOCK_HDR_SIZE);
+ sprintf(name, "%s%s%lu", log_dir, ib_logfile_basename, (ulong)0);
+
+ log_file = os_file_create_simple(name, OS_FILE_OPEN,
+ OS_FILE_READ_WRITE, &success);
+ if (!success) {
+ fprintf(stderr, "InnoDB: Cannot open %s.\n", name);
+
+ exit(1);
+ }
+
+ os_file_write(name, log_file, buf, 0, 0,
+ LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+ os_file_flush(log_file);
+ os_file_close(log_file);
+
+ ut_free(buf);
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Reads from the archive of a log group and performs recovery.
+@return TRUE if no more complete consistent archive files */
+static
+ibool
+log_group_recover_from_archive_file(
+/*================================*/
+ log_group_t* group) /*!< in: log group */
+{
+ os_file_t file_handle;
+ ib_uint64_t start_lsn;
+ ib_uint64_t file_end_lsn;
+ ib_uint64_t dummy_lsn;
+ ib_uint64_t scanned_lsn;
+ ulint len;
+ ibool ret;
+ byte* buf;
+ ulint read_offset;
+ ulint file_size;
+ ulint file_size_high;
+ int input_char;
+ char name[10000];
+
+ ut_a(0);
+
+try_open_again:
+ buf = log_sys->buf;
+
+ /* Add the file to the archive file space; open the file */
+
+ log_archived_file_name_gen(name, group->id, group->archived_file_no);
+
+ file_handle = os_file_create(name, OS_FILE_OPEN,
+ OS_FILE_LOG, OS_FILE_AIO, &ret);
+
+ if (ret == FALSE) {
+ask_again:
+ fprintf(stderr,
+ "InnoDB: Do you want to copy additional"
+ " archived log files\n"
+ "InnoDB: to the directory\n");
+ fprintf(stderr,
+ "InnoDB: or were these all the files needed"
+ " in recovery?\n");
+ fprintf(stderr,
+ "InnoDB: (Y == copy more files; N == this is all)?");
+
+ input_char = getchar();
+
+ if (input_char == (int) 'N') {
+
+ return(TRUE);
+ } else if (input_char == (int) 'Y') {
+
+ goto try_open_again;
+ } else {
+ goto ask_again;
+ }
+ }
+
+ ret = os_file_get_size(file_handle, &file_size, &file_size_high);
+ ut_a(ret);
+
+ ut_a(file_size_high == 0);
+
+ fprintf(stderr, "InnoDB: Opened archived log file %s\n", name);
+
+ ret = os_file_close(file_handle);
+
+ if (file_size < LOG_FILE_HDR_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Archive file header incomplete %s\n", name);
+
+ return(TRUE);
+ }
+
+ ut_a(ret);
+
+ /* Add the archive file as a node to the space */
+
+ fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE,
+ group->archive_space_id, FALSE);
+#if RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE
+# error "RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE"
+#endif
+
+ /* Read the archive file header */
+ fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, 0, 0,
+ LOG_FILE_HDR_SIZE, buf, NULL);
+
+ /* Check if the archive file header is consistent */
+
+ if (mach_read_from_4(buf + LOG_GROUP_ID) != group->id
+ || mach_read_from_4(buf + LOG_FILE_NO)
+ != group->archived_file_no) {
+ fprintf(stderr,
+ "InnoDB: Archive file header inconsistent %s\n", name);
+
+ return(TRUE);
+ }
+
+ if (!mach_read_from_4(buf + LOG_FILE_ARCH_COMPLETED)) {
+ fprintf(stderr,
+ "InnoDB: Archive file not completely written %s\n",
+ name);
+
+ return(TRUE);
+ }
+
+ start_lsn = mach_read_ull(buf + LOG_FILE_START_LSN);
+ file_end_lsn = mach_read_ull(buf + LOG_FILE_END_LSN);
+
+ if (!recv_sys->scanned_lsn) {
+
+ if (recv_sys->parse_start_lsn < start_lsn) {
+ fprintf(stderr,
+ "InnoDB: Archive log file %s"
+ " starts from too big a lsn\n",
+ name);
+ return(TRUE);
+ }
+
+ recv_sys->scanned_lsn = start_lsn;
+ }
+
+ if (recv_sys->scanned_lsn != start_lsn) {
+
+ fprintf(stderr,
+ "InnoDB: Archive log file %s starts from"
+ " a wrong lsn\n",
+ name);
+ return(TRUE);
+ }
+
+ read_offset = LOG_FILE_HDR_SIZE;
+
+ for (;;) {
+ len = RECV_SCAN_SIZE;
+
+ if (read_offset + len > file_size) {
+ len = ut_calc_align_down(file_size - read_offset,
+ OS_FILE_LOG_BLOCK_SIZE);
+ }
+
+ if (len == 0) {
+
+ break;
+ }
+
+#ifdef UNIV_DEBUG
+ if (log_debug_writes) {
+ fprintf(stderr,
+ "InnoDB: Archive read starting at"
+ " lsn %llu, len %lu from file %s\n",
+ start_lsn,
+ (ulong) len, name);
+ }
+#endif /* UNIV_DEBUG */
+
+ fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE,
+ group->archive_space_id, read_offset / UNIV_PAGE_SIZE,
+ read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
+
+ ret = recv_scan_log_recs(
+ (buf_pool->n_frames - recv_n_pool_free_frames)
+ * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn,
+ &dummy_lsn, &scanned_lsn);
+
+ if (scanned_lsn == file_end_lsn) {
+
+ return(FALSE);
+ }
+
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Archive log file %s"
+ " does not scan right\n",
+ name);
+ return(TRUE);
+ }
+
+ read_offset += len;
+ start_lsn += len;
+
+ ut_ad(start_lsn == scanned_lsn);
+ }
+
+ return(FALSE);
+}
+
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+ ib_uint64_t min_flushed_lsn,/*!< in: min flushed lsn field from the
+ data files */
+ ib_uint64_t limit_lsn, /*!< in: recover up to this lsn if
+ possible */
+ ulint first_log_no) /*!< in: number of the first archived
+ log file to use in the recovery; the
+ file will be searched from
+ INNOBASE_LOG_ARCH_DIR specified in
+ server config file */
+{
+ log_group_t* group;
+ ulint group_id;
+ ulint trunc_len;
+ ibool ret;
+ ulint err;
+
+ ut_a(0);
+
+ recv_sys_create();
+ recv_sys_init(buf_pool_get_curr_size());
+
+ recv_recovery_on = TRUE;
+ recv_recovery_from_backup_on = TRUE;
+
+ recv_sys->limit_lsn = limit_lsn;
+
+ group_id = 0;
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+ while (group) {
+ if (group->id == group_id) {
+
+ break;
+ }
+
+ group = UT_LIST_GET_NEXT(log_groups, group);
+ }
+
+ if (!group) {
+ fprintf(stderr,
+ "InnoDB: There is no log group defined with id %lu!\n",
+ (ulong) group_id);
+ return(DB_ERROR);
+ }
+
+ group->archived_file_no = first_log_no;
+
+ recv_sys->parse_start_lsn = min_flushed_lsn;
+
+ recv_sys->scanned_lsn = 0;
+ recv_sys->scanned_checkpoint_no = 0;
+ recv_sys->recovered_lsn = recv_sys->parse_start_lsn;
+
+ recv_sys->archive_group = group;
+
+ ret = FALSE;
+
+ mutex_enter(&(log_sys->mutex));
+
+ while (!ret) {
+ ret = log_group_recover_from_archive_file(group);
+
+ /* Close and truncate a possible processed archive file
+ from the file space */
+
+ trunc_len = UNIV_PAGE_SIZE
+ * fil_space_get_size(group->archive_space_id);
+ if (trunc_len > 0) {
+ fil_space_truncate_start(group->archive_space_id,
+ trunc_len);
+ }
+
+ group->archived_file_no++;
+ }
+
+ if (recv_sys->recovered_lsn < limit_lsn) {
+
+ if (!recv_sys->scanned_lsn) {
+
+ recv_sys->scanned_lsn = recv_sys->parse_start_lsn;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ err = recv_recovery_from_checkpoint_start(LOG_ARCHIVE,
+ limit_lsn,
+ IB_ULONGLONG_MAX,
+ IB_ULONGLONG_MAX);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+ }
+
+ if (limit_lsn != IB_ULONGLONG_MAX) {
+
+ recv_apply_hashed_log_recs(FALSE);
+
+ recv_reset_logs(recv_sys->recovered_lsn, 0, FALSE);
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************//**
+Completes recovery from archive. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void)
+/*===================================*/
+{
+ recv_recovery_from_checkpoint_finish();
+
+ recv_recovery_from_backup_on = FALSE;
+}
+#endif /* UNIV_LOG_ARCHIVE */
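
For orientation, the LSN arithmetic in recv_reset_logs() above can be read
on its own: the new LSN is first rounded up to a log-block boundary and then
advanced past the block header, so the first record group starts right after
the header. A minimal standalone sketch of that calculation (not part of the
patch; the 512-byte block size and 12-byte header are illustrative stand-ins
for OS_FILE_LOG_BLOCK_SIZE and LOG_BLOCK_HDR_SIZE):

#include <stdio.h>
#include <stdint.h>

#define BLOCK_SIZE	512ULL	/* stand-in for OS_FILE_LOG_BLOCK_SIZE */
#define BLOCK_HDR_SIZE	12ULL	/* stand-in for LOG_BLOCK_HDR_SIZE */

/* Rounds n up to the next multiple of align (a power of two). */
static uint64_t
align_up(uint64_t n, uint64_t align)
{
	return((n + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t	lsn = 8204;	/* arbitrary example LSN */
	uint64_t	new_lsn;

	/* The reset LSN starts at a log block boundary ... */
	new_lsn = align_up(lsn, BLOCK_SIZE);

	/* ... and the first record goes right after the block header. */
	new_lsn += BLOCK_HDR_SIZE;

	printf("lsn %llu -> reset lsn %llu\n",
	       (unsigned long long) lsn, (unsigned long long) new_lsn);

	return(0);
}
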
diff --git a/storage/xtradb/mach/mach0data.c b/storage/xtradb/mach/mach0data.c
new file mode 100644
index 00000000000..e030ce9aadf
--- /dev/null
+++ b/storage/xtradb/mach/mach0data.c
@@ -0,0 +1,134 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file mach/mach0data.c
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "mach0data.h"
+
+#ifdef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ ulint* val) /*!< out: read value (< 2^32) */
+{
+ ulint flag;
+
+ ut_ad(ptr && end_ptr && val);
+
+ if (ptr >= end_ptr) {
+
+ return(NULL);
+ }
+
+ flag = mach_read_from_1(ptr);
+
+ if (flag < 0x80UL) {
+ *val = flag;
+ return(ptr + 1);
+
+ } else if (flag < 0xC0UL) {
+ if (end_ptr < ptr + 2) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_2(ptr) & 0x7FFFUL;
+
+ return(ptr + 2);
+
+ } else if (flag < 0xE0UL) {
+ if (end_ptr < ptr + 3) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_3(ptr) & 0x3FFFFFUL;
+
+ return(ptr + 3);
+ } else if (flag < 0xF0UL) {
+ if (end_ptr < ptr + 4) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL;
+
+ return(ptr + 4);
+ } else {
+ ut_ad(flag == 0xF0UL);
+
+ if (end_ptr < ptr + 5) {
+ return(NULL);
+ }
+
+ *val = mach_read_from_4(ptr + 1);
+ return(ptr + 5);
+ }
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form if the log record fully contains it.
+@return pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_dulint_parse_compressed(
+/*=========================*/
+ byte* ptr, /*!< in: pointer to buffer from where to read */
+ byte* end_ptr,/*!< in: pointer to end of the buffer */
+ dulint* val) /*!< out: read value */
+{
+ ulint high;
+ ulint low;
+ ulint size;
+
+ ut_ad(ptr && end_ptr && val);
+
+ if (end_ptr < ptr + 5) {
+
+ return(NULL);
+ }
+
+ high = mach_read_compressed(ptr);
+
+ size = mach_get_compressed_size(high);
+
+ ptr += size;
+
+ if (end_ptr < ptr + 4) {
+
+ return(NULL);
+ }
+
+ low = mach_read_from_4(ptr);
+
+ *val = ut_dulint_create(high, low);
+
+ return(ptr + 4);
+}
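
The format decoded by mach_parse_compressed() above is a 1- to 5-byte
big-endian encoding selected by the high bits of the first byte: values
below 2^7 take one byte, and the prefixes 10, 110, 1110 and the 0xF0 escape
select two, three, four and five bytes respectively. A standalone decoder
sketch without the end-of-buffer checks of the real function (not part of
the patch; the sample buffer contents are made up for the example):

#include <stdio.h>
#include <stddef.h>

/* Decodes one compressed value from buf (assumed complete); returns the
number of bytes consumed and stores the value in *val. */
static size_t
decode_compressed(const unsigned char* buf, unsigned long* val)
{
	unsigned long	flag = buf[0];

	if (flag < 0x80UL) {			/* 0xxxxxxx */
		*val = flag;
		return(1);
	} else if (flag < 0xC0UL) {		/* 10xxxxxx + 1 byte */
		*val = ((flag << 8) | buf[1]) & 0x7FFFUL;
		return(2);
	} else if (flag < 0xE0UL) {		/* 110xxxxx + 2 bytes */
		*val = ((flag << 16) | ((unsigned long) buf[1] << 8)
			| buf[2]) & 0x3FFFFFUL;
		return(3);
	} else if (flag < 0xF0UL) {		/* 1110xxxx + 3 bytes */
		*val = ((flag << 24) | ((unsigned long) buf[1] << 16)
			| ((unsigned long) buf[2] << 8) | buf[3])
			& 0x1FFFFFFFUL;
		return(4);
	} else {				/* 0xF0 + 4 full-value bytes */
		*val = ((unsigned long) buf[1] << 24)
			| ((unsigned long) buf[2] << 16)
			| ((unsigned long) buf[3] << 8) | buf[4];
		return(5);
	}
}

int
main(void)
{
	/* Encodes 5, then 256, then 0x12345678. */
	const unsigned char	buf[] = {0x05, 0x81, 0x00,
					 0xF0, 0x12, 0x34, 0x56, 0x78};
	size_t			pos = 0;

	while (pos < sizeof(buf)) {
		unsigned long	val;

		pos += decode_compressed(buf + pos, &val);
		printf("%lu\n", val);
	}

	return(0);
}
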
diff --git a/storage/xtradb/mem/mem0dbg.c b/storage/xtradb/mem/mem0dbg.c
new file mode 100644
index 00000000000..1cd2ff15bab
--- /dev/null
+++ b/storage/xtradb/mem/mem0dbg.c
@@ -0,0 +1,1041 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0dbg.c
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which, in the debug version, protects the hash table
+containing the list of live memory heaps, and also the global
+variables below. */
+UNIV_INTERN mutex_t mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
+/* The following variables contain information about the
+extent of memory allocations. Only used in the debug version.
+Protected by mem_hash_mutex above. */
+
+static ulint mem_n_created_heaps = 0;
+static ulint mem_n_allocations = 0;
+static ulint mem_total_allocated_memory = 0;
+UNIV_INTERN ulint mem_current_allocated_memory = 0;
+static ulint mem_max_allocated_memory = 0;
+# ifndef UNIV_HOTBACKUP
+static ulint mem_last_print_info = 0;
+static ibool mem_hash_initialized = FALSE;
+# endif /* !UNIV_HOTBACKUP */
+
+/* Size of the hash table for memory management tracking */
+#define MEM_HASH_SIZE 997
+
+/* The node of the list containing currently allocated memory heaps */
+
+typedef struct mem_hash_node_struct mem_hash_node_t;
+struct mem_hash_node_struct {
+ UT_LIST_NODE_T(mem_hash_node_t)
+ list; /*!< hash list node */
+ mem_heap_t* heap; /*!< memory heap */
+ const char* file_name;/*!< file where heap was created */
+ ulint line; /*!< file line of creation */
+ ulint nth_heap;/*!< this is the nth heap created */
+ UT_LIST_NODE_T(mem_hash_node_t)
+ all_list;/*!< list of all created heaps */
+};
+
+typedef UT_LIST_BASE_NODE_T(mem_hash_node_t) mem_hash_cell_t;
+
+/* The hash table of allocated heaps */
+static mem_hash_cell_t mem_hash_table[MEM_HASH_SIZE];
+
+/* The base node of the list of all allocated heaps */
+static mem_hash_cell_t mem_all_list_base;
+
+
+
+UNIV_INLINE
+mem_hash_cell_t*
+mem_hash_get_nth_cell(ulint i);
+
+/* Accessor function for the hash table. Returns a pointer to the
+table cell. */
+UNIV_INLINE
+mem_hash_cell_t*
+mem_hash_get_nth_cell(ulint i)
+{
+ ut_a(i < MEM_HASH_SIZE);
+
+ return(&(mem_hash_table[i]));
+}
+
+/* Accessor functions for a memory field in the debug version */
+UNIV_INTERN
+void
+mem_field_header_set_len(byte* field, ulint len)
+{
+ mach_write_to_4(field - 2 * sizeof(ulint), len);
+}
+
+UNIV_INTERN
+ulint
+mem_field_header_get_len(byte* field)
+{
+ return(mach_read_from_4(field - 2 * sizeof(ulint)));
+}
+
+UNIV_INTERN
+void
+mem_field_header_set_check(byte* field, ulint check)
+{
+ mach_write_to_4(field - sizeof(ulint), check);
+}
+
+UNIV_INTERN
+ulint
+mem_field_header_get_check(byte* field)
+{
+ return(mach_read_from_4(field - sizeof(ulint)));
+}
+
+UNIV_INTERN
+void
+mem_field_trailer_set_check(byte* field, ulint check)
+{
+ mach_write_to_4(field + mem_field_header_get_len(field), check);
+}
+
+UNIV_INTERN
+ulint
+mem_field_trailer_get_check(byte* field)
+{
+ return(mach_read_from_4(field
+ + mem_field_header_get_len(field)));
+}
+#endif /* UNIV_MEM_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Initializes the memory system. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+ ulint size) /*!< in: common pool size in bytes */
+{
+#ifdef UNIV_MEM_DEBUG
+
+ ulint i;
+
+ /* Initialize the hash table */
+ ut_a(FALSE == mem_hash_initialized);
+
+ mutex_create(&mem_hash_mutex, SYNC_MEM_HASH);
+
+ for (i = 0; i < MEM_HASH_SIZE; i++) {
+ UT_LIST_INIT(*mem_hash_get_nth_cell(i));
+ }
+
+ UT_LIST_INIT(mem_all_list_base);
+
+ mem_hash_initialized = TRUE;
+#endif
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ /* When innodb_use_sys_malloc is set, the
+ mem_comm_pool won't be used for any allocations. We
+ create a dummy mem_comm_pool, because some statistics
+ and debugging code relies on it being initialized. */
+ size = 1;
+ }
+
+ mem_comm_pool = mem_pool_create(size);
+}
+
+/******************************************************************//**
+Closes the memory system. */
+UNIV_INTERN
+void
+mem_close(void)
+/*===========*/
+{
+ mem_pool_free(mem_comm_pool);
+ mem_comm_pool = NULL;
+#ifdef UNIV_MEM_DEBUG
+ mutex_free(&mem_hash_mutex);
+ mem_hash_initialized = FALSE;
+#endif /* UNIV_MEM_DEBUG */
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MEM_DEBUG
+/******************************************************************//**
+Initializes an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+ byte* buf, /*!< in: memory field */
+ ulint n) /*!< in: how many bytes the user requested */
+{
+ ulint rnd;
+ byte* usr_buf;
+
+ usr_buf = buf + MEM_FIELD_HEADER_SIZE;
+
+ /* In the debug version write the length field and the
+ check fields to the start and the end of the allocated storage.
+ The field header consists of a length field and
+ a random number field, in this order. The field trailer contains
+ the same random number as a check field. */
+
+ mem_field_header_set_len(usr_buf, n);
+
+ rnd = ut_rnd_gen_ulint();
+
+ mem_field_header_set_check(usr_buf, rnd);
+ mem_field_trailer_set_check(usr_buf, rnd);
+
+ /* Update the memory allocation information */
+
+ mutex_enter(&mem_hash_mutex);
+
+ mem_total_allocated_memory += n;
+ mem_current_allocated_memory += n;
+ mem_n_allocations++;
+
+ if (mem_current_allocated_memory > mem_max_allocated_memory) {
+ mem_max_allocated_memory = mem_current_allocated_memory;
+ }
+
+ mutex_exit(&mem_hash_mutex);
+
+ /* In the debug version set the buffer to a random
+ combination of 0xBA and 0xBE */
+
+ mem_init_buf(usr_buf, n);
+}
+
+/******************************************************************//**
+Erases an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+ byte* buf, /*!< in: memory field */
+ ulint n __attribute__((unused)))
+ /*!< in: how many bytes the user requested */
+{
+ byte* usr_buf;
+
+ usr_buf = buf + MEM_FIELD_HEADER_SIZE;
+
+ mutex_enter(&mem_hash_mutex);
+ mem_current_allocated_memory -= n;
+ mutex_exit(&mem_hash_mutex);
+
+ /* Check that the field lengths agree */
+ ut_ad(n == (ulint)mem_field_header_get_len(usr_buf));
+
+ /* In the debug version, set the freed space to a random
+ combination of 0xDE and 0xAD */
+
+ mem_erase_buf(buf, MEM_SPACE_NEEDED(n));
+}
+
+/***************************************************************//**
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n) /*!< in: length of buffer */
+{
+ byte* ptr;
+
+ UNIV_MEM_ASSERT_W(buf, n);
+
+ for (ptr = buf; ptr < buf + n; ptr++) {
+
+ if (ut_rnd_gen_ibool()) {
+ *ptr = 0xBA;
+ } else {
+ *ptr = 0xBE;
+ }
+ }
+
+ UNIV_MEM_INVALID(buf, n);
+}
+
+/***************************************************************//**
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+ byte* buf, /*!< in: pointer to buffer */
+ ulint n) /*!< in: length of buffer */
+{
+ byte* ptr;
+
+ UNIV_MEM_ASSERT_W(buf, n);
+
+ for (ptr = buf; ptr < buf + n; ptr++) {
+ if (ut_rnd_gen_ibool()) {
+ *ptr = 0xDE;
+ } else {
+ *ptr = 0xAD;
+ }
+ }
+
+ UNIV_MEM_FREE(buf, n);
+}
+
+/***************************************************************//**
+Inserts a created memory heap to the hash table of current allocated
+memory heaps. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+ mem_heap_t* heap, /*!< in: the created heap */
+ const char* file_name, /*!< in: file name of creation */
+ ulint line) /*!< in: line where created */
+{
+ mem_hash_node_t* new_node;
+ ulint cell_no;
+
+ ut_ad(mem_heap_check(heap));
+
+ mutex_enter(&mem_hash_mutex);
+
+ cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+
+ /* Allocate a new node to the list */
+ new_node = ut_malloc(sizeof(mem_hash_node_t));
+
+ new_node->heap = heap;
+ new_node->file_name = file_name;
+ new_node->line = line;
+ new_node->nth_heap = mem_n_created_heaps;
+
+ /* Insert into lists */
+ UT_LIST_ADD_FIRST(list, *mem_hash_get_nth_cell(cell_no), new_node);
+
+ UT_LIST_ADD_LAST(all_list, mem_all_list_base, new_node);
+
+ mem_n_created_heaps++;
+
+ mutex_exit(&mem_hash_mutex);
+}
+
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps, and subtracts from the current
+allocation total the memory in bytes that was allocated for the user
+of the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+ mem_heap_t* heap, /*!< in: the heap to be freed */
+ const char* file_name, /*!< in: file name of freeing */
+ ulint line) /*!< in: line where freed */
+{
+ mem_hash_node_t* node;
+ ulint cell_no;
+ ibool error;
+ ulint size;
+
+ ut_ad(mem_heap_check(heap));
+
+ mutex_enter(&mem_hash_mutex);
+
+ cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+
+ /* Look for the heap in the hash table list */
+ node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no));
+
+ while (node != NULL) {
+ if (node->heap == heap) {
+
+ break;
+ }
+
+ node = UT_LIST_GET_NEXT(list, node);
+ }
+
+ if (node == NULL) {
+ fprintf(stderr,
+ "Memory heap or buffer freed in %s line %lu"
+ " did not exist.\n",
+ file_name, (ulong) line);
+ ut_error;
+ }
+
+ /* Remove from lists */
+ UT_LIST_REMOVE(list, *mem_hash_get_nth_cell(cell_no), node);
+
+ UT_LIST_REMOVE(all_list, mem_all_list_base, node);
+
+ /* Validate the heap which will be freed */
+ mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size,
+ NULL, NULL);
+ if (error) {
+ fprintf(stderr,
+ "Inconsistency in memory heap or"
+ " buffer n:o %lu created\n"
+ "in %s line %lu and tried to free in %s line %lu.\n"
+ "Hex dump of 400 bytes around memory heap"
+ " first block start:\n",
+ node->nth_heap, node->file_name, (ulong) node->line,
+ file_name, (ulong) line);
+ ut_print_buf(stderr, (byte*)node->heap - 200, 400);
+ fputs("\nDump of the mem heap:\n", stderr);
+ mem_heap_validate_or_print(node->heap, NULL, TRUE, &error,
+ &size, NULL, NULL);
+ ut_error;
+ }
+
+ /* Free the memory occupied by the node struct */
+ ut_free(node);
+
+ mem_current_allocated_memory -= size;
+
+ mutex_exit(&mem_hash_mutex);
+}
+#endif /* UNIV_MEM_DEBUG */
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ byte* top __attribute__((unused)),
+ /*!< in: calculate and validate only until
+ this top pointer in the heap is reached,
+ if this pointer is NULL, ignored */
+ ibool print, /*!< in: if TRUE, prints the contents
+ of the heap; works only in
+ the debug version */
+ ibool* error, /*!< out: TRUE if error */
+ ulint* us_size,/*!< out: allocated memory
+ (for the user) in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored; in the
+ non-debug version this is always -1 */
+ ulint* ph_size,/*!< out: physical size of the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+ ulint* n_blocks) /*!< out: number of blocks in the heap,
+ if a NULL pointer is passed as this
+ argument, it is ignored */
+{
+ mem_block_t* block;
+ ulint total_len = 0;
+ ulint block_count = 0;
+ ulint phys_len = 0;
+#ifdef UNIV_MEM_DEBUG
+ ulint len;
+ byte* field;
+ byte* user_field;
+ ulint check_field;
+#endif
+
+ /* Pessimistically, we set the parameters to error values */
+ if (us_size != NULL) {
+ *us_size = 0;
+ }
+ if (ph_size != NULL) {
+ *ph_size = 0;
+ }
+ if (n_blocks != NULL) {
+ *n_blocks = 0;
+ }
+ *error = TRUE;
+
+ block = heap;
+
+ if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+ return;
+ }
+
+ if (print) {
+ fputs("Memory heap:", stderr);
+ }
+
+ while (block != NULL) {
+ phys_len += mem_block_get_len(block);
+
+ if ((block->type == MEM_HEAP_BUFFER)
+ && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: mem block %p"
+ " length %lu > UNIV_PAGE_SIZE\n",
+ (void*) block,
+ (ulong) mem_block_get_len(block));
+ /* error */
+
+ return;
+ }
+
+#ifdef UNIV_MEM_DEBUG
+ /* We can trace the fields of the block only in the debug
+ version */
+ if (print) {
+ fprintf(stderr, " Block %ld:", block_count);
+ }
+
+ field = (byte*)block + mem_block_get_start(block);
+
+ if (top && (field == top)) {
+
+ goto completed;
+ }
+
+ while (field < (byte*)block + mem_block_get_free(block)) {
+
+ /* Calculate the pointer to the storage
+ which was given to the user */
+
+ user_field = field + MEM_FIELD_HEADER_SIZE;
+
+ len = mem_field_header_get_len(user_field);
+
+ if (print) {
+ ut_print_buf(stderr, user_field, len);
+ putc('\n', stderr);
+ }
+
+ total_len += len;
+ check_field = mem_field_header_get_check(user_field);
+
+ if (check_field
+ != mem_field_trailer_get_check(user_field)) {
+ /* error */
+
+ fprintf(stderr,
+ "InnoDB: Error: block %lx mem"
+ " field %lx len %lu\n"
+ "InnoDB: header check field is"
+ " %lx but trailer %lx\n",
+ (ulint)block,
+ (ulint)field, len, check_field,
+ mem_field_trailer_get_check(
+ user_field));
+
+ return;
+ }
+
+ /* Move to next field */
+ field = field + MEM_SPACE_NEEDED(len);
+
+ if (top && (field == top)) {
+
+ goto completed;
+ }
+
+ }
+
+ /* At the end check that we have arrived to the first free
+ position */
+
+ if (field != (byte*)block + mem_block_get_free(block)) {
+ /* error */
+
+ fprintf(stderr,
+ "InnoDB: Error: block %lx end of"
+ " mem fields %lx\n"
+ "InnoDB: but block free at %lx\n",
+ (ulint)block, (ulint)field,
+ (ulint)((byte*)block
+ + mem_block_get_free(block)));
+
+ return;
+ }
+
+#endif
+
+ block = UT_LIST_GET_NEXT(list, block);
+ block_count++;
+ }
+#ifdef UNIV_MEM_DEBUG
+completed:
+#endif
+ if (us_size != NULL) {
+ *us_size = total_len;
+ }
+ if (ph_size != NULL) {
+ *ph_size = phys_len;
+ }
+ if (n_blocks != NULL) {
+ *n_blocks = block_count;
+ }
+ *error = FALSE;
+}
+
+/**************************************************************//**
+Prints the contents of a memory heap. */
+static
+void
+mem_heap_print(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ibool error;
+ ulint us_size;
+ ulint phys_size;
+ ulint n_blocks;
+
+ ut_ad(mem_heap_check(heap));
+
+ mem_heap_validate_or_print(heap, NULL, TRUE, &error,
+ &us_size, &phys_size, &n_blocks);
+ fprintf(stderr,
+ "\nheap type: %lu; size: user size %lu;"
+ " physical size %lu; blocks %lu.\n",
+ (ulong) heap->type, (ulong) us_size,
+ (ulong) phys_size, (ulong) n_blocks);
+ ut_a(!error);
+}
+
+/**************************************************************//**
+Validates the contents of a memory heap.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ibool error;
+ ulint us_size;
+ ulint phys_size;
+ ulint n_blocks;
+
+ ut_ad(mem_heap_check(heap));
+
+ mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size,
+ &phys_size, &n_blocks);
+ if (error) {
+ mem_heap_print(heap);
+ }
+
+ ut_a(!error);
+
+ return(TRUE);
+}
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it).
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ut_a(heap->magic_n == MEM_BLOCK_MAGIC_N);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void)
+/*===============*/
+{
+ mem_hash_node_t* node;
+ ulint heap_count = 0;
+ ulint i;
+
+ mem_validate();
+
+ mutex_enter(&mem_hash_mutex);
+
+ for (i = 0; i < MEM_HASH_SIZE; i++) {
+
+ node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i));
+ while (node != NULL) {
+ heap_count++;
+ node = UT_LIST_GET_NEXT(list, node);
+ }
+ }
+
+ mutex_exit(&mem_hash_mutex);
+
+ if (heap_count == 0) {
+# ifndef UNIV_HOTBACKUP
+ ut_a(mem_pool_get_reserved(mem_comm_pool) == 0);
+# endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+ } else {
+ return(FALSE);
+ }
+}
+
+/*****************************************************************//**
+Validates the dynamic memory allocation system.
+@return TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void)
+/*========================*/
+{
+ mem_hash_node_t* node;
+ ulint n_heaps = 0;
+ ulint allocated_mem;
+ ulint ph_size;
+ ulint total_allocated_mem = 0;
+ ibool error = FALSE;
+ ulint n_blocks;
+ ulint i;
+
+# ifndef UNIV_HOTBACKUP
+ mem_pool_validate(mem_comm_pool);
+# endif /* !UNIV_HOTBACKUP */
+
+ mutex_enter(&mem_hash_mutex);
+
+ for (i = 0; i < MEM_HASH_SIZE; i++) {
+
+ node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i));
+
+ while (node != NULL) {
+ n_heaps++;
+
+ mem_heap_validate_or_print(node->heap, NULL,
+ FALSE, &error,
+ &allocated_mem,
+ &ph_size, &n_blocks);
+
+ if (error) {
+ fprintf(stderr,
+ "\nERROR!!!!!!!!!!!!!!!!!!!"
+ "!!!!!!!!!!!!!!!!!!!!!!!\n\n"
+ "Inconsistency in memory heap"
+ " or buffer created\n"
+ "in %s line %lu.\n",
+ node->file_name, node->line);
+
+ mutex_exit(&mem_hash_mutex);
+
+ return(TRUE);
+ }
+
+ total_allocated_mem += allocated_mem;
+ node = UT_LIST_GET_NEXT(list, node);
+ }
+ }
+
+ if ((n_heaps == 0) && (mem_current_allocated_memory != 0)) {
+ error = TRUE;
+ }
+
+ if (mem_total_allocated_memory < mem_current_allocated_memory) {
+ error = TRUE;
+ }
+
+ if (mem_max_allocated_memory > mem_total_allocated_memory) {
+ error = TRUE;
+ }
+
+ if (mem_n_created_heaps < n_heaps) {
+ error = TRUE;
+ }
+
+ mutex_exit(&mem_hash_mutex);
+
+ return(error);
+}
+
+/************************************************************//**
+Validates the dynamic memory allocation system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void)
+/*==============*/
+{
+ ut_a(!mem_validate_no_assert());
+
+ return(TRUE);
+}
+#endif /* UNIV_MEM_DEBUG */
+
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+ void* ptr) /*!< in: pointer to place of possible corruption */
+{
+ byte* p;
+ ulint i;
+ ulint dist;
+
+ fputs("InnoDB: Apparent memory corruption: mem dump ", stderr);
+ ut_print_buf(stderr, (byte*)ptr - 250, 500);
+
+ fputs("\nInnoDB: Scanning backward trying to find"
+ " previous allocated mem blocks\n", stderr);
+
+ p = (byte*)ptr;
+ dist = 0;
+
+ for (i = 0; i < 10; i++) {
+ for (;;) {
+ if (((ulint)p) % 4 == 0) {
+
+ if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Mem block at - %lu,"
+ " file %s, line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+
+ if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Freed mem block at - %lu,"
+ " file %s, line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+ }
+
+ p--;
+ dist++;
+ }
+
+ p--;
+ dist++;
+ }
+
+ fprintf(stderr,
+ "InnoDB: Scanning forward trying to find next"
+ " allocated mem blocks\n");
+
+ p = (byte*)ptr;
+ dist = 0;
+
+ for (i = 0; i < 10; i++) {
+ for (;;) {
+ if (((ulint)p) % 4 == 0) {
+
+ if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Mem block at + %lu, file %s,"
+ " line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+
+ if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+ fprintf(stderr,
+ "Freed mem block at + %lu,"
+ " file %s, line %lu\n",
+ (ulong) dist,
+ (p + sizeof(ulint)),
+ (ulong)
+ (*(ulint*)(p + 8
+ + sizeof(ulint))));
+
+ break;
+ }
+ }
+
+ p++;
+ dist++;
+ }
+
+ p++;
+ dist++;
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated
+memory heaps or buffers. Can only be used in the debug version. */
+static
+void
+mem_print_info_low(
+/*===============*/
+ ibool print_all) /*!< in: if TRUE, all heaps are printed,
+ else only the heaps allocated after the
+ previous call of this function */
+{
+#ifdef UNIV_MEM_DEBUG
+ mem_hash_node_t* node;
+ ulint n_heaps = 0;
+ ulint allocated_mem;
+ ulint ph_size;
+ ulint total_allocated_mem = 0;
+ ibool error;
+ ulint n_blocks;
+#endif
+ FILE* outfile;
+
+ /* outfile = fopen("ibdebug", "a"); */
+
+ outfile = stdout;
+
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "________________________________________________________\n");
+ fprintf(outfile, "MEMORY ALLOCATION INFORMATION\n\n");
+
+#ifndef UNIV_MEM_DEBUG
+
+ UT_NOT_USED(print_all);
+
+ mem_pool_print_info(outfile, mem_comm_pool);
+
+ fprintf(outfile,
+ "Sorry, non-debug version cannot give more memory info\n");
+
+ /* fclose(outfile); */
+
+ return;
+#else
+ mutex_enter(&mem_hash_mutex);
+
+ fprintf(outfile, "LIST OF CREATED HEAPS AND ALLOCATED BUFFERS: \n\n");
+
+ if (!print_all) {
+ fprintf(outfile, "AFTER THE LAST PRINT INFO\n");
+ }
+
+ node = UT_LIST_GET_FIRST(mem_all_list_base);
+
+ while (node != NULL) {
+ n_heaps++;
+
+ if (!print_all && node->nth_heap < mem_last_print_info) {
+
+ goto next_heap;
+ }
+
+ mem_heap_validate_or_print(node->heap, NULL,
+ FALSE, &error, &allocated_mem,
+ &ph_size, &n_blocks);
+ total_allocated_mem += allocated_mem;
+
+ fprintf(outfile,
+ "%lu: file %s line %lu of size %lu phys.size %lu"
+ " with %lu blocks, type %lu\n",
+ node->nth_heap, node->file_name, node->line,
+ allocated_mem, ph_size, n_blocks,
+ (node->heap)->type);
+next_heap:
+ node = UT_LIST_GET_NEXT(all_list, node);
+ }
+
+ fprintf(outfile, "\n");
+
+ fprintf(outfile, "Current allocated memory : %lu\n",
+ mem_current_allocated_memory);
+ fprintf(outfile, "Current allocated heaps and buffers : %lu\n",
+ n_heaps);
+ fprintf(outfile, "Cumulative allocated memory : %lu\n",
+ mem_total_allocated_memory);
+ fprintf(outfile, "Maximum allocated memory : %lu\n",
+ mem_max_allocated_memory);
+ fprintf(outfile, "Cumulative created heaps and buffers : %lu\n",
+ mem_n_created_heaps);
+ fprintf(outfile, "Cumulative number of allocations : %lu\n",
+ mem_n_allocations);
+
+ mem_last_print_info = mem_n_created_heaps;
+
+ mutex_exit(&mem_hash_mutex);
+
+ mem_pool_print_info(outfile, mem_comm_pool);
+
+ /* mem_validate(); */
+
+ /* fclose(outfile); */
+#endif
+}
+
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void)
+/*================*/
+{
+ mem_print_info_low(TRUE);
+}
+
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void)
+/*====================*/
+{
+ mem_print_info_low(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
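
The guard-field layout maintained by mem_field_init() and mem_field_erase()
above is: two words (the user length and a random check value) immediately
before the user area, and the same check value immediately after it, so an
overrun in either direction disturbs one copy of the check value. A
simplified standalone sketch of the idea (not part of the patch; it uses
word-sized fields and a fixed check value instead of mach_write_to_4() and
ut_rnd_gen_ulint(), and omits the 0xBA/0xBE fill pattern):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned long	word_t;

#define HDR_SIZE	(2 * sizeof(word_t))	/* length + check */
#define TRL_SIZE	(sizeof(word_t))	/* check again */

/* Allocates n user bytes wrapped in guard fields; returns the user area. */
static unsigned char*
guarded_alloc(size_t n, word_t check)
{
	unsigned char*	buf = malloc(HDR_SIZE + n + TRL_SIZE);
	unsigned char*	usr = buf + HDR_SIZE;
	word_t		len = (word_t) n;

	memcpy(usr - 2 * sizeof(word_t), &len, sizeof(word_t));
	memcpy(usr - sizeof(word_t), &check, sizeof(word_t));
	memcpy(usr + n, &check, sizeof(word_t));

	return(usr);
}

/* Checks that the header and trailer still carry the same check value. */
static int
guarded_ok(const unsigned char* usr)
{
	word_t	len;
	word_t	head;
	word_t	tail;

	memcpy(&len, usr - 2 * sizeof(word_t), sizeof(word_t));
	memcpy(&head, usr - sizeof(word_t), sizeof(word_t));
	memcpy(&tail, usr + len, sizeof(word_t));

	return(head == tail);
}

int
main(void)
{
	unsigned char*	p = guarded_alloc(16, 0xBEBAUL);

	printf("intact: %d\n", guarded_ok(p));

	p[16] = 0xFF;		/* overrun by one byte into the trailer */

	printf("after overrun: %d\n", guarded_ok(p));

	free(p - HDR_SIZE);

	return(0);
}
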
diff --git a/storage/xtradb/mem/mem0mem.c b/storage/xtradb/mem/mem0mem.c
new file mode 100644
index 00000000000..1dd4db30841
--- /dev/null
+++ b/storage/xtradb/mem/mem0mem.c
@@ -0,0 +1,573 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0mem.c
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#ifdef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "mem0dbg.c"
+#include <stdarg.h>
+
+/*
+ THE MEMORY MANAGEMENT
+ =====================
+
+The basic element of the memory management is called a memory
+heap. A memory heap is conceptually a
+stack from which memory can be allocated. The stack may grow infinitely.
+The top element of the stack may be freed, or
+the whole stack can be freed at one time. The advantage of the
+memory heap concept is that we can avoid using the malloc and free
+functions of C which are quite expensive, for example, on the Solaris + GCC
+system (50 MHz Sparc, 1993) the pair takes 3 microseconds,
+on Win NT + 100MHz Pentium, 2.5 microseconds.
+When we use a memory heap,
+we can allocate larger blocks of memory at a time and thus
+reduce overhead. The method is slightly more efficient when we
+allocate the memory from the index page buffer pool, as we can
+claim a new page quickly. This is called buffer allocation.
+When we allocate the memory from the dynamic memory of the
+C environment, that is called dynamic allocation.
+
+The default way of operation of the memory heap is the following.
+First, when the heap is created, an initial block of memory is
+allocated. In dynamic allocation this may be about 50 bytes.
+If more space is needed, additional blocks are allocated
+and they are put into a linked list.
+After the initial block, each allocated block is twice the size of the
+previous, until a threshold is attained, after which the sizes
+of the blocks stay the same. An exception is, of course, the case
+where the caller requests a memory buffer whose size is
+bigger than the threshold. In that case a block big enough must
+be allocated.
+
+The heap is physically arranged so that if the current block
+becomes full, a new block is allocated and always inserted in the
+chain of blocks as the last block.
+
+In the debug version of the memory management, all the allocated
+heaps are kept in a list (which is implemented as a hash table).
+Thus we can notice if the caller tries to free an already freed
+heap. In addition, each buffer given to the caller contains
+start field at the start and a trailer field at the end of the buffer.
+
+The start field has the following content:
+A. sizeof(ulint) bytes of field length (in the standard byte order)
+B. sizeof(ulint) bytes of check field (a random number)
+
+The trailer field contains:
+A. sizeof(ulint) bytes of check field (the same random number as at the start)
+
+Thus we can notice if something has been copied over the
+borders of the buffer, which is illegal.
+The memory in the buffers is initialized to a random byte sequence.
+After freeing, all the blocks in the heap are set to random bytes
+to help us discover errors which result from the use of
+buffers in an already freed heap. */
+
+#ifdef MEM_PERIODIC_CHECK
+
+ibool mem_block_list_inited;
+/* List of all mem blocks allocated; protected by the mem_comm_pool mutex */
+UT_LIST_BASE_NODE_T(mem_block_t) mem_block_list;
+
+#endif
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string, allocated from a memory heap.
+@return own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* str) /*!< in: string to be copied */
+{
+ return(mem_heap_dup(heap, str, strlen(str) + 1));
+}
+
+/**********************************************************************//**
+Duplicate a block of data, allocated from a memory heap.
+@return own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+ mem_heap_t* heap, /*!< in: memory heap where copy is allocated */
+ const void* data, /*!< in: data to be copied */
+ ulint len) /*!< in: length of data, in bytes */
+{
+ return(memcpy(mem_heap_alloc(heap, len), data, len));
+}
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap where string is allocated */
+ const char* s1, /*!< in: string 1 */
+ const char* s2) /*!< in: string 2 */
+{
+ char* s;
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+
+ s = mem_heap_alloc(heap, s1_len + s2_len + 1);
+
+ memcpy(s, s1, s1_len);
+ memcpy(s + s1_len, s2, s2_len);
+
+ s[s1_len + s2_len] = '\0';
+
+ return(s);
+}
+
+
+/****************************************************************//**
+Helper function for mem_heap_printf.
+@return length of formatted string, including terminating NUL */
+static
+ulint
+mem_heap_printf_low(
+/*================*/
+ char* buf, /*!< in/out: buffer to store formatted string
+ in, or NULL to just calculate length */
+ const char* format, /*!< in: format string */
+ va_list ap) /*!< in: arguments */
+{
+ ulint len = 0;
+
+ while (*format) {
+
+ /* Does this format specifier have the 'l' length modifier? */
+ ibool is_long = FALSE;
+
+ /* Length of one parameter. */
+ size_t plen;
+
+ if (*format++ != '%') {
+ /* Non-format character. */
+
+ len++;
+
+ if (buf) {
+ *buf++ = *(format - 1);
+ }
+
+ continue;
+ }
+
+ if (*format == 'l') {
+ is_long = TRUE;
+ format++;
+ }
+
+ switch (*format++) {
+ case 's':
+ /* string */
+ {
+ char* s = va_arg(ap, char*);
+
+ /* "%ls" is a non-sensical format specifier. */
+ ut_a(!is_long);
+
+ plen = strlen(s);
+ len += plen;
+
+ if (buf) {
+ memcpy(buf, s, plen);
+ buf += plen;
+ }
+ }
+
+ break;
+
+ case 'u':
+ /* unsigned int */
+ {
+ char tmp[32];
+ unsigned long val;
+
+ /* We only support 'long' values for now. */
+ ut_a(is_long);
+
+ val = va_arg(ap, unsigned long);
+
+ plen = sprintf(tmp, "%lu", val);
+ len += plen;
+
+ if (buf) {
+ memcpy(buf, tmp, plen);
+ buf += plen;
+ }
+ }
+
+ break;
+
+ case '%':
+
+ /* "%l%" is a non-sensical format specifier. */
+ ut_a(!is_long);
+
+ len++;
+
+ if (buf) {
+ *buf++ = '%';
+ }
+
+ break;
+
+ default:
+ ut_error;
+ }
+ }
+
+ /* For the NUL character. */
+ len++;
+
+ if (buf) {
+ *buf = '\0';
+ }
+
+ return(len);
+}
+
+/****************************************************************//**
+A simple (s)printf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ const char* format, /*!< in: format string */
+ ...)
+{
+ va_list ap;
+ char* str;
+ ulint len;
+
+ /* Calculate length of string */
+ len = 0;
+ va_start(ap, format);
+ len = mem_heap_printf_low(NULL, format, ap);
+ va_end(ap);
+
+ /* Now create it for real. */
+ str = mem_heap_alloc(heap, len);
+ va_start(ap, format);
+ mem_heap_printf_low(str, format, ap);
+ va_end(ap);
+
+ return(str);
+}
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block(
+/*==================*/
+ mem_heap_t* heap, /*!< in: memory heap or NULL if first block
+ should be created */
+ ulint n, /*!< in: number of bytes needed for user data */
+ ulint type, /*!< in: type of heap: MEM_HEAP_DYNAMIC or
+ MEM_HEAP_BUFFER */
+ const char* file_name,/*!< in: file name where created */
+ ulint line) /*!< in: line where created */
+{
+#ifndef UNIV_HOTBACKUP
+ buf_block_t* buf_block = NULL;
+#endif /* !UNIV_HOTBACKUP */
+ mem_block_t* block;
+ ulint len;
+
+ ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+ || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+ if (heap && heap->magic_n != MEM_BLOCK_MAGIC_N) {
+ mem_analyze_corruption(heap);
+ }
+
+ /* In dynamic allocation, calculate the size: block header + data. */
+ len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+
+#ifndef UNIV_HOTBACKUP
+ if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) {
+
+ ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
+
+ block = mem_area_alloc(&len, mem_comm_pool);
+ } else {
+ len = UNIV_PAGE_SIZE;
+
+ if ((type & MEM_HEAP_BTR_SEARCH) && heap) {
+ /* We cannot allocate the block from the
+ buffer pool, but must get the free block from
+ the heap header free block field */
+
+ buf_block = heap->free_block;
+ heap->free_block = NULL;
+
+ if (UNIV_UNLIKELY(!buf_block)) {
+
+ return(NULL);
+ }
+ } else {
+ buf_block = buf_block_alloc(0);
+ }
+
+ block = (mem_block_t*) buf_block->frame;
+ }
+
+ ut_ad(block);
+ block->buf_block = buf_block;
+ block->free_block = NULL;
+#else /* !UNIV_HOTBACKUP */
+ len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+ block = ut_malloc(len);
+ ut_ad(block);
+#endif /* !UNIV_HOTBACKUP */
+
+ block->magic_n = MEM_BLOCK_MAGIC_N;
+ ut_strlcpy_rev(block->file_name, file_name, sizeof(block->file_name));
+ block->line = line;
+
+#ifdef MEM_PERIODIC_CHECK
+ mutex_enter(&(mem_comm_pool->mutex));
+
+ if (!mem_block_list_inited) {
+ mem_block_list_inited = TRUE;
+ UT_LIST_INIT(mem_block_list);
+ }
+
+ UT_LIST_ADD_LAST(mem_block_list, mem_block_list, block);
+
+ mutex_exit(&(mem_comm_pool->mutex));
+#endif
+ mem_block_set_len(block, len);
+ mem_block_set_type(block, type);
+ mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
+ mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+
+ if (UNIV_UNLIKELY(heap == NULL)) {
+ /* This is the first block of the heap. The field
+ total_size should be initialized here */
+ block->total_size = len;
+ } else {
+ /* Not the first allocation for the heap. This block's
+ total_length field should be set to undefined. */
+ ut_d(block->total_size = ULINT_UNDEFINED);
+ UNIV_MEM_INVALID(&block->total_size,
+ sizeof block->total_size);
+
+ heap->total_size += len;
+ }
+
+ ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
+
+ return(block);
+}
+
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+ mem_heap_t* heap, /*!< in: memory heap */
+ ulint n) /*!< in: number of bytes user needs */
+{
+ mem_block_t* block;
+ mem_block_t* new_block;
+ ulint new_size;
+
+ ut_ad(mem_heap_check(heap));
+
+ block = UT_LIST_GET_LAST(heap->base);
+
+ /* We have to allocate a new block. The size is always at least
+ doubled until the standard size is reached. After that the size
+ stays the same, except in cases where the caller needs more space. */
+
+ new_size = 2 * mem_block_get_len(block);
+
+ if (heap->type != MEM_HEAP_DYNAMIC) {
+ /* From the buffer pool we allocate buffer frames */
+ ut_a(n <= MEM_MAX_ALLOC_IN_BUF);
+
+ if (new_size > MEM_MAX_ALLOC_IN_BUF) {
+ new_size = MEM_MAX_ALLOC_IN_BUF;
+ }
+ } else if (new_size > MEM_BLOCK_STANDARD_SIZE) {
+
+ new_size = MEM_BLOCK_STANDARD_SIZE;
+ }
+
+ if (new_size < n) {
+ new_size = n;
+ }
+
+ new_block = mem_heap_create_block(heap, new_size, heap->type,
+ heap->file_name, heap->line);
+ if (new_block == NULL) {
+
+ return(NULL);
+ }
+
+ /* Add the new block as the last block */
+
+ UT_LIST_INSERT_AFTER(list, heap->base, block, new_block);
+
+ return(new_block);
+}
+
+/******************************************************************//**
+Frees a block from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+ mem_heap_t* heap, /*!< in: heap */
+ mem_block_t* block) /*!< in: block to free */
+{
+ ulint type;
+ ulint len;
+#ifndef UNIV_HOTBACKUP
+ buf_block_t* buf_block = block->buf_block;
+#endif /* !UNIV_HOTBACKUP */
+
+ if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+ mem_analyze_corruption(block);
+ }
+
+ UT_LIST_REMOVE(list, heap->base, block);
+
+#ifdef MEM_PERIODIC_CHECK
+ mutex_enter(&(mem_comm_pool->mutex));
+
+ UT_LIST_REMOVE(mem_block_list, mem_block_list, block);
+
+ mutex_exit(&(mem_comm_pool->mutex));
+#endif
+
+ ut_ad(heap->total_size >= block->len);
+ heap->total_size -= block->len;
+
+ type = heap->type;
+ len = block->len;
+ block->magic_n = MEM_FREED_BLOCK_MAGIC_N;
+
+#ifndef UNIV_HOTBACKUP
+ if (!srv_use_sys_malloc) {
+#ifdef UNIV_MEM_DEBUG
+ /* In the debug version we set the memory to a random
+ combination of hex 0xDE and 0xAD. */
+
+ mem_erase_buf((byte*)block, len);
+#else /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ASSERT_AND_FREE(block, len);
+#endif /* UNIV_MEM_DEBUG */
+
+ }
+ if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) {
+
+ ut_ad(!buf_block);
+ mem_area_free(block, mem_comm_pool);
+ } else {
+ ut_ad(type & MEM_HEAP_BUFFER);
+
+ buf_block_free(buf_block);
+ }
+#else /* !UNIV_HOTBACKUP */
+#ifdef UNIV_MEM_DEBUG
+ /* In the debug version we set the memory to a random
+ combination of hex 0xDE and 0xAD. */
+
+ mem_erase_buf((byte*)block, len);
+#else /* UNIV_MEM_DEBUG */
+ UNIV_MEM_ASSERT_AND_FREE(block, len);
+#endif /* UNIV_MEM_DEBUG */
+ ut_free(block);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+ mem_heap_t* heap) /*!< in: heap */
+{
+ if (UNIV_LIKELY_NULL(heap->free_block)) {
+
+ buf_block_free(heap->free_block);
+
+ heap->free_block = NULL;
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Goes through the list of all allocated mem blocks, checks their magic
+numbers, and reports possible corruption. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void)
+/*=========================*/
+{
+ mem_block_t* block;
+
+ mutex_enter(&(mem_comm_pool->mutex));
+
+ block = UT_LIST_GET_FIRST(mem_block_list);
+
+ while (block) {
+ if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+ mem_analyze_corruption(block);
+ }
+
+ block = UT_LIST_GET_NEXT(mem_block_list, block);
+ }
+
+ mutex_exit(&(mem_comm_pool->mutex));
+}
+#endif
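
The growth policy in mem_heap_add_block() above is: double the size of the
last block until a standard block size is reached, cap blocks of buffer-pool
heaps at MEM_MAX_ALLOC_IN_BUF, and let an oversized request dictate the size
directly. A standalone sketch of that arithmetic for a MEM_HEAP_DYNAMIC heap
(not part of the patch; the 8 KiB standard size is an illustrative stand-in
for MEM_BLOCK_STANDARD_SIZE, and block header overhead is ignored):

#include <stdio.h>

#define STANDARD_SIZE	8192UL	/* stand-in for MEM_BLOCK_STANDARD_SIZE */

/* Size of the next block, given the last block's size and the request n. */
static unsigned long
next_block_size(unsigned long last_len, unsigned long n)
{
	unsigned long	new_size = 2 * last_len;

	if (new_size > STANDARD_SIZE) {
		new_size = STANDARD_SIZE;
	}

	if (new_size < n) {
		new_size = n;	/* an oversized request wins */
	}

	return(new_size);
}

int
main(void)
{
	unsigned long	len = 64;	/* pretend initial block size */
	int		i;

	for (i = 0; i < 10; i++) {
		len = next_block_size(len, 100);
		printf("block %d: %lu bytes\n", i + 1, len);
	}

	printf("oversized request: %lu bytes\n",
	       next_block_size(len, 100000));

	return(0);
}
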
diff --git a/storage/xtradb/mem/mem0pool.c b/storage/xtradb/mem/mem0pool.c
new file mode 100644
index 00000000000..3291453eeb5
--- /dev/null
+++ b/storage/xtradb/mem/mem0pool.c
@@ -0,0 +1,728 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0pool.c
+The lowest-level memory management
+
+Created 5/12/1997 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0pool.h"
+#ifdef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "srv0start.h"
+
+/* We would also like to use the buffer frames to allocate memory. This
+would be desirable, because then the memory consumption of the database
+would be fixed, and we might even lock the buffer pool to the main memory.
+The problem here is that the buffer management routines can themselves call
+memory allocation, while the buffer pool mutex is reserved.
+
+The main components of the memory consumption are:
+
+1. buffer pool,
+2. parsed and optimized SQL statements,
+3. data dictionary cache,
+4. log buffer,
+5. locks for each transaction,
+6. hash table for the adaptive index,
+7. state and buffers for each SQL query currently being executed,
+8. session for each user, and
+9. stack for each OS thread.
+
+Items 1 and 2 are managed by an LRU algorithm. Items 5 and 6 can potentially
+consume very much memory. Items 7 and 8 should consume quite little memory,
+and the OS should take care of item 9, which too should consume little memory.
+
+A solution to the memory management:
+
+1. the buffer pool size is set separately;
+2. log buffer size is set separately;
+3. the common pool size for all the other entries, except 8, is set separately.
+
+Problems: we may waste memory if the common pool is set too big. Another
+problem is the locks, which can take up a great deal of space in big
+transactions; the common pool size would then have to be set very big. We
+can allow locks to take
+space from the buffer pool, but the SQL optimizer is then unaware of the
+usable size of the buffer pool. We could also combine the objects in the
+common pool and the buffers in the buffer pool into a single LRU list and
+manage it uniformly, but this approach does not take into account the parsing
+and other costs unique to SQL statements.
+
+The locks for a transaction can be seen as a part of the state of the
+transaction. Hence, they should be stored in the common pool. We still
+have the problem of a very big update transaction, for example, which
+will set very many x-locks on rows, and the locks will consume a lot
+of memory, say, half of the buffer pool size.
+
+Another problem is what to do if we are not able to malloc a requested
+block of memory from the common pool. Then we can request memory from
+the operating system. If it does not help, a system error results.
+
+Because items 5 and 6 may potentially consume a lot of memory, we let them grow
+into the buffer pool. We may let the locks of a transaction take frames
+from the buffer pool, when the corresponding memory heap block has grown to
+the size of a buffer frame. Similarly for the hash node cells of the locks,
+and for the adaptive index. Thus, for each individual transaction, its locks
+can occupy at most about the size of the buffer frame of memory in the common
+pool, and after that its locks will grow into the buffer pool. */
+
+/** Mask used to extract the free bit from area->size */
+#define MEM_AREA_FREE 1
+
+/** The smallest memory area total size */
+#define MEM_AREA_MIN_SIZE (2 * MEM_AREA_EXTRA_SIZE)
+
+
+/** Data structure for a memory pool. The space is allocated using the buddy
+algorithm, where free list i contains areas of size 2 to power i. */
+struct mem_pool_struct{
+ byte* buf; /*!< memory pool */
+ ulint size; /*!< memory common pool size */
+ ulint reserved; /*!< amount of currently allocated
+ memory */
+ mutex_t mutex; /*!< mutex protecting this struct */
+ UT_LIST_BASE_NODE_T(mem_area_t)
+ free_list[64]; /*!< lists of free memory areas: an
+ area is put to the list whose number
+ is the 2-logarithm of the area size */
+};
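
As an illustration of the free-list indexing described above (an area of size 2 to power i lives on free_list[i]), a minimal standalone C sketch, not taken from the patch:

    #include <assert.h>
    #include <stdio.h>

    /* An area of size 2^i is kept on free_list[i]: compute the base-2
    logarithm of a power-of-two area size. */
    static unsigned int
    free_list_index(unsigned long area_size)
    {
            unsigned int    i = 0;

            /* area sizes in the pool are always powers of two */
            assert(area_size != 0 && (area_size & (area_size - 1)) == 0);

            while (area_size > 1) {
                    area_size >>= 1;
                    i++;
            }

            return(i);
    }

    int
    main(void)
    {
            /* a 4096-byte area lives on free_list[12] */
            printf("%u\n", free_list_index(4096));
            return(0);
    }
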
+
+/** The common memory pool */
+UNIV_INTERN mem_pool_t* mem_comm_pool = NULL;
+
+/* We use this counter to check that the mem pool mutex does not leak;
+this is to track a strange assertion failure reported at
+mysql@lists.mysql.com */
+
+UNIV_INTERN ulint mem_n_threads_inside = 0;
+
+/********************************************************************//**
+Reserves the mem pool mutex if we are not in server shutdown. Use
+this function only in memory free functions, since only memory
+free functions are used during server shutdown. */
+UNIV_INLINE
+void
+mem_pool_mutex_enter(
+/*=================*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) {
+ mutex_enter(&(pool->mutex));
+ }
+}
+
+/********************************************************************//**
+Releases the mem pool mutex if we are not in server shutdown. Like
+its corresponding mem_pool_mutex_enter() function, use it only
+in memory free functions. */
+UNIV_INLINE
+void
+mem_pool_mutex_exit(
+/*================*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ if (srv_shutdown_state < SRV_SHUTDOWN_EXIT_THREADS) {
+ mutex_exit(&(pool->mutex));
+ }
+}
+
+/********************************************************************//**
+Returns memory area size.
+@return size */
+UNIV_INLINE
+ulint
+mem_area_get_size(
+/*==============*/
+ mem_area_t* area) /*!< in: area */
+{
+ return(area->size_and_free & ~MEM_AREA_FREE);
+}
+
+/********************************************************************//**
+Sets memory area size. */
+UNIV_INLINE
+void
+mem_area_set_size(
+/*==============*/
+ mem_area_t* area, /*!< in: area */
+ ulint size) /*!< in: size */
+{
+ area->size_and_free = (area->size_and_free & MEM_AREA_FREE)
+ | size;
+}
+
+/********************************************************************//**
+Returns memory area free bit.
+@return TRUE if free */
+UNIV_INLINE
+ibool
+mem_area_get_free(
+/*==============*/
+ mem_area_t* area) /*!< in: area */
+{
+#if TRUE != MEM_AREA_FREE
+# error "TRUE != MEM_AREA_FREE"
+#endif
+ return(area->size_and_free & MEM_AREA_FREE);
+}
+
+/********************************************************************//**
+Sets memory area free bit. */
+UNIV_INLINE
+void
+mem_area_set_free(
+/*==============*/
+ mem_area_t* area, /*!< in: area */
+ ibool free) /*!< in: free bit value */
+{
+#if TRUE != MEM_AREA_FREE
+# error "TRUE != MEM_AREA_FREE"
+#endif
+ area->size_and_free = (area->size_and_free & ~MEM_AREA_FREE)
+ | free;
+}
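
The accessors above rely on area sizes being even powers of two, so bit 0 of size_and_free is always available for the free flag. A minimal standalone sketch of that packing; the size and flag value here are arbitrary stand-ins:

    #include <assert.h>
    #include <stdio.h>

    #define AREA_FREE 1UL   /* stand-in for MEM_AREA_FREE */

    int
    main(void)
    {
            unsigned long   size_and_free = 0;
            unsigned long   size = 512;     /* any power of two >= 2 */

            /* set the size, preserving the flag bit */
            size_and_free = (size_and_free & AREA_FREE) | size;
            /* mark the area free, preserving the size bits */
            size_and_free = (size_and_free & ~AREA_FREE) | 1;

            assert((size_and_free & ~AREA_FREE) == 512);    /* get size */
            assert((size_and_free & AREA_FREE) == 1);       /* get free bit */

            printf("size=%lu free=%lu\n",
                   size_and_free & ~AREA_FREE, size_and_free & AREA_FREE);
            return(0);
    }
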
+
+/********************************************************************//**
+Creates a memory pool.
+@return memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+ ulint size) /*!< in: pool size in bytes */
+{
+ mem_pool_t* pool;
+ mem_area_t* area;
+ ulint i;
+ ulint used;
+
+ pool = ut_malloc(sizeof(mem_pool_t));
+
+ /* We do not set the memory to zero (FALSE) in the pool,
+ but only when allocated at a higher level in mem0mem.c.
+ This is to avoid masking useful Purify warnings. */
+
+ pool->buf = ut_malloc_low(size, FALSE, TRUE);
+ pool->size = size;
+
+ mutex_create(&pool->mutex, SYNC_MEM_POOL);
+
+ /* Initialize the free lists */
+
+ for (i = 0; i < 64; i++) {
+
+ UT_LIST_INIT(pool->free_list[i]);
+ }
+
+ used = 0;
+
+ while (size - used >= MEM_AREA_MIN_SIZE) {
+
+ i = ut_2_log(size - used);
+
+ if (ut_2_exp(i) > size - used) {
+
+ /* ut_2_log rounds upward */
+
+ i--;
+ }
+
+ area = (mem_area_t*)(pool->buf + used);
+
+ mem_area_set_size(area, ut_2_exp(i));
+ mem_area_set_free(area, TRUE);
+ UNIV_MEM_FREE(MEM_AREA_EXTRA_SIZE + (byte*) area,
+ ut_2_exp(i) - MEM_AREA_EXTRA_SIZE);
+
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);
+
+ used = used + ut_2_exp(i);
+ }
+
+ ut_ad(size >= used);
+
+ pool->reserved = 0;
+
+ return(pool);
+}
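
A minimal standalone sketch of the carving loop above: the pool buffer is split into the largest powers of two that fit, until the remainder drops below the minimum area size. The pool size and minimum area size used here are assumed values for illustration only:

    #include <stdio.h>

    #define AREA_MIN_SIZE 16UL      /* assumed stand-in for MEM_AREA_MIN_SIZE */

    /* largest power of two that is <= n */
    static unsigned long
    pow2_floor(unsigned long n)
    {
            unsigned long   p = 1;

            while (p * 2 <= n) {
                    p *= 2;
            }
            return(p);
    }

    int
    main(void)
    {
            unsigned long   size = 1000;    /* hypothetical pool size */
            unsigned long   used = 0;

            while (size - used >= AREA_MIN_SIZE) {
                    unsigned long   area = pow2_floor(size - used);

                    printf("area of %4lu bytes at offset %lu\n", area, used);
                    used += area;
            }
            /* prints areas of 512, 256, 128, 64 and 32 bytes;
            the last 8 bytes remain unused */
            return(0);
    }
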
+
+/********************************************************************//**
+Frees a memory pool. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+ mem_pool_t* pool) /*!< in, own: memory pool */
+{
+ ut_free(pool->buf);
+ ut_free(pool);
+}
+
+/********************************************************************//**
+Fills the specified free list.
+@return TRUE if we were able to insert a block to the free list */
+static
+ibool
+mem_pool_fill_free_list(
+/*====================*/
+ ulint i, /*!< in: free list index */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ mem_area_t* area2;
+ ibool ret;
+
+ ut_ad(mutex_own(&(pool->mutex)));
+
+ if (UNIV_UNLIKELY(i >= 63)) {
+ /* We come here when we have run out of space in the
+ memory pool: */
+
+ return(FALSE);
+ }
+
+ area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
+
+ if (area == NULL) {
+ if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: mem pool free list %lu"
+ " length is %lu\n"
+ "InnoDB: though the list is empty!\n",
+ (ulong) i + 1,
+ (ulong)
+ UT_LIST_GET_LEN(pool->free_list[i + 1]));
+ }
+
+ ret = mem_pool_fill_free_list(i + 1, pool);
+
+ if (ret == FALSE) {
+
+ return(FALSE);
+ }
+
+ area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
+ }
+
+ if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) {
+ mem_analyze_corruption(area);
+
+ ut_error;
+ }
+
+ UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);
+
+ area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i));
+ UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);
+
+ mem_area_set_size(area2, ut_2_exp(i));
+ mem_area_set_free(area2, TRUE);
+
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2);
+
+ mem_area_set_size(area, ut_2_exp(i));
+
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+ ulint* psize, /*!< in: requested size in bytes; for optimum
+ space usage, the size should be a power of 2
+ minus MEM_AREA_EXTRA_SIZE;
+ out: allocated size in bytes (greater than
+ or equal to the requested size) */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ ulint size;
+ ulint n;
+ ibool ret;
+
+	/* If we are using the OS allocator, just make a simple call
+	to malloc */
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ return(malloc(*psize));
+ }
+
+ size = *psize;
+ n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE));
+
+ mutex_enter(&(pool->mutex));
+ mem_n_threads_inside++;
+
+ ut_a(mem_n_threads_inside == 1);
+
+ area = UT_LIST_GET_FIRST(pool->free_list[n]);
+
+ if (area == NULL) {
+ ret = mem_pool_fill_free_list(n, pool);
+
+ if (ret == FALSE) {
+ /* Out of memory in memory pool: we try to allocate
+ from the operating system with the regular malloc: */
+
+ mem_n_threads_inside--;
+ mutex_exit(&(pool->mutex));
+
+ return(ut_malloc(size));
+ }
+
+ area = UT_LIST_GET_FIRST(pool->free_list[n]);
+ }
+
+ if (!mem_area_get_free(area)) {
+ fprintf(stderr,
+ "InnoDB: Error: Removing element from mem pool"
+ " free list %lu though the\n"
+ "InnoDB: element is not marked free!\n",
+ (ulong) n);
+
+ mem_analyze_corruption(area);
+
+ /* Try to analyze a strange assertion failure reported at
+ mysql@lists.mysql.com where the free bit IS 1 in the
+ hex dump above */
+
+ if (mem_area_get_free(area)) {
+ fprintf(stderr,
+ "InnoDB: Probably a race condition"
+ " because now the area is marked free!\n");
+ }
+
+ ut_error;
+ }
+
+ if (UT_LIST_GET_LEN(pool->free_list[n]) == 0) {
+ fprintf(stderr,
+ "InnoDB: Error: Removing element from mem pool"
+ " free list %lu\n"
+ "InnoDB: though the list length is 0!\n",
+ (ulong) n);
+ mem_analyze_corruption(area);
+
+ ut_error;
+ }
+
+ ut_ad(mem_area_get_size(area) == ut_2_exp(n));
+
+ mem_area_set_free(area, FALSE);
+
+ UT_LIST_REMOVE(free_list, pool->free_list[n], area);
+
+ pool->reserved += mem_area_get_size(area);
+
+ mem_n_threads_inside--;
+ mutex_exit(&(pool->mutex));
+
+ ut_ad(mem_pool_validate(pool));
+
+ *psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE;
+ UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*)area, *psize);
+
+ return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*)area)));
+}
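
A minimal standalone sketch of the size rounding performed above: the request is padded by the per-area header, rounded up to a power of two, and the caller gets back that power of two minus the header. The header size used here is an assumed stand-in for MEM_AREA_EXTRA_SIZE:

    #include <stdio.h>

    #define AREA_EXTRA      8UL     /* assumed stand-in for MEM_AREA_EXTRA_SIZE */
    #define AREA_MIN        (2 * AREA_EXTRA)

    /* smallest n such that 2^n >= size (ut_2_log rounds upward) */
    static unsigned int
    log2_ceil(unsigned long size)
    {
            unsigned int    n = 0;
            unsigned long   p = 1;

            while (p < size) {
                    p *= 2;
                    n++;
            }
            return(n);
    }

    int
    main(void)
    {
            unsigned long   request = 100;
            unsigned long   padded = request + AREA_EXTRA;
            unsigned int    n = log2_ceil(padded < AREA_MIN ? AREA_MIN : padded);
            unsigned long   usable = (1UL << n) - AREA_EXTRA;

            /* 100 + 8 = 108 rounds up to 128: free_list[7], 120 usable bytes */
            printf("list %u, usable %lu bytes\n", n, usable);
            return(0);
    }
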
+
+/********************************************************************//**
+Gets the buddy of an area, if it exists in pool.
+@return the buddy, NULL if no buddy in pool */
+UNIV_INLINE
+mem_area_t*
+mem_area_get_buddy(
+/*===============*/
+ mem_area_t* area, /*!< in: memory area */
+ ulint size, /*!< in: memory area size */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* buddy;
+
+ ut_ad(size != 0);
+
+ if (((((byte*)area) - pool->buf) % (2 * size)) == 0) {
+
+ /* The buddy is in a higher address */
+
+ buddy = (mem_area_t*)(((byte*)area) + size);
+
+ if ((((byte*)buddy) - pool->buf) + size > pool->size) {
+
+ /* The buddy is not wholly contained in the pool:
+ there is no buddy */
+
+ buddy = NULL;
+ }
+ } else {
+		/* The buddy is in a lower address; NOTE that area cannot
+		be at the pool lower end, because then we would end up in
+		the upper branch of this if-clause: the remainder would be
+		0 */
+
+ buddy = (mem_area_t*)(((byte*)area) - size);
+ }
+
+ return(buddy);
+}
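
The buddy computation above can be restated with plain offsets: an area whose offset from the pool start is a multiple of 2 * size has its buddy at +size (if it still fits in the pool), otherwise at -size. A minimal standalone sketch, with an assumed pool size for the example:

    #include <stdio.h>

    /* Offset of the buddy of the area at 'offset', or -1 if the buddy
    would not be wholly contained in the pool. */
    static long
    buddy_offset(unsigned long offset, unsigned long size,
                 unsigned long pool_size)
    {
            if (offset % (2 * size) == 0) {
                    /* the buddy is at the higher address */
                    if (offset + 2 * size > pool_size) {
                            return(-1);     /* no buddy in the pool */
                    }
                    return((long) (offset + size));
            }
            /* the buddy is at the lower address */
            return((long) (offset - size));
    }

    int
    main(void)
    {
            /* in a 1024-byte pool, the 128-byte areas at offsets 256 and
            384 are each other's buddies */
            printf("%ld\n", buddy_offset(256, 128, 1024));  /* 384 */
            printf("%ld\n", buddy_offset(384, 128, 1024));  /* 256 */
            return(0);
    }
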
+
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+ void* ptr, /*!< in, own: pointer to allocated memory
+ buffer */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ mem_area_t* buddy;
+ void* new_ptr;
+ ulint size;
+ ulint n;
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ free(ptr);
+
+ return;
+ }
+
+ /* It may be that the area was really allocated from the OS with
+ regular malloc: check if ptr points within our memory pool */
+
+ if ((byte*)ptr < pool->buf || (byte*)ptr >= pool->buf + pool->size) {
+ ut_free(ptr);
+
+ return;
+ }
+
+ area = (mem_area_t*) (((byte*)ptr) - MEM_AREA_EXTRA_SIZE);
+
+ if (mem_area_get_free(area)) {
+ fprintf(stderr,
+ "InnoDB: Error: Freeing element to mem pool"
+ " free list though the\n"
+ "InnoDB: element is marked free!\n");
+
+ mem_analyze_corruption(area);
+ ut_error;
+ }
+
+ size = mem_area_get_size(area);
+ UNIV_MEM_FREE(ptr, size - MEM_AREA_EXTRA_SIZE);
+
+ if (size == 0) {
+ fprintf(stderr,
+ "InnoDB: Error: Mem area size is 0. Possibly a"
+ " memory overrun of the\n"
+ "InnoDB: previous allocated area!\n");
+
+ mem_analyze_corruption(area);
+ ut_error;
+ }
+
+#ifdef UNIV_LIGHT_MEM_DEBUG
+ if (((byte*)area) + size < pool->buf + pool->size) {
+
+ ulint next_size;
+
+ next_size = mem_area_get_size(
+ (mem_area_t*)(((byte*)area) + size));
+ if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) {
+ fprintf(stderr,
+ "InnoDB: Error: Memory area size %lu,"
+ " next area size %lu not a power of 2!\n"
+ "InnoDB: Possibly a memory overrun of"
+ " the buffer being freed here.\n",
+ (ulong) size, (ulong) next_size);
+ mem_analyze_corruption(area);
+
+ ut_error;
+ }
+ }
+#endif
+ buddy = mem_area_get_buddy(area, size, pool);
+
+ n = ut_2_log(size);
+
+ mem_pool_mutex_enter(pool);
+ mem_n_threads_inside++;
+
+ ut_a(mem_n_threads_inside == 1);
+
+ if (buddy && mem_area_get_free(buddy)
+ && (size == mem_area_get_size(buddy))) {
+
+ /* The buddy is in a free list */
+
+ if ((byte*)buddy < (byte*)area) {
+ new_ptr = ((byte*)buddy) + MEM_AREA_EXTRA_SIZE;
+
+ mem_area_set_size(buddy, 2 * size);
+ mem_area_set_free(buddy, FALSE);
+ } else {
+ new_ptr = ptr;
+
+ mem_area_set_size(area, 2 * size);
+ }
+
+ /* Remove the buddy from its free list and merge it to area */
+
+ UT_LIST_REMOVE(free_list, pool->free_list[n], buddy);
+
+ pool->reserved += ut_2_exp(n);
+
+ mem_n_threads_inside--;
+ mem_pool_mutex_exit(pool);
+
+ mem_area_free(new_ptr, pool);
+
+ return;
+ } else {
+ UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area);
+
+ mem_area_set_free(area, TRUE);
+
+ ut_ad(pool->reserved >= size);
+
+ pool->reserved -= size;
+ }
+
+ mem_n_threads_inside--;
+ mem_pool_mutex_exit(pool);
+
+ ut_ad(mem_pool_validate(pool));
+}
+
+/********************************************************************//**
+Validates a memory pool.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ mem_area_t* area;
+ mem_area_t* buddy;
+ ulint free;
+ ulint i;
+
+ mem_pool_mutex_enter(pool);
+
+ free = 0;
+
+ for (i = 0; i < 64; i++) {
+
+ UT_LIST_VALIDATE(free_list, mem_area_t, pool->free_list[i],
+ (void) 0);
+
+ area = UT_LIST_GET_FIRST(pool->free_list[i]);
+
+ while (area != NULL) {
+ ut_a(mem_area_get_free(area));
+ ut_a(mem_area_get_size(area) == ut_2_exp(i));
+
+ buddy = mem_area_get_buddy(area, ut_2_exp(i), pool);
+
+ ut_a(!buddy || !mem_area_get_free(buddy)
+ || (ut_2_exp(i) != mem_area_get_size(buddy)));
+
+ area = UT_LIST_GET_NEXT(free_list, area);
+
+ free += ut_2_exp(i);
+ }
+ }
+
+ ut_a(free + pool->reserved == pool->size);
+
+ mem_pool_mutex_exit(pool);
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+ FILE* outfile,/*!< in: output file to write to */
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ ulint i;
+
+ mem_pool_validate(pool);
+
+ fprintf(outfile, "INFO OF A MEMORY POOL\n");
+
+ mutex_enter(&(pool->mutex));
+
+ for (i = 0; i < 64; i++) {
+ if (UT_LIST_GET_LEN(pool->free_list[i]) > 0) {
+
+ fprintf(outfile,
+ "Free list length %lu for"
+ " blocks of size %lu\n",
+ (ulong) UT_LIST_GET_LEN(pool->free_list[i]),
+ (ulong) ut_2_exp(i));
+ }
+ }
+
+ fprintf(outfile, "Pool size %lu, reserved %lu.\n", (ulong) pool->size,
+ (ulong) pool->reserved);
+ mutex_exit(&(pool->mutex));
+}
+
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+ mem_pool_t* pool) /*!< in: memory pool */
+{
+ ulint reserved;
+
+ mutex_enter(&(pool->mutex));
+
+ reserved = pool->reserved;
+
+ mutex_exit(&(pool->mutex));
+
+ return(reserved);
+}
diff --git a/storage/xtradb/mtr/mtr0log.c b/storage/xtradb/mtr/mtr0log.c
new file mode 100644
index 00000000000..d22015a575f
--- /dev/null
+++ b/storage/xtradb/mtr/mtr0log.c
@@ -0,0 +1,612 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0log.c
+Mini-transaction log routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+#ifdef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "dict0boot.h"
+
+/********************************************************//**
+Catenates n bytes to the mtr log. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* str, /*!< in: string to write */
+ ulint len) /*!< in: string length */
+{
+ dyn_array_t* mlog;
+
+ if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+
+ return;
+ }
+
+ mlog = &(mtr->log);
+
+ dyn_push_string(mlog, str, len);
+}
+
+/********************************************************//**
+Writes the initial part of a log record consisting of one-byte item
+type and four-byte space and page numbers. Also pushes info
+to the mtr memo that a buffer page has been modified. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+ const byte* ptr, /*!< in: pointer to (inside) a buffer
+ frame holding the file page where
+ modification is made */
+ byte type, /*!< in: log item type: MLOG_1BYTE, ... */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(type <= MLOG_BIGGEST_TYPE);
+ ut_ad(type > MLOG_8BYTES);
+
+ log_ptr = mlog_open(mtr, 11);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr);
+
+ mlog_close(mtr, log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* type, /*!< out: log record type: MLOG_1BYTE, ... */
+ ulint* space, /*!< out: space id */
+ ulint* page_no)/*!< out: page number */
+{
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ *type = (byte)((ulint)*ptr & ~MLOG_SINGLE_REC_FLAG);
+ ut_ad(*type <= MLOG_BIGGEST_TYPE);
+
+ ptr++;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, space);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, page_no);
+
+ return(ptr);
+}
+
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_dulint.
+@return parsed record end, NULL if not a complete record or a corrupt record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+ ulint type, /*!< in: log record type: MLOG_1BYTE, ... */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip)/*!< in/out: compressed page, or NULL */
+{
+ ulint offset;
+ ulint val;
+ dulint dval;
+
+ ut_a(type <= MLOG_8BYTES);
+ ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (offset >= UNIV_PAGE_SIZE) {
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (type == MLOG_8BYTES) {
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, &dval);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_8
+ (((page_zip_des_t*) page_zip)->data
+ + offset, dval);
+ }
+ mach_write_to_8(page + offset, dval);
+ }
+
+ return(ptr);
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &val);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ switch (type) {
+ case MLOG_1BYTE:
+ if (UNIV_UNLIKELY(val > 0xFFUL)) {
+ goto corrupt;
+ }
+ if (page) {
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_1
+ (((page_zip_des_t*) page_zip)->data
+ + offset, val);
+ }
+ mach_write_to_1(page + offset, val);
+ }
+ break;
+ case MLOG_2BYTES:
+ if (UNIV_UNLIKELY(val > 0xFFFFUL)) {
+ goto corrupt;
+ }
+ if (page) {
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_2
+ (((page_zip_des_t*) page_zip)->data
+ + offset, val);
+ }
+ mach_write_to_2(page + offset, val);
+ }
+ break;
+ case MLOG_4BYTES:
+ if (page) {
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_4
+ (((page_zip_des_t*) page_zip)->data
+ + offset, val);
+ }
+ mach_write_to_4(page + offset, val);
+ }
+ break;
+ default:
+ corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+ ptr = NULL;
+ }
+
+ return(ptr);
+}
+
+/********************************************************//**
+Writes 1 - 4 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+ byte* ptr, /*!< in: pointer where to write */
+ ulint val, /*!< in: value to write */
+ byte type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ switch (type) {
+ case MLOG_1BYTE:
+ mach_write_to_1(ptr, val);
+ break;
+ case MLOG_2BYTES:
+ mach_write_to_2(ptr, val);
+ break;
+ case MLOG_4BYTES:
+ mach_write_to_4(ptr, val);
+ break;
+ default:
+ ut_error;
+ }
+
+ log_ptr = mlog_open(mtr, 11 + 2 + 5);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr);
+
+ mach_write_to_2(log_ptr, page_offset(ptr));
+ log_ptr += 2;
+
+ log_ptr += mach_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
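
A minimal sketch of the buffer budget reserved above: at most 11 bytes for the initial record, 2 bytes for the page offset, and at most 5 bytes for a compressed 32-bit value (9 bytes for the compressed dulint written by mlog_write_dulint() below). Illustrative only, not code from the sources:

    #include <stdio.h>

    /* Worst-case number of bytes one such log record can occupy. */
    static int
    mlog_rec_budget(int is_8bytes)
    {
            return(11 + 2 + (is_8bytes ? 9 : 5));
    }

    int
    main(void)
    {
            printf("ulint record:  at most %d bytes\n", mlog_rec_budget(0));
            printf("dulint record: at most %d bytes\n", mlog_rec_budget(1));
            return(0);
    }
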
+
+/********************************************************//**
+Writes 8 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_dulint(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ dulint val, /*!< in: value to write */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(ptr && mtr);
+
+ mach_write_to_8(ptr, val);
+
+ log_ptr = mlog_open(mtr, 11 + 2 + 9);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_8BYTES,
+ log_ptr, mtr);
+
+ mach_write_to_2(log_ptr, page_offset(ptr));
+ log_ptr += 2;
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, val);
+
+ mlog_close(mtr, log_ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+ byte* ptr, /*!< in: pointer where to write */
+ const byte* str, /*!< in: string to write */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ut_ad(ptr && mtr);
+ ut_a(len < UNIV_PAGE_SIZE);
+
+ memcpy(ptr, str, len);
+
+ mlog_log_string(ptr, len, mtr);
+}
+
+/********************************************************//**
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+ byte* ptr, /*!< in: pointer written to */
+ ulint len, /*!< in: string length */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(ptr && mtr);
+ ut_ad(len <= UNIV_PAGE_SIZE);
+
+ log_ptr = mlog_open(mtr, 30);
+
+ /* If no logging is requested, we may return now */
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_WRITE_STRING,
+ log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(ptr));
+ log_ptr += 2;
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, ptr, len);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_write_string.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ byte* page, /*!< in: page where to apply the log record, or NULL */
+ void* page_zip)/*!< in/out: compressed page, or NULL */
+{
+ ulint offset;
+ ulint len;
+
+ ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+ if (end_ptr < ptr + 4) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+ len = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+ || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) {
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ memcpy(((page_zip_des_t*) page_zip)->data
+ + offset, ptr, len);
+ }
+ memcpy(page + offset, ptr, len);
+ }
+
+ return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index.
+@return buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* rec, /*!< in: index record or page */
+ dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: log item type */
+ ulint size) /*!< in: requested buffer size in bytes
+ (if 0, calls mlog_close() and returns NULL) */
+{
+ byte* log_ptr;
+ const byte* log_start;
+ const byte* log_end;
+
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+ if (!page_rec_is_comp(rec)) {
+ log_start = log_ptr = mlog_open(mtr, 11 + size);
+ if (!log_ptr) {
+ return(NULL); /* logging is disabled */
+ }
+ log_ptr = mlog_write_initial_log_record_fast(rec, type,
+ log_ptr, mtr);
+ log_end = log_ptr + 11 + size;
+ } else {
+ ulint i;
+ ulint n = dict_index_get_n_fields(index);
+ /* total size needed */
+ ulint total = 11 + size + (n + 2) * 2;
+ ulint alloc = total;
+ /* allocate at most DYN_ARRAY_DATA_SIZE at a time */
+ if (alloc > DYN_ARRAY_DATA_SIZE) {
+ alloc = DYN_ARRAY_DATA_SIZE;
+ }
+ log_start = log_ptr = mlog_open(mtr, alloc);
+ if (!log_ptr) {
+ return(NULL); /* logging is disabled */
+ }
+ log_end = log_ptr + alloc;
+ log_ptr = mlog_write_initial_log_record_fast(rec, type,
+ log_ptr, mtr);
+ mach_write_to_2(log_ptr, n);
+ log_ptr += 2;
+ mach_write_to_2(log_ptr,
+ dict_index_get_n_unique_in_tree(index));
+ log_ptr += 2;
+ for (i = 0; i < n; i++) {
+ dict_field_t* field;
+ const dict_col_t* col;
+ ulint len;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ len = field->fixed_len;
+ ut_ad(len < 0x7fff);
+ if (len == 0
+ && (col->len > 255 || col->mtype == DATA_BLOB)) {
+ /* variable-length field
+ with maximum length > 255 */
+ len = 0x7fff;
+ }
+ if (col->prtype & DATA_NOT_NULL) {
+ len |= 0x8000;
+ }
+ if (log_ptr + 2 > log_end) {
+ mlog_close(mtr, log_ptr);
+ ut_a(total > (ulint) (log_ptr - log_start));
+ total -= log_ptr - log_start;
+ alloc = total;
+ if (alloc > DYN_ARRAY_DATA_SIZE) {
+ alloc = DYN_ARRAY_DATA_SIZE;
+ }
+ log_start = log_ptr = mlog_open(mtr, alloc);
+ if (!log_ptr) {
+ return(NULL); /* logging is disabled */
+ }
+ log_end = log_ptr + alloc;
+ }
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+ }
+ }
+ if (size == 0) {
+ mlog_close(mtr, log_ptr);
+ log_ptr = NULL;
+ } else if (log_ptr + size > log_end) {
+ mlog_close(mtr, log_ptr);
+ log_ptr = mlog_open(mtr, size);
+ }
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_open_and_write_index.
+@return parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+ byte* ptr, /*!< in: buffer */
+ const byte* end_ptr,/*!< in: buffer end */
+ ibool comp, /*!< in: TRUE=compact record format */
+ dict_index_t** index) /*!< out, own: dummy index */
+{
+ ulint i, n, n_uniq;
+ dict_table_t* table;
+ dict_index_t* ind;
+
+ ut_ad(comp == FALSE || comp == TRUE);
+
+ if (comp) {
+ if (end_ptr < ptr + 4) {
+ return(NULL);
+ }
+ n = mach_read_from_2(ptr);
+ ptr += 2;
+ n_uniq = mach_read_from_2(ptr);
+ ptr += 2;
+ ut_ad(n_uniq <= n);
+ if (end_ptr < ptr + n * 2) {
+ return(NULL);
+ }
+ } else {
+ n = n_uniq = 1;
+ }
+ table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n,
+ comp ? DICT_TF_COMPACT : 0);
+ ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY",
+ DICT_HDR_SPACE, 0, n);
+ ind->table = table;
+ ind->n_uniq = (unsigned int) n_uniq;
+ if (n_uniq != n) {
+ ut_a(n_uniq + DATA_ROLL_PTR <= n);
+ ind->type = DICT_CLUSTERED;
+ }
+ if (comp) {
+ for (i = 0; i < n; i++) {
+ ulint len = mach_read_from_2(ptr);
+ ptr += 2;
+ /* The high-order bit of len is the NOT NULL flag;
+ the rest is 0 or 0x7fff for variable-length fields,
+ and 1..0x7ffe for fixed-length fields. */
+ dict_mem_table_add_col(
+ table, NULL, NULL,
+ ((len + 1) & 0x7fff) <= 1
+ ? DATA_BINARY : DATA_FIXBINARY,
+ len & 0x8000 ? DATA_NOT_NULL : 0,
+ len & 0x7fff);
+
+ dict_index_add_col(ind, table,
+ dict_table_get_nth_col(table, i),
+ 0);
+ }
+ dict_table_add_system_columns(table, table->heap);
+ if (n_uniq != n) {
+ /* Identify DB_TRX_ID and DB_ROLL_PTR in the index. */
+ ut_a(DATA_TRX_ID_LEN
+ == dict_index_get_nth_col(ind, DATA_TRX_ID - 1
+ + n_uniq)->len);
+ ut_a(DATA_ROLL_PTR_LEN
+ == dict_index_get_nth_col(ind, DATA_ROLL_PTR - 1
+ + n_uniq)->len);
+ ind->fields[DATA_TRX_ID - 1 + n_uniq].col
+ = &table->cols[n + DATA_TRX_ID];
+ ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col
+ = &table->cols[n + DATA_ROLL_PTR];
+ }
+ }
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ ind->cached = TRUE;
+ *index = ind;
+ return(ptr);
+}
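
A minimal standalone sketch of the 16-bit field descriptor written by mlog_open_and_write_index() and decoded by mlog_parse_index() above: bit 15 flags NOT NULL, and the low 15 bits are 0 or 0x7fff for variable-length fields and 1..0x7ffe for fixed-length fields. The sample descriptors are arbitrary:

    #include <stdio.h>

    /* Decode one 16-bit field descriptor the way mlog_parse_index() does. */
    static void
    decode_field(unsigned int d)
    {
            unsigned int    not_null = (d & 0x8000) != 0;
            unsigned int    len = d & 0x7fff;
            unsigned int    variable = ((len + 1) & 0x7fff) <= 1;

            printf("not_null=%u variable=%u fixed_len=%u\n",
                   not_null, variable, variable ? 0 : len);
    }

    int
    main(void)
    {
            decode_field(0xffff);   /* long variable-length, NOT NULL */
            decode_field(0x0004);   /* fixed length 4, nullable */
            decode_field(0x8000);   /* short variable-length, NOT NULL */
            return(0);
    }
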
diff --git a/storage/xtradb/mtr/mtr0mtr.c b/storage/xtradb/mtr/mtr0mtr.c
new file mode 100644
index 00000000000..34e6d3ffc92
--- /dev/null
+++ b/storage/xtradb/mtr/mtr0mtr.c
@@ -0,0 +1,401 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0mtr.c
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+
+#ifdef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0types.h"
+#include "mtr0log.h"
+#include "log0log.h"
+#include "buf0flu.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "log0recv.h"
+/*****************************************************************//**
+Releases the item in the slot given. */
+UNIV_INLINE
+void
+mtr_memo_slot_release(
+/*==================*/
+ mtr_t* mtr, /*!< in: mtr */
+ mtr_memo_slot_t* slot) /*!< in: memo slot */
+{
+ void* object;
+ ulint type;
+
+ ut_ad(mtr && slot);
+
+ object = slot->object;
+ type = slot->type;
+
+ if (UNIV_LIKELY(object != NULL)) {
+ if (type <= MTR_MEMO_BUF_FIX) {
+ buf_page_release((buf_block_t*)object, type, mtr);
+ } else if (type == MTR_MEMO_S_LOCK) {
+ rw_lock_s_unlock((rw_lock_t*)object);
+#ifdef UNIV_DEBUG
+ } else if (type != MTR_MEMO_X_LOCK) {
+ ut_ad(type == MTR_MEMO_MODIFY);
+ ut_ad(mtr_memo_contains(mtr, object,
+ MTR_MEMO_PAGE_X_FIX));
+#endif /* UNIV_DEBUG */
+ } else {
+ rw_lock_x_unlock((rw_lock_t*)object);
+ }
+ }
+
+ slot->object = NULL;
+}
+
+/**********************************************************//**
+Releases the mlocks and other objects stored in an mtr memo. They are
+released in the opposite order to that in which they were pushed onto the
+memo. NOTE! It is
+essential that the x-rw-lock on a modified buffer page is not released before
+buf_page_note_modification is called for that page! Otherwise, some thread
+might race to modify it, and the flush list sort order on lsn would be
+destroyed. */
+UNIV_INLINE
+void
+mtr_memo_pop_all(
+/*=============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+ commit */
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+ slot = dyn_array_get_element(memo, offset);
+
+ mtr_memo_slot_release(mtr, slot);
+ }
+}
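
A minimal standalone sketch of the backwards memo walk above: slots are fixed-size records appended to a growable array, and they are released from the end towards the start, i.e. in LIFO order. The slot contents here are placeholders:

    #include <stdio.h>

    struct slot {
            int     object;         /* stands in for the latched object */
    };

    int
    main(void)
    {
            struct slot     memo[4] = {{1}, {2}, {3}, {4}};
            unsigned long   offset = sizeof(memo); /* = data size in bytes */

            while (offset > 0) {
                    offset -= sizeof(struct slot);
                    printf("release %d\n",
                           memo[offset / sizeof(struct slot)].object);
            }
            /* prints 4, 3, 2, 1: objects are released in LIFO order */
            return(0);
    }
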
+
+UNIV_INLINE
+void
+mtr_memo_note_modification_all(
+/*===========================*/
+ mtr_t* mtr) /* in: mtr */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+ commit */
+ ut_ad(mtr->modifications);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+ slot = dyn_array_get_element(memo, offset);
+
+ if (UNIV_LIKELY(slot->object != NULL) &&
+ slot->type == MTR_MEMO_PAGE_X_FIX) {
+ buf_flush_note_modification(
+ (buf_block_t*)slot->object, mtr);
+ }
+ }
+}
+
+/************************************************************//**
+Writes the contents of a mini-transaction log, if any, to the database log. */
+static
+void
+mtr_log_reserve_and_write(
+/*======================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dyn_array_t* mlog;
+ dyn_block_t* block;
+ ulint data_size;
+ byte* first_data;
+
+ ut_ad(mtr);
+
+ mlog = &(mtr->log);
+
+ first_data = dyn_block_get_data(mlog);
+
+ if (mtr->n_log_recs > 1) {
+ mlog_catenate_ulint(mtr, MLOG_MULTI_REC_END, MLOG_1BYTE);
+ } else {
+ *first_data = (byte)((ulint)*first_data
+ | MLOG_SINGLE_REC_FLAG);
+ }
+
+ if (mlog->heap == NULL) {
+ mtr->end_lsn = log_reserve_and_write_fast(
+ first_data, dyn_block_get_used(mlog),
+ &mtr->start_lsn);
+ if (mtr->end_lsn) {
+
+ return;
+ }
+ }
+
+ data_size = dyn_array_get_data_size(mlog);
+
+ /* Open the database log for log_write_low */
+ mtr->start_lsn = log_reserve_and_open(data_size);
+
+ if (mtr->log_mode == MTR_LOG_ALL) {
+
+ block = mlog;
+
+ while (block != NULL) {
+ log_write_low(dyn_block_get_data(block),
+ dyn_block_get_used(block));
+ block = dyn_array_get_next_block(mlog, block);
+ }
+ } else {
+ ut_ad(mtr->log_mode == MTR_LOG_NONE);
+ /* Do nothing */
+ }
+
+ mtr->end_lsn = log_close();
+}
+#endif /* !UNIV_HOTBACKUP */
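
A minimal standalone sketch of the single- versus multi-record handling in mtr_log_reserve_and_write() above; the flag and end-marker values used here are assumed stand-ins for MLOG_SINGLE_REC_FLAG and MLOG_MULTI_REC_END:

    #include <stdio.h>

    #define SINGLE_REC_FLAG 0x80    /* assumed stand-in for MLOG_SINGLE_REC_FLAG */
    #define MULTI_REC_END   31      /* assumed stand-in for MLOG_MULTI_REC_END */

    int
    main(void)
    {
            unsigned char   log[64];
            unsigned long   used = 1;       /* bytes already in the mtr log */
            unsigned long   n_recs = 1;     /* records in this mtr */

            log[0] = 0x26;                  /* type byte of the first record */

            if (n_recs > 1) {
                    /* terminate a multi-record group with an end marker */
                    log[used++] = MULTI_REC_END;
            } else {
                    /* flag a single-record group in its first type byte */
                    log[0] |= SINGLE_REC_FLAG;
            }

            printf("first byte 0x%02x, %lu byte(s) of log\n",
                   (unsigned) log[0], used);
            return(0);
    }
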
+
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+#ifndef UNIV_HOTBACKUP
+ ibool write_log;
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_d(mtr->state = MTR_COMMITTING);
+
+#ifndef UNIV_HOTBACKUP
+ /* This is a dirty read, for debugging. */
+ ut_ad(!recv_no_log_write);
+ write_log = mtr->modifications && mtr->n_log_recs;
+
+ if (write_log) {
+ mtr_log_reserve_and_write(mtr);
+
+ mtr_memo_note_modification_all(mtr);
+ }
+
+ /* We first update the modification info to buffer pages, and only
+ after that release the log mutex: this guarantees that when the log
+ mutex is free, all buffer pages contain an up-to-date info of their
+ modifications. This fact is used in making a checkpoint when we look
+ at the oldest modification of any page in the buffer pool. It is also
+ required when we insert modified buffer pages in to the flush list
+ which must be sorted on oldest_modification. */
+
+ if (write_log) {
+ log_release();
+ }
+
+ /* All unlocking has been moved here, after log_sys mutex release. */
+ mtr_memo_pop_all(mtr);
+
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_d(mtr->state = MTR_COMMITTED);
+ dyn_array_free(&(mtr->memo));
+ dyn_array_free(&(mtr->log));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the latches stored in an mtr memo down to a savepoint.
+NOTE! The mtr must not have made changes to buffer pages after the
+savepoint, as these can be handled only by mtr_commit. */
+UNIV_INTERN
+void
+mtr_rollback_to_savepoint(
+/*======================*/
+ mtr_t* mtr, /*!< in: mtr */
+ ulint savepoint) /*!< in: savepoint */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+ ut_ad(offset >= savepoint);
+
+ while (offset > savepoint) {
+ offset -= sizeof(mtr_memo_slot_t);
+
+ slot = dyn_array_get_element(memo, offset);
+
+ ut_ad(slot->type != MTR_MEMO_MODIFY);
+ mtr_memo_slot_release(mtr, slot);
+ }
+}
+
+/***************************************************//**
+Releases an object in the memo stack. */
+UNIV_INTERN
+void
+mtr_memo_release(
+/*=============*/
+ mtr_t* mtr, /*!< in: mtr */
+ void* object, /*!< in: object */
+ ulint type) /*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+ mtr_memo_slot_t* slot;
+ dyn_array_t* memo;
+ ulint offset;
+
+ ut_ad(mtr);
+ ut_ad(mtr->magic_n == MTR_MAGIC_N);
+ ut_ad(mtr->state == MTR_ACTIVE);
+
+ memo = &(mtr->memo);
+
+ offset = dyn_array_get_data_size(memo);
+
+ while (offset > 0) {
+ offset -= sizeof(mtr_memo_slot_t);
+
+ slot = dyn_array_get_element(memo, offset);
+
+ if ((object == slot->object) && (type == slot->type)) {
+ if (mtr->modifications &&
+ UNIV_LIKELY(slot->object != NULL) &&
+ slot->type == MTR_MEMO_PAGE_X_FIX) {
+ buf_flush_note_modification(
+ (buf_block_t*)slot->object, mtr);
+ }
+
+ mtr_memo_slot_release(mtr, slot);
+
+ break;
+ }
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ ulint type, /*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+ if (type == MLOG_1BYTE) {
+ return(mach_read_from_1(ptr));
+ } else if (type == MLOG_2BYTES) {
+ return(mach_read_from_2(ptr));
+ } else {
+ ut_ad(type == MLOG_4BYTES);
+ return(mach_read_from_4(ptr));
+ }
+}
+
+/********************************************************//**
+Reads 8 bytes from a file page buffered in the buffer pool.
+@return value read */
+UNIV_INTERN
+dulint
+mtr_read_dulint(
+/*============*/
+ const byte* ptr, /*!< in: pointer from where to read */
+ mtr_t* mtr __attribute__((unused)))
+ /*!< in: mini-transaction handle */
+{
+ ut_ad(mtr->state == MTR_ACTIVE);
+ ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+ return(mach_read_from_8(ptr));
+}
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the given page.
+@return TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+ mtr_t* mtr, /*!< in: mtr */
+ const byte* ptr, /*!< in: pointer to buffer frame */
+ ulint type) /*!< in: type of object */
+{
+ return(mtr_memo_contains(mtr, buf_block_align(ptr), type));
+}
+
+/*********************************************************//**
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ fprintf(stderr,
+ "Mini-transaction handle: memo size %lu bytes"
+ " log size %lu bytes\n",
+ (ulong) dyn_array_get_data_size(&(mtr->memo)),
+ (ulong) dyn_array_get_data_size(&(mtr->log)));
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
new file mode 100644
index 00000000000..48d796c38e1
--- /dev/null
+++ b/storage/xtradb/os/os0file.c
@@ -0,0 +1,4604 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.c
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "ut0mem.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "log0recv.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+# include "os0thread.h"
+#else /* !UNIV_HOTBACKUP */
+# ifdef __WIN__
+/* Add includes for the _stat() call to compile on Windows */
+# include <sys/types.h>
+# include <sys/stat.h>
+# include <errno.h>
+# endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask
+ = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+UNIV_INTERN ulint os_innodb_umask = 0;
+#endif
+
+#ifdef UNIV_DO_FLUSH
+/* If the following is set to TRUE, we do not call os_file_flush in every
+os_file_write. We can set this TRUE when the doublewrite buffer is used. */
+UNIV_INTERN ibool os_do_not_call_flush_at_each_write = FALSE;
+#else
+/* We do not call os_file_flush in every os_file_write. */
+#endif /* UNIV_DO_FLUSH */
+
+#ifdef UNIV_HOTBACKUP
+# define os_aio_use_native_aio FALSE
+#else /* UNIV_HOTBACKUP */
+/* We use these mutexes to protect lseek + file i/o operation, if the
+OS does not provide an atomic pread or pwrite, or similar */
+#define OS_FILE_N_SEEK_MUTEXES 16
+UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
+
+/* In simulated aio, merge at most this many consecutive i/os */
+#define OS_AIO_MERGE_N_CONSECUTIVE 64
+
+/** If this flag is TRUE, then we will use the native aio of the
+OS (provided Innobase was compiled with support for it); otherwise we
+will use the simulated aio that we build below with threads */
+
+UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
+
+/** Flag: enable debug printout for asynchronous i/o */
+UNIV_INTERN ibool os_aio_print_debug = FALSE;
+
+/* State for the state of an IO request in simulated AIO.
+ Protocol for simulated aio:
+ client requests IO: find slot with reserved = FALSE. Add entry with
+ status = OS_AIO_NOT_ISSUED.
+ IO thread wakes: find adjacent slots with reserved = TRUE and status =
+ OS_AIO_NOT_ISSUED. Change status for slots to
+ OS_AIO_ISSUED.
+ IO operation completes: set status for slots to OS_AIO_DONE. set status
+ for the first slot to OS_AIO_CLAIMED and return
+ result for that slot.
+ When there are multiple read and write threads, they all compete to execute
+ the requests in the array (os_aio_array_t). This avoids the need to load
+ balance requests at the time the request is made, at the cost of waking all
+ threads when a request is available.
+*/
+typedef enum {
+ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
+ OS_AIO_ISSUED, /* Being processed by an IO thread. */
+ OS_AIO_DONE, /* Request processed. */
+ OS_AIO_CLAIMED /* Result being returned to client. */
+} os_aio_status;
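
A minimal standalone sketch of the slot life cycle described in the protocol comment above: a reserved slot advances NOT_ISSUED -> ISSUED -> DONE -> CLAIMED before its result is handed back. Illustrative only; the helper below is not from the sources:

    #include <assert.h>
    #include <stdio.h>

    typedef enum {
            NOT_ISSUED,     /* posted by the client, waiting for an i/o thread */
            ISSUED,         /* being processed by an i/o thread */
            DONE,           /* the physical i/o has completed */
            CLAIMED         /* result being returned to the client */
    } slot_status;

    static slot_status
    next_status(slot_status s)
    {
            assert(s != CLAIMED);   /* a CLAIMED slot is freed, not advanced */
            return((slot_status) (s + 1));
    }

    int
    main(void)
    {
            slot_status     s = NOT_ISSUED;

            s = next_status(s);     /* an i/o thread picks the slot up */
            s = next_status(s);     /* the i/o completes */
            s = next_status(s);     /* the result is handed to the caller */

            printf("final state %d (CLAIMED)\n", (int) s);
            return(0);
    }
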
+
+/** The asynchronous i/o array slot structure */
+typedef struct os_aio_slot_struct os_aio_slot_t;
+
+/** The asynchronous i/o array slot structure */
+struct os_aio_slot_struct{
+ ibool is_read; /*!< TRUE if a read operation */
+ ulint pos; /*!< index of the slot in the aio
+ array */
+ ibool reserved; /*!< TRUE if this slot is reserved */
+ os_aio_status status; /* Status for current request. Valid when reserved
+ is TRUE. Used only in simulated aio. */
+ time_t reservation_time;/*!< time when reserved */
+ ulint len; /*!< length of the block to read or
+ write */
+ byte* buf; /*!< buffer used in i/o */
+ ulint type; /*!< OS_FILE_READ or OS_FILE_WRITE */
+ ulint offset; /*!< 32 low bits of file offset in
+ bytes */
+ ulint offset_high; /*!< 32 high bits of file offset */
+ os_file_t file; /*!< file where to read or write */
+ const char* name; /*!< file name or path */
+// ibool io_already_done;/*!< used only in simulated aio:
+// TRUE if the physical i/o already
+// made and only the slot message
+// needs to be passed to the caller
+// of os_aio_simulated_handle */
+ fil_node_t* message1; /*!< message which is given by the */
+ void* message2; /*!< the requester of an aio operation
+ and which can be used to identify
+ which pending aio operation was
+ completed */
+#ifdef WIN_ASYNC_IO
+ os_event_t event; /*!< event object we need in the
+ OVERLAPPED struct */
+ OVERLAPPED control; /*!< Windows control block for the
+ aio request */
+#endif
+};
+
+/** The asynchronous i/o array structure */
+typedef struct os_aio_array_struct os_aio_array_t;
+
+/** The asynchronous i/o array structure */
+struct os_aio_array_struct{
+ os_mutex_t mutex; /*!< the mutex protecting the aio array */
+ os_event_t not_full;
+ /*!< The event which is set to the
+ signaled state when there is space in
+ the aio outside the ibuf segment */
+ os_event_t is_empty;
+ /*!< The event which is set to the
+ signaled state when there are no
+ pending i/os in this array */
+ ulint n_slots;/*!< Total number of slots in the aio
+ array. This must be divisible by
+ n_threads. */
+ ulint n_segments;
+ /*!< Number of segments in the aio
+ array of pending aio requests. A
+ thread can wait separately for any one
+ of the segments. */
+ ulint n_reserved;
+ /*!< Number of reserved slots in the
+ aio array outside the ibuf segment */
+ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
+#ifdef __WIN__
+ os_native_event_t* native_events;
+ /*!< Pointer to an array of OS native
+ event handles where we copied the
+ handles from slots, in the same
+ order. This can be used in
+ WaitForMultipleObjects; used only in
+ Windows */
+#endif
+};
+
+/** Array of events used in simulated aio */
+static os_event_t* os_aio_segment_wait_events = NULL;
+
+/* Number for the first global segment for reading. */
+const ulint os_aio_first_read_segment = 2;
+
+/* Number for the first global segment for writing. Set to
+2 + os_aio_read_write_threads. */
+ulint os_aio_first_write_segment = 0;
+
+/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+are NULL when the module has not yet been initialized. @{ */
+static os_aio_array_t* os_aio_read_array = NULL; /*!< Reads */
+static os_aio_array_t* os_aio_write_array = NULL; /*!< Writes */
+static os_aio_array_t* os_aio_ibuf_array = NULL; /*!< Insert buffer */
+static os_aio_array_t* os_aio_log_array = NULL; /*!< Redo log */
+static os_aio_array_t* os_aio_sync_array = NULL; /*!< Synchronous I/O */
+/* @} */
+
+/* Per thread buffer used for merged IO requests. Used by
+os_aio_simulated_handle so that a buffer doesn't have to be allocated
+for each request. */
+static byte* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
+static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
+
+/** Number of asynchronous I/O segments. Set by os_aio_init(). */
+static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+/** If the following is TRUE, read i/o handler threads try to
+wait until a batch of new read requests have been posted */
+static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+#endif /* UNIV_HOTBACKUP */
+
+UNIV_INTERN ulint os_n_file_reads = 0;
+UNIV_INTERN ulint os_bytes_read_since_printout = 0;
+UNIV_INTERN ulint os_n_file_writes = 0;
+UNIV_INTERN ulint os_n_fsyncs = 0;
+UNIV_INTERN ulint os_n_file_reads_old = 0;
+UNIV_INTERN ulint os_n_file_writes_old = 0;
+UNIV_INTERN ulint os_n_fsyncs_old = 0;
+UNIV_INTERN time_t os_last_printout;
+
+UNIV_INTERN ibool os_has_said_disk_full = FALSE;
+
+#ifndef UNIV_HOTBACKUP
+/** The mutex protecting the following counts of pending I/O operations */
+static os_mutex_t os_file_count_mutex;
+#endif /* !UNIV_HOTBACKUP */
+/** Number of pending os_file_pread() operations */
+UNIV_INTERN ulint os_file_n_pending_preads = 0;
+/** Number of pending os_file_pwrite() operations */
+UNIV_INTERN ulint os_file_n_pending_pwrites = 0;
+/** Number of pending write operations */
+UNIV_INTERN ulint os_n_pending_writes = 0;
+/** Number of pending read operations */
+UNIV_INTERN ulint os_n_pending_reads = 0;
+
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
+UNIV_INTERN
+ulint
+os_get_os_version(void)
+/*===================*/
+{
+#ifdef __WIN__
+ OSVERSIONINFO os_info;
+
+ os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+ ut_a(GetVersionEx(&os_info));
+
+ if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
+ return(OS_WIN31);
+ } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
+ return(OS_WIN95);
+ } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+ if (os_info.dwMajorVersion <= 4) {
+ return(OS_WINNT);
+ } else {
+ return(OS_WIN2000);
+ }
+ } else {
+ ut_error;
+ return(0);
+ }
+#else
+ ut_error;
+
+ return(0);
+#endif
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+ ibool report_all_errors) /*!< in: TRUE if we want an error message
+ printed of all errors */
+{
+ ulint err;
+
+#ifdef __WIN__
+
+ err = (ulint) GetLastError();
+
+ if (report_all_errors
+ || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %lu"
+ " in a file operation.\n", (ulong) err);
+
+ if (err == ERROR_PATH_NOT_FOUND) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == ERROR_ACCESS_DENIED) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory. It may also be"
+ " you have created a subdirectory\n"
+ "InnoDB: of the same name as a data file.\n");
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ fprintf(stderr,
+ "InnoDB: The error means that another program"
+ " is using InnoDB's files.\n"
+ "InnoDB: This might be a backup or antivirus"
+ " software or another instance\n"
+ "InnoDB: of MySQL."
+ " Please close it to get rid of this error.\n");
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ fprintf(stderr,
+				"InnoDB: The error means that there are"
+				" insufficient system resources or quota to"
+ " complete the operation.\n");
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ fprintf(stderr,
+ "InnoDB: The error means that the I/O"
+ " operation has been aborted\n"
+ "InnoDB: because of either a thread exit"
+ " or an application request.\n"
+				"InnoDB: A retry attempt is made.\n");
+ } else {
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ if (err == ERROR_FILE_NOT_FOUND) {
+ return(OS_FILE_NOT_FOUND);
+ } else if (err == ERROR_DISK_FULL) {
+ return(OS_FILE_DISK_FULL);
+ } else if (err == ERROR_FILE_EXISTS) {
+ return(OS_FILE_ALREADY_EXISTS);
+ } else if (err == ERROR_SHARING_VIOLATION
+ || err == ERROR_LOCK_VIOLATION) {
+ return(OS_FILE_SHARING_VIOLATION);
+ } else if (err == ERROR_WORKING_SET_QUOTA
+ || err == ERROR_NO_SYSTEM_RESOURCES) {
+ return(OS_FILE_INSUFFICIENT_RESOURCE);
+ } else if (err == ERROR_OPERATION_ABORTED) {
+ return(OS_FILE_OPERATION_ABORTED);
+ } else {
+ return(100 + err);
+ }
+#else
+ err = (ulint) errno;
+
+ if (report_all_errors
+ || (err != ENOSPC && err != EEXIST)) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Operating system error number %lu"
+ " in a file operation.\n", (ulong) err);
+
+ if (err == ENOENT) {
+ fprintf(stderr,
+ "InnoDB: The error means the system"
+ " cannot find the path specified.\n");
+
+ if (srv_is_being_started) {
+ fprintf(stderr,
+ "InnoDB: If you are installing InnoDB,"
+ " remember that you must create\n"
+ "InnoDB: directories yourself, InnoDB"
+ " does not create them.\n");
+ }
+ } else if (err == EACCES) {
+ fprintf(stderr,
+ "InnoDB: The error means mysqld does not have"
+ " the access rights to\n"
+ "InnoDB: the directory.\n");
+ } else {
+ if (strerror((int)err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %lu"
+ " means '%s'.\n",
+ err, strerror((int)err));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system"
+ " error numbers are described at\n"
+ "InnoDB: "
+ REFMAN
+ "operating-system-error-codes.html\n");
+ }
+ }
+
+ fflush(stderr);
+
+ if (err == ENOSPC) {
+ return(OS_FILE_DISK_FULL);
+ } else if (err == ENOENT) {
+ return(OS_FILE_NOT_FOUND);
+ } else if (err == EEXIST) {
+ return(OS_FILE_ALREADY_EXISTS);
+ } else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
+ return(OS_FILE_PATH_ERROR);
+ } else {
+ return(100 + err);
+ }
+#endif
+}
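+
+/* A minimal usage sketch (hypothetical caller, POSIX flavour): the portable
+code returned above is compared against the OS_FILE_* constants immediately
+after the failed call, before any other OS call can overwrite the error
+number:
+
+	int	file = open(name, O_RDONLY);
+
+	if (file == -1) {
+		ulint	err = os_file_get_last_error(FALSE);
+
+		if (err == OS_FILE_NOT_FOUND) {
+			fprintf(stderr, "missing file %s\n", name);
+		} else if (err == OS_FILE_DISK_FULL) {
+			fprintf(stderr, "disk full\n");
+		}
+	}
+*/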
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+Conditionally exits (calling exit(3)) based on the should_exit value and
+the error type.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_cond_exit(
+/*===========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool should_exit) /*!< in: call exit(3) if unknown error
+ and this parameter is TRUE */
+{
+ ulint err;
+
+ err = os_file_get_last_error(FALSE);
+
+ if (err == OS_FILE_DISK_FULL) {
+ /* We only print a warning about disk full once */
+
+ if (os_has_said_disk_full) {
+
+ return(FALSE);
+ }
+
+ if (name) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Encountered a problem with"
+ " file %s\n", name);
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Disk is full. Try to clean the disk"
+ " to free space.\n");
+
+ os_has_said_disk_full = TRUE;
+
+ fflush(stderr);
+
+ return(FALSE);
+ } else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
+
+ return(TRUE);
+ } else if (err == OS_FILE_ALREADY_EXISTS
+ || err == OS_FILE_PATH_ERROR) {
+
+ return(FALSE);
+ } else if (err == OS_FILE_SHARING_VIOLATION) {
+
+ os_thread_sleep(10000000); /* 10 sec */
+ return(TRUE);
+ } else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
+
+ os_thread_sleep(100000); /* 100 ms */
+ return(TRUE);
+ } else if (err == OS_FILE_OPERATION_ABORTED) {
+
+ os_thread_sleep(100000); /* 100 ms */
+ return(TRUE);
+ } else {
+ if (name) {
+ fprintf(stderr, "InnoDB: File name %s\n", name);
+ }
+
+ fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
+ operation);
+
+ if (should_exit) {
+ fprintf(stderr, "InnoDB: Cannot continue operation.\n");
+
+ fflush(stderr);
+
+ exit(1);
+ }
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error(
+/*=================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation)/*!< in: operation */
+{
+ /* exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(name, operation, TRUE));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation)/*!< in: operation */
+{
+ /* don't exit in case of unknown error */
+ return(os_file_handle_error_cond_exit(name, operation, FALSE));
+}
+
+#undef USE_FILE_LOCK
+#define USE_FILE_LOCK
+#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
+/* InnoDB Hot Backup does not lock the data files.
+ * On Windows, mandatory locking is used.
+ */
+# undef USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/****************************************************************//**
+Obtain an exclusive lock on a file.
+@return 0 on success */
+static
+int
+os_file_lock(
+/*=========*/
+ int fd, /*!< in: file descriptor */
+ const char* name) /*!< in: file name */
+{
+ struct flock lk;
+ lk.l_type = F_WRLCK;
+ lk.l_whence = SEEK_SET;
+ lk.l_start = lk.l_len = 0;
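+	/* l_len == 0 means that the lock covers the whole file, from
+	l_start to the current and any future end of the file. */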
+ if (fcntl(fd, F_SETLK, &lk) == -1) {
+ fprintf(stderr,
+ "InnoDB: Unable to lock %s, error: %d\n", name, errno);
+
+ if (errno == EAGAIN || errno == EACCES) {
+ fprintf(stderr,
+ "InnoDB: Check that you do not already have"
+ " another mysqld process\n"
+ "InnoDB: using the same InnoDB data"
+ " or log files.\n");
+ }
+
+ return(-1);
+ }
+
+ return(0);
+}
+#endif /* USE_FILE_LOCK */
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void)
+/*===================*/
+{
+ ulint i;
+
+ os_file_count_mutex = os_mutex_create(NULL);
+
+ for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
+ os_file_seek_mutexes[i] = os_mutex_create(NULL);
+ }
+}
+
+/***********************************************************************//**
+Creates a temporary file. This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+On Netware, this function simply calls tmpfile(3), because the C run-time
+library of Netware does not expose the delete-on-close flag.
+@return temporary file handle, or NULL on error */
+UNIV_INTERN
+FILE*
+os_file_create_tmpfile(void)
+/*========================*/
+{
+#ifdef __NETWARE__
+ FILE* file = tmpfile();
+#else /* __NETWARE__ */
+ FILE* file = NULL;
+ int fd = innobase_mysql_tmpfile();
+
+ if (fd >= 0) {
+ file = fdopen(fd, "w+b");
+ }
+#endif /* __NETWARE__ */
+
+ if (!file) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: unable to create temporary file;"
+ " errno: %d\n", errno);
+#ifndef __NETWARE__
+ if (fd >= 0) {
+ close(fd);
+ }
+#endif /* !__NETWARE__ */
+ }
+
+ return(file);
+}
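+
+/* A usage sketch (hypothetical caller), combining this function with
+os_file_read_string(), which is defined later in this file:
+
+	FILE*	file = os_file_create_tmpfile();
+	char	buf[1024];
+
+	if (file != NULL) {
+		fprintf(file, "scratch data\n");
+		os_file_read_string(file, buf, sizeof(buf));
+		fclose(file);
+	}
+*/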
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' entries in the directory listing.
+@return directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+ const char* dirname, /*!< in: directory name; it must not
+ contain a trailing '\' or '/' */
+ ibool error_is_fatal) /*!< in: TRUE if we should treat an
+ error as a fatal error; if we try to
+ open symlinks then we do not wish a
+ fatal error if it happens not to be
+ a directory */
+{
+ os_file_dir_t dir;
+#ifdef __WIN__
+ LPWIN32_FIND_DATA lpFindFileData;
+ char path[OS_FILE_MAX_PATH + 3];
+
+ ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
+
+ strcpy(path, dirname);
+ strcpy(path + strlen(path), "\\*");
+
+ /* Note that in Windows opening the 'directory stream' also retrieves
+ the first entry in the directory. Since it is '.', that is no problem,
+ as we will skip over the '.' and '..' entries anyway. */
+
+ lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+
+ dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
+
+ ut_free(lpFindFileData);
+
+ if (dir == INVALID_HANDLE_VALUE) {
+
+ if (error_is_fatal) {
+ os_file_handle_error(dirname, "opendir");
+ }
+
+ return(NULL);
+ }
+
+ return(dir);
+#else
+ dir = opendir(dirname);
+
+ if (dir == NULL && error_is_fatal) {
+ os_file_handle_error(dirname, "opendir");
+ }
+
+ return(dir);
+#endif
+}
+
+/***********************************************************************//**
+Closes a directory stream.
+@return 0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+ os_file_dir_t dir) /*!< in: directory stream */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ret = FindClose(dir);
+
+ if (!ret) {
+ os_file_handle_error_no_exit(NULL, "closedir");
+
+ return(-1);
+ }
+
+ return(0);
+#else
+ int ret;
+
+ ret = closedir(dir);
+
+ if (ret) {
+ os_file_handle_error_no_exit(NULL, "closedir");
+ }
+
+ return(ret);
+#endif
+}
+
+/***********************************************************************//**
+This function returns information about the next file in the directory. We jump
+over the '.' and '..' entries in the directory.
+@return 0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+ const char* dirname,/*!< in: directory name or path */
+ os_file_dir_t dir, /*!< in: directory stream */
+ os_file_stat_t* info) /*!< in/out: buffer where the info is returned */
+{
+#ifdef __WIN__
+ LPWIN32_FIND_DATA lpFindFileData;
+ BOOL ret;
+
+ lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+next_file:
+ ret = FindNextFile(dir, lpFindFileData);
+
+ if (ret) {
+ ut_a(strlen((char *) lpFindFileData->cFileName)
+ < OS_FILE_MAX_PATH);
+
+ if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
+ || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
+
+ goto next_file;
+ }
+
+ strcpy(info->name, (char *) lpFindFileData->cFileName);
+
+ info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
+ + (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
+ << 32);
+
+ if (lpFindFileData->dwFileAttributes
+ & FILE_ATTRIBUTE_REPARSE_POINT) {
+ /* TODO: test Windows symlinks */
+ /* TODO: MySQL has apparently its own symlink
+ implementation in Windows, dbname.sym can
+ redirect a database directory:
+ REFMAN "windows-symbolic-links.html" */
+ info->type = OS_FILE_TYPE_LINK;
+ } else if (lpFindFileData->dwFileAttributes
+ & FILE_ATTRIBUTE_DIRECTORY) {
+ info->type = OS_FILE_TYPE_DIR;
+ } else {
+ /* It is probably safest to assume that all other
+ file types are normal. Better to check them rather
+ than blindly skip them. */
+
+ info->type = OS_FILE_TYPE_FILE;
+ }
+ }
+
+ ut_free(lpFindFileData);
+
+ if (ret) {
+ return(0);
+ } else if (GetLastError() == ERROR_NO_MORE_FILES) {
+
+ return(1);
+ } else {
+ os_file_handle_error_no_exit(dirname,
+ "readdir_next_file");
+ return(-1);
+ }
+#else
+ struct dirent* ent;
+ char* full_path;
+ int ret;
+ struct stat statinfo;
+#ifdef HAVE_READDIR_R
+ char dirent_buf[sizeof(struct dirent)
+ + _POSIX_PATH_MAX + 100];
+ /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
+ the max file name len; but in most standards, the
+ length is NAME_MAX; we add 100 to be even safer */
+#endif
+
+next_file:
+
+#ifdef HAVE_READDIR_R
+ ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
+
+ if (ret != 0
+#ifdef UNIV_AIX
+	/* On AIX, a failed readdir_r() call is indicated only by a
+	non-NULL 'ent' (result) value together with a non-zero 'ret'
+	(return) value. A NULL 'ent' with a non-zero 'ret' indicates
+	that the end of the directory has been reached. */
+ && ent != NULL
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: cannot read directory %s, error %lu\n",
+ dirname, (ulong)ret);
+
+ return(-1);
+ }
+
+ if (ent == NULL) {
+ /* End of directory */
+
+ return(1);
+ }
+
+ ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
+#else
+ ent = readdir(dir);
+
+ if (ent == NULL) {
+
+ return(1);
+ }
+#endif
+ ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
+
+ if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
+
+ goto next_file;
+ }
+
+ strcpy(info->name, ent->d_name);
+
+ full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
+
+ sprintf(full_path, "%s/%s", dirname, ent->d_name);
+
+ ret = stat(full_path, &statinfo);
+
+ if (ret) {
+
+ if (errno == ENOENT) {
+ /* readdir() returned a file that does not exist,
+ it must have been deleted in the meantime. Do what
+ would have happened if the file was deleted before
+ readdir() - ignore and go to the next entry.
+ If this is the last entry then info->name will still
+ contain the name of the deleted file when this
+ function returns, but this is not an issue since the
+ caller shouldn't be looking at info when end of
+ directory is returned. */
+
+ ut_free(full_path);
+
+ goto next_file;
+ }
+
+ os_file_handle_error_no_exit(full_path, "stat");
+
+ ut_free(full_path);
+
+ return(-1);
+ }
+
+ info->size = (ib_int64_t)statinfo.st_size;
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ info->type = OS_FILE_TYPE_FILE;
+ } else {
+ info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ ut_free(full_path);
+
+ return(0);
+#endif
+}
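+
+/* A minimal directory-scan sketch (hypothetical caller, assuming `path`
+names an existing directory) tying together os_file_opendir(),
+os_file_readdir_next_file() and os_file_closedir(); the return value 0
+means an entry was read, 1 means the end of the directory, -1 an error:
+
+	os_file_dir_t	dir;
+	os_file_stat_t	info;
+	int		ret;
+
+	dir = os_file_opendir(path, TRUE);
+
+	if (dir != NULL) {
+		while ((ret = os_file_readdir_next_file(path, dir, &info))
+		       == 0) {
+			fprintf(stderr, "entry %s\n", info.name);
+		}
+
+		os_file_closedir(dir);
+	}
+*/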
+
+/*****************************************************************//**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is TRUE.
+@return TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+ const char* pathname, /*!< in: directory name as
+ null-terminated string */
+ ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory
+ is treated as an error. */
+{
+#ifdef __WIN__
+ BOOL rcode;
+
+ rcode = CreateDirectory((LPCTSTR) pathname, NULL);
+ if (!(rcode != 0
+ || (GetLastError() == ERROR_ALREADY_EXISTS
+ && !fail_if_exists))) {
+ /* failure */
+ os_file_handle_error(pathname, "CreateDirectory");
+
+ return(FALSE);
+ }
+
+ return (TRUE);
+#else
+ int rcode;
+
+ rcode = mkdir(pathname, 0770);
+
+ if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
+ /* failure */
+ os_file_handle_error(pathname, "mkdir");
+
+ return(FALSE);
+ }
+
+ return (TRUE);
+#endif
+}
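+
+/* A usage sketch (hypothetical caller): create a directory, treating a
+pre-existing directory as success:
+
+	if (!os_file_create_directory(path, FALSE)) {
+		fprintf(stderr, "cannot create %s\n", path);
+	}
+*/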
+
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+ opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error), or
+ OS_FILE_CREATE_PATH if new file
+ (if exists, error) and subdirectories along
+ its path are created (if needed)*/
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY or
+ OS_FILE_READ_WRITE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+ os_file_t file;
+ DWORD create_flag;
+ DWORD access;
+ DWORD attributes = 0;
+ ibool retry;
+
+try_again:
+ ut_a(name);
+
+ if (create_mode == OS_FILE_OPEN) {
+ create_flag = OPEN_EXISTING;
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+ /* create subdirs along the path if needed */
+ *success = os_file_create_subdirs_if_needed(name);
+ if (!*success) {
+ ut_error;
+ }
+ create_flag = CREATE_NEW;
+ create_mode = OS_FILE_CREATE;
+ } else {
+ create_flag = 0;
+ ut_error;
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else {
+ access = 0;
+ ut_error;
+ }
+
+ file = CreateFile((LPCTSTR) name,
+ access,
+ FILE_SHARE_READ | FILE_SHARE_WRITE,
+ /* file can be read and written also
+ by other processes */
+ NULL, /* default security attributes */
+ create_flag,
+ attributes,
+ NULL); /*!< no template file */
+
+ if (file == INVALID_HANDLE_VALUE) {
+ *success = FALSE;
+
+ retry = os_file_handle_error(name,
+ create_mode == OS_FILE_OPEN ?
+ "open" : "create");
+ if (retry) {
+ goto try_again;
+ }
+ } else {
+ *success = TRUE;
+ }
+
+ return(file);
+#else /* __WIN__ */
+ os_file_t file;
+ int create_flag;
+ ibool retry;
+
+try_again:
+ ut_a(name);
+
+ if (create_mode == OS_FILE_OPEN) {
+ if (access_type == OS_FILE_READ_ONLY) {
+ create_flag = O_RDONLY;
+ } else {
+ create_flag = O_RDWR;
+ }
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ } else if (create_mode == OS_FILE_CREATE_PATH) {
+ /* create subdirs along the path if needed */
+ *success = os_file_create_subdirs_if_needed(name);
+ if (!*success) {
+ return (-1);
+ }
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ create_mode = OS_FILE_CREATE;
+ } else {
+ create_flag = 0;
+ ut_error;
+ }
+
+ if (create_mode == OS_FILE_CREATE) {
+ file = open(name, create_flag, S_IRUSR | S_IWUSR
+ | S_IRGRP | S_IWGRP);
+ } else {
+ file = open(name, create_flag);
+ }
+
+ if (file == -1) {
+ *success = FALSE;
+
+ retry = os_file_handle_error(name,
+ create_mode == OS_FILE_OPEN ?
+ "open" : "create");
+ if (retry) {
+ goto try_again;
+ }
+#ifdef USE_FILE_LOCK
+ } else if (access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) {
+ *success = FALSE;
+ close(file);
+ file = -1;
+#endif
+ } else {
+ *success = TRUE;
+ }
+
+ return(file);
+#endif /* __WIN__ */
+}
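+
+/* A usage sketch (hypothetical caller, assuming `name` holds the path):
+open an existing file read-only and close it with os_file_close(), which is
+defined later in this file:
+
+	ibool		success;
+	os_file_t	file;
+
+	file = os_file_create_simple(name, OS_FILE_OPEN,
+				     OS_FILE_READ_ONLY, &success);
+
+	if (success) {
+		... read from the handle ...
+		os_file_close(file);
+	}
+*/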
+
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling(
+/*====================================*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error) */
+ ulint access_type,/*!< in: OS_FILE_READ_ONLY,
+ OS_FILE_READ_WRITE, or
+ OS_FILE_READ_ALLOW_DELETE; the last option is
+ used by a backup program reading the file */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+ os_file_t file;
+ DWORD create_flag;
+ DWORD access;
+ DWORD attributes = 0;
+ DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
+
+ ut_a(name);
+
+ if (create_mode == OS_FILE_OPEN) {
+ create_flag = OPEN_EXISTING;
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else {
+ create_flag = 0;
+ ut_error;
+ }
+
+ if (access_type == OS_FILE_READ_ONLY) {
+ access = GENERIC_READ;
+ } else if (access_type == OS_FILE_READ_WRITE) {
+ access = GENERIC_READ | GENERIC_WRITE;
+ } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+ access = GENERIC_READ;
+ share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
+ | FILE_SHARE_WRITE; /*!< A backup program has to give
+ mysqld the maximum freedom to
+ do what it likes with the
+ file */
+ } else {
+ access = 0;
+ ut_error;
+ }
+
+ file = CreateFile((LPCTSTR) name,
+ access,
+ share_mode,
+ NULL, /* default security attributes */
+ create_flag,
+ attributes,
+ NULL); /*!< no template file */
+
+ if (file == INVALID_HANDLE_VALUE) {
+ *success = FALSE;
+ } else {
+ *success = TRUE;
+ }
+
+ return(file);
+#else /* __WIN__ */
+ os_file_t file;
+ int create_flag;
+
+ ut_a(name);
+
+ if (create_mode == OS_FILE_OPEN) {
+ if (access_type == OS_FILE_READ_ONLY) {
+ create_flag = O_RDONLY;
+ } else {
+ create_flag = O_RDWR;
+ }
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ } else {
+ create_flag = 0;
+ ut_error;
+ }
+
+ if (create_mode == OS_FILE_CREATE) {
+ file = open(name, create_flag, S_IRUSR | S_IWUSR
+ | S_IRGRP | S_IWGRP);
+ } else {
+ file = open(name, create_flag);
+ }
+
+ if (file == -1) {
+ *success = FALSE;
+#ifdef USE_FILE_LOCK
+ } else if (access_type == OS_FILE_READ_WRITE
+ && os_file_lock(file, name)) {
+ *success = FALSE;
+ close(file);
+ file = -1;
+#endif
+ } else {
+ *success = TRUE;
+ }
+
+ return(file);
+#endif /* __WIN__ */
+}
+
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+ int fd, /*!< in: file descriptor to alter */
+ const char* file_name, /*!< in: file name, used in the
+ diagnostic message */
+ const char* operation_name) /*!< in: "open" or "create"; used in the
+ diagnostic message */
+{
+ /* some versions of Solaris may not have DIRECTIO_ON */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+ if (directio(fd, DIRECTIO_ON) == -1) {
+ int errno_save;
+ errno_save = (int)errno;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Failed to set DIRECTIO_ON "
+ "on file %s: %s: %s, continuing anyway\n",
+ file_name, operation_name, strerror(errno_save));
+ }
+#elif defined(O_DIRECT)
+ if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
+ int errno_save;
+ errno_save = (int)errno;
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Failed to set O_DIRECT "
+ "on file %s: %s: %s, continuing anyway\n",
+ file_name, operation_name, strerror(errno_save));
+ if (errno_save == EINVAL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: O_DIRECT is known to result in "
+ "'Invalid argument' on Linux on tmpfs, "
+ "see MySQL Bug#26662\n");
+ }
+ }
+#endif
+}
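+
+/* Note: when O_DIRECT (or DIRECTIO_ON) is in effect, the I/O buffers and
+file offsets passed to the read and write calls are generally expected to
+be block-aligned; in this file the buffers are aligned to UNIV_PAGE_SIZE
+(see, for example, os_file_set_size() below). */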
+
+/****************************************************************//**
+Opens an existing file or creates a new one.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create(
+/*===========*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ ulint create_mode,/*!< in: OS_FILE_OPEN if an existing file
+ is opened (if does not exist, error), or
+ OS_FILE_CREATE if a new file is created
+ (if exists, error),
+ OS_FILE_OVERWRITE if a new file is created
+ or an old overwritten;
+ OS_FILE_OPEN_RAW, if a raw device or disk
+ partition should be opened */
+ ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+ non-buffered i/o is desired,
+ OS_FILE_NORMAL, if any normal file;
+ NOTE that it also depends on type, os_aio_..
+ and srv_.. variables whether we really use
+ async i/o or unbuffered i/o: look in the
+ function source code for the exact rules */
+ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
+ ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+ os_file_t file;
+ DWORD share_mode = FILE_SHARE_READ;
+ DWORD create_flag;
+ DWORD attributes;
+ ibool retry;
+try_again:
+ ut_a(name);
+
+ if (create_mode == OS_FILE_OPEN_RAW) {
+ create_flag = OPEN_EXISTING;
+ share_mode = FILE_SHARE_WRITE;
+ } else if (create_mode == OS_FILE_OPEN
+ || create_mode == OS_FILE_OPEN_RETRY) {
+ create_flag = OPEN_EXISTING;
+ } else if (create_mode == OS_FILE_CREATE) {
+ create_flag = CREATE_NEW;
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+ create_flag = CREATE_ALWAYS;
+ } else {
+ create_flag = 0;
+ ut_error;
+ }
+
+ if (purpose == OS_FILE_AIO) {
+ /* If specified, use asynchronous (overlapped) io and no
+ buffering of writes in the OS */
+ attributes = 0;
+#ifdef WIN_ASYNC_IO
+ if (os_aio_use_native_aio) {
+ attributes = attributes | FILE_FLAG_OVERLAPPED;
+ }
+#endif
+#ifdef UNIV_NON_BUFFERED_IO
+# ifndef UNIV_HOTBACKUP
+ if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+ /* Do not use unbuffered i/o to log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+ } else if (srv_win_file_flush_method
+ == SRV_WIN_IO_UNBUFFERED) {
+ attributes = attributes | FILE_FLAG_NO_BUFFERING;
+ }
+# else /* !UNIV_HOTBACKUP */
+ attributes = attributes | FILE_FLAG_NO_BUFFERING;
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_NON_BUFFERED_IO */
+ } else if (purpose == OS_FILE_NORMAL) {
+ attributes = 0;
+#ifdef UNIV_NON_BUFFERED_IO
+# ifndef UNIV_HOTBACKUP
+ if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+ /* Do not use unbuffered i/o to log files because
+ value 2 denotes that we do not flush the log at every
+ commit, but only once per second */
+ } else if (srv_win_file_flush_method
+ == SRV_WIN_IO_UNBUFFERED) {
+ attributes = attributes | FILE_FLAG_NO_BUFFERING;
+ }
+# else /* !UNIV_HOTBACKUP */
+ attributes = attributes | FILE_FLAG_NO_BUFFERING;
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_NON_BUFFERED_IO */
+ } else {
+ attributes = 0;
+ ut_error;
+ }
+
+ file = CreateFile((LPCTSTR) name,
+ GENERIC_READ | GENERIC_WRITE, /* read and write
+ access */
+ share_mode, /* File can be read also by other
+ processes; we must give the read
+ permission because of ibbackup. We do
+ not give the write permission to
+ others because if one would succeed to
+ start 2 instances of mysqld on the
+ SAME files, that could cause severe
+ database corruption! When opening
+ raw disk partitions, Microsoft manuals
+ say that we must give also the write
+ permission. */
+ NULL, /* default security attributes */
+ create_flag,
+ attributes,
+ NULL); /*!< no template file */
+
+ if (file == INVALID_HANDLE_VALUE) {
+ *success = FALSE;
+
+ /* When srv_file_per_table is on, file creation failure may not
+ be critical to the whole instance. Do not crash the server in
+ case of unknown errors.
+ Please note "srv_file_per_table" is a global variable with
+ no explicit synchronization protection. It could be
+ changed during this execution path. It might not have the
+ same value as the one when building the table definition */
+ if (srv_file_per_table) {
+ retry = os_file_handle_error_no_exit(name,
+ create_mode == OS_FILE_CREATE ?
+ "create" : "open");
+ } else {
+ retry = os_file_handle_error(name,
+ create_mode == OS_FILE_CREATE ?
+ "create" : "open");
+ }
+
+ if (retry) {
+ goto try_again;
+ }
+ } else {
+ *success = TRUE;
+ }
+
+ return(file);
+#else /* __WIN__ */
+ os_file_t file;
+ int create_flag;
+ ibool retry;
+ const char* mode_str = NULL;
+ const char* type_str = NULL;
+ const char* purpose_str = NULL;
+
+try_again:
+ ut_a(name);
+
+ if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
+ || create_mode == OS_FILE_OPEN_RETRY) {
+ mode_str = "OPEN";
+ create_flag = O_RDWR;
+ } else if (create_mode == OS_FILE_CREATE) {
+ mode_str = "CREATE";
+ create_flag = O_RDWR | O_CREAT | O_EXCL;
+ } else if (create_mode == OS_FILE_OVERWRITE) {
+ mode_str = "OVERWRITE";
+ create_flag = O_RDWR | O_CREAT | O_TRUNC;
+ } else {
+ create_flag = 0;
+ ut_error;
+ }
+
+ if (type == OS_LOG_FILE) {
+ type_str = "LOG";
+ } else if (type == OS_DATA_FILE) {
+ type_str = "DATA";
+ } else {
+ ut_error;
+ }
+
+ if (purpose == OS_FILE_AIO) {
+ purpose_str = "AIO";
+ } else if (purpose == OS_FILE_NORMAL) {
+ purpose_str = "NORMAL";
+ } else {
+ ut_error;
+ }
+
+#if 0
+ fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
+ name, mode_str, type_str, purpose_str);
+#endif
+#ifdef O_SYNC
+ /* We let O_SYNC only affect log files; note that we map O_DSYNC to
+ O_SYNC because the datasync options seemed to corrupt files in 2001
+ in both Linux and Solaris */
+ if (type == OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+
+# if 0
+ fprintf(stderr, "Using O_SYNC for file %s\n", name);
+# endif
+
+ create_flag = create_flag | O_SYNC;
+ }
+#endif /* O_SYNC */
+
+ file = open(name, create_flag, os_innodb_umask);
+
+ if (file == -1) {
+ *success = FALSE;
+
+ /* When srv_file_per_table is on, file creation failure may not
+ be critical to the whole instance. Do not crash the server in
+ case of unknown errors.
+ Please note "srv_file_per_table" is a global variable with
+ no explicit synchronization protection. It could be
+ changed during this execution path. It might not have the
+ same value as the one when building the table definition */
+ if (srv_file_per_table) {
+ retry = os_file_handle_error_no_exit(name,
+ create_mode == OS_FILE_CREATE ?
+ "create" : "open");
+ } else {
+ retry = os_file_handle_error(name,
+ create_mode == OS_FILE_CREATE ?
+ "create" : "open");
+ }
+
+ if (retry) {
+ goto try_again;
+ } else {
+ return(file /* -1 */);
+ }
+ }
+ /* else */
+
+ *success = TRUE;
+
+ /* We disable OS caching (O_DIRECT) only on data files */
+ if (type != OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+
+ /* ALL_O_DIRECT: O_DIRECT also for transaction log file */
+ if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+ os_file_set_nocache(file, name, mode_str);
+ }
+
+#ifdef USE_FILE_LOCK
+ if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
+
+ if (create_mode == OS_FILE_OPEN_RETRY) {
+ int i;
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Retrying to lock"
+ " the first data file\n",
+ stderr);
+ for (i = 0; i < 100; i++) {
+ os_thread_sleep(1000000);
+ if (!os_file_lock(file, name)) {
+ *success = TRUE;
+ return(file);
+ }
+ }
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Unable to open the first data file\n",
+ stderr);
+ }
+
+ *success = FALSE;
+ close(file);
+ file = -1;
+ }
+#endif /* USE_FILE_LOCK */
+
+ return(file);
+#endif /* __WIN__ */
+}
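+
+/* A usage sketch (hypothetical caller, assuming `size_low` and `size_high`
+hold the desired file size split into 32-bit halves): create a data file
+with normal, buffered I/O and extend it with os_file_set_size(), which is
+defined later in this file:
+
+	ibool		success;
+	os_file_t	file;
+
+	file = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL,
+			      OS_DATA_FILE, &success);
+
+	if (success) {
+		success = os_file_set_size(name, file, size_low, size_high);
+	}
+*/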
+
+/***********************************************************************//**
+Deletes a file if it exists. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+ const char* name) /*!< in: file path as a null-terminated string */
+{
+#ifdef __WIN__
+ BOOL ret;
+ ulint count = 0;
+loop:
+ /* In Windows, deleting an .ibd file may fail if ibbackup is copying
+ it */
+
+ ret = DeleteFile((LPCTSTR)name);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+		/* the file does not exist; this is not an error */
+
+ return(TRUE);
+ }
+
+ count++;
+
+ if (count > 100 && 0 == (count % 10)) {
+ fprintf(stderr,
+ "InnoDB: Warning: cannot delete file %s\n"
+ "InnoDB: Are you running ibbackup"
+ " to back up the file?\n", name);
+
+ os_file_get_last_error(TRUE); /* print error information */
+ }
+
+ os_thread_sleep(1000000); /* sleep for a second */
+
+ if (count > 2000) {
+
+ return(FALSE);
+ }
+
+ goto loop;
+#else
+ int ret;
+
+ ret = unlink(name);
+
+ if (ret != 0 && errno != ENOENT) {
+ os_file_handle_error_no_exit(name, "delete");
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif
+}
+
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete(
+/*===========*/
+ const char* name) /*!< in: file path as a null-terminated string */
+{
+#ifdef __WIN__
+ BOOL ret;
+ ulint count = 0;
+loop:
+ /* In Windows, deleting an .ibd file may fail if ibbackup is copying
+ it */
+
+ ret = DeleteFile((LPCTSTR)name);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+ /* If the file does not exist, we classify this as a 'mild'
+ error and return */
+
+ return(FALSE);
+ }
+
+ count++;
+
+ if (count > 100 && 0 == (count % 10)) {
+ fprintf(stderr,
+ "InnoDB: Warning: cannot delete file %s\n"
+ "InnoDB: Are you running ibbackup"
+ " to back up the file?\n", name);
+
+ os_file_get_last_error(TRUE); /* print error information */
+ }
+
+ os_thread_sleep(1000000); /* sleep for a second */
+
+ if (count > 2000) {
+
+ return(FALSE);
+ }
+
+ goto loop;
+#else
+ int ret;
+
+ ret = unlink(name);
+
+ if (ret != 0) {
+ os_file_handle_error_no_exit(name, "delete");
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif
+}
+
+/***********************************************************************//**
+Renames a file (can also move it to another directory). It is safest to
+close the file before calling this function.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename(
+/*===========*/
+ const char* oldpath,/*!< in: old file path as a null-terminated
+ string */
+ const char* newpath)/*!< in: new file path */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ os_file_handle_error_no_exit(oldpath, "rename");
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = rename(oldpath, newpath);
+
+ if (ret != 0) {
+ os_file_handle_error_no_exit(oldpath, "rename");
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif
+}
+
+/***********************************************************************//**
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close(
+/*==========*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ut_a(file);
+
+ ret = CloseHandle(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ os_file_handle_error(NULL, "close");
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = close(file);
+
+ if (ret == -1) {
+ os_file_handle_error(NULL, "close");
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif
+}
+
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ut_a(file);
+
+ ret = CloseHandle(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+#else
+ int ret;
+
+ ret = close(file);
+
+ if (ret == -1) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+#endif
+}
+#endif /* UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Gets a file size.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_get_size(
+/*=============*/
+ os_file_t file, /*!< in: handle to a file */
+ ulint* size, /*!< out: least significant 32 bits of file
+ size */
+ ulint* size_high)/*!< out: most significant 32 bits of size */
+{
+#ifdef __WIN__
+ DWORD high;
+ DWORD low;
+
+ low = GetFileSize(file, &high);
+
+ if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
+ return(FALSE);
+ }
+
+ *size = low;
+ *size_high = high;
+
+ return(TRUE);
+#else
+ off_t offs;
+
+ offs = lseek(file, 0, SEEK_END);
+
+ if (offs == ((off_t)-1)) {
+
+ return(FALSE);
+ }
+
+ if (sizeof(off_t) > 4) {
+ *size = (ulint)(offs & 0xFFFFFFFFUL);
+ *size_high = (ulint)(offs >> 32);
+ } else {
+ *size = (ulint) offs;
+ *size_high = 0;
+ }
+
+ return(TRUE);
+#endif
+}
+
+/***********************************************************************//**
+Gets file size as a 64-bit integer ib_int64_t.
+@return size in bytes, -1 if error */
+UNIV_INTERN
+ib_int64_t
+os_file_get_size_as_iblonglong(
+/*===========================*/
+ os_file_t file) /*!< in: handle to a file */
+{
+ ulint size;
+ ulint size_high;
+ ibool success;
+
+ success = os_file_get_size(file, &size, &size_high);
+
+ if (!success) {
+
+ return(-1);
+ }
+
+ return((((ib_int64_t)size_high) << 32) + (ib_int64_t)size);
+}
+
+/***********************************************************************//**
+Writes zeros to a newly created file, extending it to the specified size.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ ulint size, /*!< in: least significant 32 bits of file
+ size */
+ ulint size_high)/*!< in: most significant 32 bits of size */
+{
+ ib_int64_t current_size;
+ ib_int64_t desired_size;
+ ibool ret;
+ byte* buf;
+ byte* buf2;
+ ulint buf_size;
+
+ ut_a(size == (size & 0xFFFFFFFF));
+
+ current_size = 0;
+ desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
+
+ /* Write up to 1 megabyte at a time. */
+ buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
+ * UNIV_PAGE_SIZE;
+ buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
+
+ /* Align the buffer for possible raw i/o */
+ buf = ut_align(buf2, UNIV_PAGE_SIZE);
+
+ /* Write buffer full of zeros */
+ memset(buf, 0, buf_size);
+
+ if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+
+ fprintf(stderr, "InnoDB: Progress in MB:");
+ }
+
+ while (current_size < desired_size) {
+ ulint n_bytes;
+
+ if (desired_size - current_size < (ib_int64_t) buf_size) {
+ n_bytes = (ulint) (desired_size - current_size);
+ } else {
+ n_bytes = buf_size;
+ }
+
+ ret = os_file_write(name, file, buf,
+ (ulint)(current_size & 0xFFFFFFFF),
+ (ulint)(current_size >> 32),
+ n_bytes);
+ if (!ret) {
+ ut_free(buf2);
+ goto error_handling;
+ }
+
+		/* Print progress for each 100 MB written */
+ if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
+ != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
+
+ fprintf(stderr, " %lu00",
+ (ulong) ((current_size + n_bytes)
+ / (ib_int64_t)(100 * 1024 * 1024)));
+ }
+
+ current_size += n_bytes;
+ }
+
+ if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+
+ fprintf(stderr, "\n");
+ }
+
+ ut_free(buf2);
+
+ ret = os_file_flush(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+error_handling:
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Truncates a file at its current position.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+ FILE* file) /*!< in: file to be truncated */
+{
+#ifdef __WIN__
+ HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
+ return(SetEndOfFile(h));
+#else /* __WIN__ */
+ return(!ftruncate(fileno(file), ftell(file)));
+#endif /* __WIN__ */
+}
+
+#ifndef __WIN__
+/***********************************************************************//**
+Wrapper to fsync(2) that retries the call on some errors.
+Returns the value 0 if successful; otherwise the value -1 is returned and
+the global variable errno is set to indicate the error.
+@return 0 if success, -1 otherwise */
+
+static
+int
+os_file_fsync(
+/*==========*/
+ os_file_t file) /*!< in: handle to a file */
+{
+ int ret;
+ int failures;
+ ibool retry;
+
+ failures = 0;
+
+ do {
+ ret = fsync(file);
+
+ os_n_fsyncs++;
+
+ if (ret == -1 && errno == ENOLCK) {
+
+ if (failures % 100 == 0) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: fsync(): "
+ "No locks available; retrying\n");
+ }
+
+ os_thread_sleep(200000 /* 0.2 sec */);
+
+ failures++;
+
+ retry = TRUE;
+ } else {
+
+ retry = FALSE;
+ }
+ } while (retry);
+
+ return(ret);
+}
+#endif /* !__WIN__ */
+
+/***********************************************************************//**
+Flushes the write buffers of a given file to the disk.
+@return TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush(
+/*==========*/
+ os_file_t file) /*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+ BOOL ret;
+
+ ut_a(file);
+
+ os_n_fsyncs++;
+
+ ret = FlushFileBuffers(file);
+
+ if (ret) {
+ return(TRUE);
+ }
+
+ /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+ actually a raw device, we choose to ignore that error if we are using
+ raw disks */
+
+ if (srv_start_raw_disk_in_use && GetLastError()
+ == ERROR_INVALID_FUNCTION) {
+ return(TRUE);
+ }
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(FALSE);
+#else
+ int ret;
+
+#if defined(HAVE_DARWIN_THREADS)
+# ifndef F_FULLFSYNC
+ /* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
+# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
+# elif F_FULLFSYNC != 51
+# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
+# endif
+	/* Apple has disabled fsync() for internal disk drives in OS X, which
+	caused corruption for a user who tested a power outage. On OS X we
+	therefore use a nonstandard flush method recommended by an Apple
+	engineer. */
+
+ if (!srv_have_fullfsync) {
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+ ret = os_file_fsync(file);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+ ret = os_file_fsync(file);
+ }
+ }
+#else
+ ret = os_file_fsync(file);
+#endif
+
+ if (ret == 0) {
+ return(TRUE);
+ }
+
+ /* Since Linux returns EINVAL if the 'file' is actually a raw device,
+ we choose to ignore that error if we are using raw disks */
+
+ if (srv_start_raw_disk_in_use && errno == EINVAL) {
+
+ return(TRUE);
+ }
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: the OS said file flush did not succeed\n");
+
+ os_file_handle_error(NULL, "flush");
+
+ /* It is a fatal error if a file flush does not succeed, because then
+ the database can get corrupt on disk */
+ ut_error;
+
+ return(FALSE);
+#endif
+}
+
+#ifndef __WIN__
+/*******************************************************************//**
+Does a synchronous read operation in Posix.
+@return number of bytes read, -1 if error */
+#define os_file_pread(file, buf, n, offset, offset_high) \
+	_os_file_pread(file, buf, n, offset, offset_high, NULL)
+
+static
+ssize_t
+_os_file_pread(
+/*==========*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint n, /*!< in: number of bytes to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset from where to read */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ trx_t* trx)
+{
+ off_t offs;
+#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+ ssize_t n_bytes;
+#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
+ ulint sec;
+ ulint ms;
+ ib_uint64_t start_time;
+ ib_uint64_t finish_time;
+
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+ /* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit offset */
+
+ if (sizeof(off_t) > 4) {
+ offs = (off_t)offset + (((off_t)offset_high) << 32);
+
+ } else {
+ offs = (off_t)offset;
+
+ if (offset_high > 0) {
+ fprintf(stderr,
+ "InnoDB: Error: file read at offset > 4 GB\n");
+ }
+ }
+
+ os_n_file_reads++;
+
+ if (innobase_get_slow_log() && trx && trx->take_stats)
+ {
+ trx->io_reads++;
+ trx->io_read += n;
+ ut_usectime(&sec, &ms);
+ start_time = (ib_uint64_t)sec * 1000000 + ms;
+ } else {
+ start_time = 0;
+ }
+#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_preads++;
+ os_n_pending_reads++;
+ os_mutex_exit(os_file_count_mutex);
+
+ n_bytes = pread(file, buf, (ssize_t)n, offs);
+
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_preads--;
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
+ if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+ {
+ ut_usectime(&sec, &ms);
+ finish_time = (ib_uint64_t)sec * 1000000 + ms;
+ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+ }
+
+ return(n_bytes);
+#else
+ {
+ off_t ret_offset;
+ ssize_t ret;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads++;
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret_offset = lseek(file, offs, SEEK_SET);
+
+ if (ret_offset < 0) {
+ ret = -1;
+ } else {
+ ret = read(file, buf, (ssize_t)n);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
+ if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+ {
+ ut_usectime(&sec, &ms);
+ finish_time = (ib_uint64_t)sec * 1000000 + ms;
+ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+ }
+
+ return(ret);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Does a synchronous write operation in Posix.
+@return number of bytes written, -1 if error */
+static
+ssize_t
+os_file_pwrite(
+/*===========*/
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from where to write */
+ ulint n, /*!< in: number of bytes to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high) /*!< in: most significant 32 bits of
+ offset */
+{
+ ssize_t ret;
+ off_t offs;
+
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+ /* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit offset */
+
+ if (sizeof(off_t) > 4) {
+ offs = (off_t)offset + (((off_t)offset_high) << 32);
+ } else {
+ offs = (off_t)offset;
+
+ if (offset_high > 0) {
+ fprintf(stderr,
+ "InnoDB: Error: file write"
+ " at offset > 4 GB\n");
+ }
+ }
+
+ os_n_file_writes++;
+
+#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_pwrites++;
+ os_n_pending_writes++;
+ os_mutex_exit(os_file_count_mutex);
+
+ ret = pwrite(file, buf, (ssize_t)n, offs);
+
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_pwrites--;
+ os_n_pending_writes--;
+ os_mutex_exit(os_file_count_mutex);
+
+# ifdef UNIV_DO_FLUSH
+ if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && !os_do_not_call_flush_at_each_write) {
+
+ /* Always do fsync to reduce the probability that when
+ the OS crashes, a database page is only partially
+ physically written to disk. */
+
+ ut_a(TRUE == os_file_flush(file));
+ }
+# endif /* UNIV_DO_FLUSH */
+
+ return(ret);
+#else
+ {
+ off_t ret_offset;
+# ifndef UNIV_HOTBACKUP
+ ulint i;
+# endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes++;
+ os_mutex_exit(os_file_count_mutex);
+
+# ifndef UNIV_HOTBACKUP
+ /* Protect the seek / write operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+ ret_offset = lseek(file, offs, SEEK_SET);
+
+ if (ret_offset < 0) {
+ ret = -1;
+
+ goto func_exit;
+ }
+
+ ret = write(file, buf, (ssize_t)n);
+
+# ifdef UNIV_DO_FLUSH
+ if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && !os_do_not_call_flush_at_each_write) {
+
+ /* Always do fsync to reduce the probability that when
+ the OS crashes, a database page is only partially
+ physically written to disk. */
+
+ ut_a(TRUE == os_file_flush(file));
+ }
+# endif /* UNIV_DO_FLUSH */
+
+func_exit:
+# ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes--;
+ os_mutex_exit(os_file_count_mutex);
+
+ return(ret);
+ }
+#endif
+}
+#endif
+
+/*******************************************************************//**
+Requests a synchronous positioned read operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+_os_file_read(
+/*=========*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read */
+ trx_t* trx)
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = (DWORD) offset;
+ high = (DWORD) offset_high;
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads++;
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
+ if (ret && len == n) {
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = _os_file_pread(file, buf, n, offset, offset_high, trx);
+
+ if ((ulint)ret == n) {
+
+ return(TRUE);
+ }
+
+ fprintf(stderr,
+ "InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
+ "InnoDB: Was only able to read %ld.\n",
+ (ulong)n, (ulong)offset_high,
+ (ulong)offset, (long)ret);
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+ retry = os_file_handle_error(NULL, "read");
+
+ if (retry) {
+ goto try_again;
+ }
+
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot read from file."
+ " OS error number %lu.\n",
+#ifdef __WIN__
+ (ulong) GetLastError()
+#else
+ (ulong) errno
+#endif
+ );
+ fflush(stderr);
+
+ ut_error;
+
+ return(FALSE);
+}
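+
+/* A usage sketch (hypothetical caller) showing how a 64-bit byte offset is
+split into the (offset, offset_high) pair used throughout this file,
+assuming `offs` is an ib_int64_t and `page` points to a UNIV_PAGE_SIZE
+buffer:
+
+	success = _os_file_read(file, page,
+				(ulint)(offs & 0xFFFFFFFF),
+				(ulint)(offs >> 32),
+				UNIV_PAGE_SIZE, NULL);
+*/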
+
+/*******************************************************************//**
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint n) /*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ibool retry;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+ os_n_file_reads++;
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+
+ low = (DWORD) offset;
+ high = (DWORD) offset_high;
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads++;
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / read operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
+ goto error_handling;
+ }
+
+ ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
+ if (ret && len == n) {
+ return(TRUE);
+ }
+#else /* __WIN__ */
+ ibool retry;
+ ssize_t ret;
+
+ os_bytes_read_since_printout += n;
+
+try_again:
+ ret = os_file_pread(file, buf, n, offset, offset_high);
+
+ if ((ulint)ret == n) {
+
+ return(TRUE);
+ }
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+ retry = os_file_handle_error_no_exit(NULL, "read");
+
+ if (retry) {
+ goto try_again;
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+ FILE* file, /*!< in: file to read from */
+ char* str, /*!< in: buffer where to read */
+ ulint size) /*!< in: size of buffer */
+{
+ size_t flen;
+
+ if (size == 0) {
+ return;
+ }
+
+ rewind(file);
+ flen = fread(str, 1, size - 1, file);
+ str[flen] = '\0';
+}
+
+/*******************************************************************//**
+Requests a synchronous write operation.
+@return TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write(
+/*==========*/
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ const void* buf, /*!< in: buffer from which to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to write */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint n) /*!< in: number of bytes to write */
+{
+#ifdef __WIN__
+ BOOL ret;
+ DWORD len;
+ DWORD ret2;
+ DWORD low;
+ DWORD high;
+ ulint n_retries = 0;
+ ulint err;
+#ifndef UNIV_HOTBACKUP
+ ulint i;
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_a((offset & 0xFFFFFFFF) == offset);
+
+ os_n_file_writes++;
+
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+retry:
+ low = (DWORD) offset;
+ high = (DWORD) offset_high;
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes++;
+ os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+ /* Protect the seek / write operation with a mutex */
+ i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+ os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+ if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes--;
+ os_mutex_exit(os_file_count_mutex);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: File pointer positioning to"
+ " file %s failed at\n"
+ "InnoDB: offset %lu %lu. Operating system"
+ " error number %lu.\n"
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n",
+ name, (ulong) offset_high, (ulong) offset,
+ (ulong) GetLastError());
+
+ return(FALSE);
+ }
+
+ ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
+
+ /* Always do fsync to reduce the probability that when the OS crashes,
+ a database page is only partially physically written to disk. */
+
+# ifdef UNIV_DO_FLUSH
+ if (!os_do_not_call_flush_at_each_write) {
+ ut_a(TRUE == os_file_flush(file));
+ }
+# endif /* UNIV_DO_FLUSH */
+
+#ifndef UNIV_HOTBACKUP
+ os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+ os_mutex_enter(os_file_count_mutex);
+ os_n_pending_writes--;
+ os_mutex_exit(os_file_count_mutex);
+
+ if (ret && len == n) {
+
+ return(TRUE);
+ }
+
+	/* If some background file system backup tool is running, then, at
+	least in Windows 2000, we may get a specific error here. Let us
+	retry the operation 100 times, with 1-second waits. */
+
+ if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
+
+ os_thread_sleep(1000000);
+
+ n_retries++;
+
+ goto retry;
+ }
+
+ if (!os_has_said_disk_full) {
+
+ err = (ulint)GetLastError();
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: Write to file %s failed"
+ " at offset %lu %lu.\n"
+ "InnoDB: %lu bytes should have been written,"
+ " only %lu were written.\n"
+ "InnoDB: Operating system error number %lu.\n"
+ "InnoDB: Check that your OS and file system"
+ " support files of this size.\n"
+ "InnoDB: Check also that the disk is not full"
+ " or a disk quota exceeded.\n",
+ name, (ulong) offset_high, (ulong) offset,
+ (ulong) n, (ulong) len, (ulong) err);
+
+ if (strerror((int)err) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %lu means '%s'.\n",
+ (ulong) err, strerror((int)err));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n");
+
+ os_has_said_disk_full = TRUE;
+ }
+
+ return(FALSE);
+#else
+ ssize_t ret;
+
+ ret = os_file_pwrite(file, buf, n, offset, offset_high);
+
+ if ((ulint)ret == n) {
+
+ return(TRUE);
+ }
+
+ if (!os_has_said_disk_full) {
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: Write to file %s failed"
+ " at offset %lu %lu.\n"
+ "InnoDB: %lu bytes should have been written,"
+ " only %ld were written.\n"
+ "InnoDB: Operating system error number %lu.\n"
+ "InnoDB: Check that your OS and file system"
+ " support files of this size.\n"
+ "InnoDB: Check also that the disk is not full"
+ " or a disk quota exceeded.\n",
+ name, offset_high, offset, n, (long int)ret,
+ (ulint)errno);
+ if (strerror(errno) != NULL) {
+ fprintf(stderr,
+ "InnoDB: Error number %lu means '%s'.\n",
+ (ulint)errno, strerror(errno));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Some operating system error numbers"
+ " are described at\n"
+ "InnoDB: "
+ REFMAN "operating-system-error-codes.html\n");
+
+ os_has_said_disk_full = TRUE;
+ }
+
+ return(FALSE);
+#endif
+}
+
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+ const char* path, /*!< in: pathname of the file */
+ ibool* exists, /*!< out: TRUE if file exists */
+ os_file_type_t* type) /*!< out: type of the file (if it exists) */
+{
+#ifdef __WIN__
+ int ret;
+ struct _stat statinfo;
+
+ ret = _stat(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+ *exists = FALSE;
+ return(TRUE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat");
+
+ return(FALSE);
+ }
+
+ if (_S_IFDIR & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ *exists = TRUE;
+
+ return(TRUE);
+#else
+ int ret;
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+ *exists = FALSE;
+ return(TRUE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat");
+
+ return(FALSE);
+ }
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ *type = OS_FILE_TYPE_FILE;
+ } else {
+ *type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ *exists = TRUE;
+
+ return(TRUE);
+#endif
+}
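+
+/* Editor's note: the block below is an illustrative usage sketch added in
+editing and is not part of the original patch; the path literal is
+hypothetical. It shows how a caller can use os_file_status() to test a path
+before acting on it. */
+#if 0
+static void
+example_check_path(void)
+{
+	ibool		exists;
+	os_file_type_t	type;
+
+	if (os_file_status("/var/lib/mysql/ibdata1", &exists, &type)
+	    && exists && type == OS_FILE_TYPE_FILE) {
+
+		fprintf(stderr, "InnoDB: example: regular file found\n");
+	}
+}
+#endif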
+
+/*******************************************************************//**
+Returns information about the specified file.
+@return TRUE if stat information found */
+UNIV_INTERN
+ibool
+os_file_get_status(
+/*===============*/
+ const char* path, /*!< in: pathname of the file */
+ os_file_stat_t* stat_info) /*!< information of a file in a
+ directory */
+{
+#ifdef __WIN__
+ int ret;
+ struct _stat statinfo;
+
+ ret = _stat(path, &statinfo);
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(FALSE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat");
+
+ return(FALSE);
+ }
+ if (_S_IFDIR & statinfo.st_mode) {
+ stat_info->type = OS_FILE_TYPE_DIR;
+ } else if (_S_IFREG & statinfo.st_mode) {
+ stat_info->type = OS_FILE_TYPE_FILE;
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ stat_info->ctime = statinfo.st_ctime;
+ stat_info->atime = statinfo.st_atime;
+ stat_info->mtime = statinfo.st_mtime;
+ stat_info->size = statinfo.st_size;
+
+ return(TRUE);
+#else
+ int ret;
+ struct stat statinfo;
+
+ ret = stat(path, &statinfo);
+
+ if (ret && (errno == ENOENT || errno == ENOTDIR)) {
+ /* file does not exist */
+
+ return(FALSE);
+ } else if (ret) {
+ /* file exists, but stat call failed */
+
+ os_file_handle_error_no_exit(path, "stat");
+
+ return(FALSE);
+ }
+
+ if (S_ISDIR(statinfo.st_mode)) {
+ stat_info->type = OS_FILE_TYPE_DIR;
+ } else if (S_ISLNK(statinfo.st_mode)) {
+ stat_info->type = OS_FILE_TYPE_LINK;
+ } else if (S_ISREG(statinfo.st_mode)) {
+ stat_info->type = OS_FILE_TYPE_FILE;
+ } else {
+ stat_info->type = OS_FILE_TYPE_UNKNOWN;
+ }
+
+ stat_info->ctime = statinfo.st_ctime;
+ stat_info->atime = statinfo.st_atime;
+ stat_info->mtime = statinfo.st_mtime;
+ stat_info->size = statinfo.st_size;
+
+ return(TRUE);
+#endif
+}
+
+/* path name separator character */
+#ifdef __WIN__
+# define OS_FILE_PATH_SEPARATOR '\\'
+#else
+# define OS_FILE_PATH_SEPARATOR '/'
+#endif
+
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string. In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'. Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from the heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+ path dirname basename
+ "/usr/lib" "/usr" "lib"
+ "/usr/" "/" "usr"
+ "usr" "." "usr"
+ "/" "/" "/"
+ "." "." "."
+ ".." "." ".."
+
+@return own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+ const char* path) /*!< in: pathname */
+{
+ /* Find the offset of the last slash */
+ const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR);
+ if (!last_slash) {
+ /* No slash in the path, return "." */
+
+ return(mem_strdup("."));
+ }
+
+ /* Ok, there is a slash */
+
+ if (last_slash == path) {
+ /* last slash is the first char of the path */
+
+ return(mem_strdup("/"));
+ }
+
+ /* Non-trivial directory component */
+
+ return(mem_strdupl(path, last_slash - path));
+}
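+
+/* Editor's note: illustrative sketch added in editing, not part of the
+original patch. It demonstrates the contract documented above: the returned
+directory component is heap-allocated and must be freed by the caller. */
+#if 0
+static void
+example_os_file_dirname(void)
+{
+	char*	dir = os_file_dirname("/usr/lib");	/* yields "/usr" */
+
+	fprintf(stderr, "InnoDB: example: dirname is %s\n", dir);
+
+	mem_free(dir);		/* the caller frees the copy */
+}
+#endif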
+
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return TRUE if call succeeded, FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+ const char* path) /*!< in: path name */
+{
+ char* subdir;
+ ibool success, subdir_exists;
+ os_file_type_t type;
+
+ subdir = os_file_dirname(path);
+ if (strlen(subdir) == 1
+ && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) {
+ /* subdir is root or cwd, nothing to do */
+ mem_free(subdir);
+
+ return(TRUE);
+ }
+
+ /* Test if subdir exists */
+ success = os_file_status(subdir, &subdir_exists, &type);
+ if (success && !subdir_exists) {
+ /* subdir does not exist, create it */
+ success = os_file_create_subdirs_if_needed(subdir);
+ if (!success) {
+ mem_free(subdir);
+
+ return(FALSE);
+ }
+ success = os_file_create_directory(subdir, FALSE);
+ }
+
+ mem_free(subdir);
+
+ return(success);
+}
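+
+/* Editor's note: illustrative sketch added in editing, not part of the
+original patch; the path literal is hypothetical. For a datafile path such as
+"dir1/dir2/t1.ibd", the recursive helper above creates "dir1" and then
+"dir1/dir2" before the file itself is created elsewhere. */
+#if 0
+static ibool
+example_prepare_datafile_dirs(void)
+{
+	return(os_file_create_subdirs_if_needed("dir1/dir2/t1.ibd"));
+}
+#endif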
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Returns a pointer to the nth slot in the aio array.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_get_nth_slot(
+/*======================*/
+ os_aio_array_t* array, /*!< in: aio array */
+ ulint index) /*!< in: index of the slot */
+{
+ ut_a(index < array->n_slots);
+
+ return((array->slots) + index);
+}
+
+/************************************************************************//**
+Creates an aio wait array.
+@return own: aio array */
+static
+os_aio_array_t*
+os_aio_array_create(
+/*================*/
+ ulint n, /*!< in: maximum number of pending aio operations
+ allowed; n must be divisible by n_segments */
+ ulint n_segments) /*!< in: number of segments in the aio array */
+{
+ os_aio_array_t* array;
+ ulint i;
+ os_aio_slot_t* slot;
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED* over;
+#endif
+ ut_a(n > 0);
+ ut_a(n_segments > 0);
+
+ array = ut_malloc(sizeof(os_aio_array_t));
+
+ array->mutex = os_mutex_create(NULL);
+ array->not_full = os_event_create(NULL);
+ array->is_empty = os_event_create(NULL);
+
+ os_event_set(array->is_empty);
+
+ array->n_slots = n;
+ array->n_segments = n_segments;
+ array->n_reserved = 0;
+ array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
+#ifdef __WIN__
+ array->native_events = ut_malloc(n * sizeof(os_native_event_t));
+#endif
+ for (i = 0; i < n; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ slot->pos = i;
+ slot->reserved = FALSE;
+#ifdef WIN_ASYNC_IO
+ slot->event = os_event_create(NULL);
+
+ over = &(slot->control);
+
+ over->hEvent = slot->event->handle;
+
+ *((array->native_events) + i) = over->hEvent;
+#endif
+ }
+
+ return(array);
+}
+
+/************************************************************************//**
+Frees an aio wait array. */
+static
+void
+os_aio_array_free(
+/*==============*/
+ os_aio_array_t* array) /*!< in, own: array to free */
+{
+#ifdef WIN_ASYNC_IO
+ ulint i;
+
+ for (i = 0; i < array->n_slots; i++) {
+ os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
+ os_event_free(slot->event);
+ }
+#endif /* WIN_ASYNC_IO */
+
+#ifdef __WIN__
+ ut_free(array->native_events);
+#endif /* __WIN__ */
+ os_mutex_free(array->mutex);
+ os_event_free(array->not_full);
+ os_event_free(array->is_empty);
+
+ ut_free(array->slots);
+ ut_free(array);
+}
+
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs segments,
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that. */
+UNIV_INTERN
+void
+os_aio_init(
+/*========*/
+	ulint	n_per_seg,	/*!< in: maximum number of pending aio
+				operations allowed per segment */
+	ulint	n_read_segs,	/*!< in: number of reader threads */
+	ulint	n_write_segs,	/*!< in: number of writer threads */
+	ulint	n_slots_sync)	/*!< in: number of slots in the sync aio
+				array */
+{
+ ulint i;
+ ulint n_segments = 2 + n_read_segs + n_write_segs;
+
+ ut_ad(n_segments >= 4);
+
+ os_io_init_simple();
+
+ for (i = 0; i < n_segments; i++) {
+ srv_set_io_thread_op_info(i, "not started yet");
+ os_aio_thread_buffer[i] = 0;
+ os_aio_thread_buffer_size[i] = 0;
+ }
+
+
+ /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+
+ os_aio_first_write_segment = os_aio_first_read_segment + n_read_segs;
+ os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+ srv_io_thread_function[0] = "insert buffer thread";
+
+ os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+
+ srv_io_thread_function[1] = "log thread";
+
+ os_aio_read_array = os_aio_array_create(n_per_seg,
+ n_read_segs);
+ for (i = 2; i < 2 + n_read_segs; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "read thread";
+ }
+
+ os_aio_write_array = os_aio_array_create(n_per_seg,
+ n_write_segs);
+ for (i = 2 + n_read_segs; i < n_segments; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "write thread";
+ }
+
+ os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+
+ os_aio_n_segments = n_segments;
+
+ os_aio_validate();
+
+ os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
+
+ for (i = 0; i < n_segments; i++) {
+ os_aio_segment_wait_events[i] = os_event_create(NULL);
+ }
+
+ os_last_printout = time(NULL);
+
+}
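+
+/* Editor's note: worked example added in editing, not part of the original
+patch. With n_read_segs == 4 and n_write_segs == 4 the call above creates
+n_segments == 2 + 4 + 4 == 10 i/o segments laid out as:
+
+	segment 0	insert buffer (ibuf) i/o thread
+	segment 1	log i/o thread
+	segments 2..5	read threads
+	segments 6..9	write threads
+
+The sync array gets no segment of its own; its slots are waited on directly
+by the posting threads. */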
+
+/***********************************************************************
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void)
+/*=============*/
+{
+ ulint i;
+
+ os_aio_array_free(os_aio_ibuf_array);
+ os_aio_ibuf_array = NULL;
+ os_aio_array_free(os_aio_log_array);
+ os_aio_log_array = NULL;
+ os_aio_array_free(os_aio_read_array);
+ os_aio_read_array = NULL;
+ os_aio_array_free(os_aio_write_array);
+ os_aio_write_array = NULL;
+ os_aio_array_free(os_aio_sync_array);
+ os_aio_sync_array = NULL;
+
+ for (i = 0; i < os_aio_n_segments; i++) {
+ os_event_free(os_aio_segment_wait_events[i]);
+ }
+
+ ut_free(os_aio_segment_wait_events);
+ os_aio_segment_wait_events = 0;
+ os_aio_n_segments = 0;
+}
+
+#ifdef WIN_ASYNC_IO
+/************************************************************************//**
+Wakes up all async i/o threads in the array in Windows async i/o at
+shutdown. */
+static
+void
+os_aio_array_wake_win_aio_at_shutdown(
+/*==================================*/
+ os_aio_array_t* array) /*!< in: aio array */
+{
+ ulint i;
+
+ for (i = 0; i < array->n_slots; i++) {
+
+ os_event_set((array->slots + i)->event);
+ }
+}
+#endif
+
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void)
+/*=====================================*/
+{
+ ulint i;
+
+#ifdef WIN_ASYNC_IO
+	/* This code wakes up all async i/o threads in Windows native aio */
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
+ os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+#endif
+	/* This loop wakes up all simulated i/o handler threads */
+
+ for (i = 0; i < os_aio_n_segments; i++) {
+
+ os_event_set(os_aio_segment_wait_events[i]);
+ }
+}
+
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void)
+/*=====================================*/
+{
+ os_event_wait(os_aio_write_array->is_empty);
+}
+
+/**********************************************************************//**
+Calculates segment number for a slot.
+@return segment number (which is the number used by, for example,
+i/o-handler threads) */
+static
+ulint
+os_aio_get_segment_no_from_slot(
+/*============================*/
+ os_aio_array_t* array, /*!< in: aio wait array */
+ os_aio_slot_t* slot) /*!< in: slot in this array */
+{
+ ulint segment;
+ ulint seg_len;
+
+ if (array == os_aio_ibuf_array) {
+ segment = 0;
+
+ } else if (array == os_aio_log_array) {
+ segment = 1;
+
+ } else if (array == os_aio_read_array) {
+ seg_len = os_aio_read_array->n_slots
+ / os_aio_read_array->n_segments;
+
+ segment = 2 + slot->pos / seg_len;
+ } else {
+ ut_a(array == os_aio_write_array);
+ seg_len = os_aio_write_array->n_slots
+ / os_aio_write_array->n_segments;
+
+ segment = os_aio_read_array->n_segments + 2
+ + slot->pos / seg_len;
+ }
+
+ return(segment);
+}
+
+/**********************************************************************//**
+Calculates local segment number and aio array from global segment number.
+@return local segment number within the aio array */
+static
+ulint
+os_aio_get_array_and_local_segment(
+/*===============================*/
+ os_aio_array_t** array, /*!< out: aio wait array */
+ ulint global_segment)/*!< in: global segment number */
+{
+ ulint segment;
+
+ ut_a(global_segment < os_aio_n_segments);
+
+ if (global_segment == 0) {
+ *array = os_aio_ibuf_array;
+ segment = 0;
+
+ } else if (global_segment == 1) {
+ *array = os_aio_log_array;
+ segment = 0;
+
+ } else if (global_segment < os_aio_read_array->n_segments + 2) {
+ *array = os_aio_read_array;
+
+ segment = global_segment - 2;
+ } else {
+ *array = os_aio_write_array;
+
+ segment = global_segment - (os_aio_read_array->n_segments + 2);
+ }
+
+ return(segment);
+}
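+
+/* Editor's note: worked example added in editing, not part of the original
+patch. With os_aio_read_array->n_segments == 4, global segment 7 falls into
+the final branch above and maps to local segment 7 - (4 + 2) == 1 of
+os_aio_write_array; os_aio_get_segment_no_from_slot() computes the inverse
+mapping. */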
+
+/*******************************************************************//**
+Requests a slot in the aio array. If no slot is available, waits until
+the not_full event becomes signaled.
+@return pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_reserve_slot(
+/*======================*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ os_aio_array_t* array, /*!< in: aio array */
+ fil_node_t* message1,/*!< in: message to be passed along with
+ the aio operation */
+ void* message2,/*!< in: message to be passed along with
+ the aio operation */
+ os_file_t file, /*!< in: file handle */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint len, /*!< in: length of the block to read or write */
+ trx_t* trx)
+{
+ os_aio_slot_t* slot;
+#ifdef WIN_ASYNC_IO
+ OVERLAPPED* control;
+#endif
+ ulint i;
+ ulint slots_per_seg;
+ ulint local_seg;
+
+	/* No need for a mutex: we only read constant fields here */
+ slots_per_seg = array->n_slots / array->n_segments;
+
+ /* We attempt to keep adjacent blocks in the same local
+ segment. This can help in merging IO requests when we are
+ doing simulated AIO */
+ local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
+ % array->n_segments;
+
+loop:
+ os_mutex_enter(array->mutex);
+
+ if (array->n_reserved == array->n_slots) {
+ os_mutex_exit(array->mutex);
+
+ if (!os_aio_use_native_aio) {
+ /* If the handler threads are suspended, wake them
+ so that we get more slots */
+
+ os_aio_simulated_wake_handler_threads();
+ }
+
+ os_event_wait(array->not_full);
+
+ goto loop;
+ }
+
+ /* First try to find a slot in the preferred local segment */
+ for (i = local_seg * slots_per_seg; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved == FALSE) {
+ goto found;
+ }
+ }
+
+ /* Fall back to a full scan. We are guaranteed to find a slot */
+ for (i = 0;; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved == FALSE) {
+ goto found;
+ }
+ }
+
+found:
+ ut_a(slot->reserved == FALSE);
+ array->n_reserved++;
+
+ if (array->n_reserved == 1) {
+ os_event_reset(array->is_empty);
+ }
+
+ if (array->n_reserved == array->n_slots) {
+ os_event_reset(array->not_full);
+ }
+
+ slot->reserved = TRUE;
+ slot->reservation_time = time(NULL);
+ slot->message1 = message1;
+ slot->message2 = message2;
+ slot->file = file;
+ slot->name = name;
+ slot->len = len;
+ slot->type = type;
+ slot->buf = buf;
+ slot->offset = offset;
+ slot->offset_high = offset_high;
+/*	slot->io_already_done = FALSE; */
+ slot->status = OS_AIO_NOT_ISSUED;
+
+#ifdef WIN_ASYNC_IO
+ control = &(slot->control);
+ control->Offset = (DWORD)offset;
+ control->OffsetHigh = (DWORD)offset_high;
+ os_event_reset(slot->event);
+#endif
+
+ os_mutex_exit(array->mutex);
+
+ return(slot);
+}
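+
+/* Editor's note: illustrative sketch added in editing, not part of the
+original patch. It restates the local-segment choice made above: with 16 KiB
+pages (UNIV_PAGE_SIZE_SHIFT == 14), offset >> (14 + 6) groups a file into
+1 MiB chunks of 64 pages, so requests against neighbouring pages tend to land
+in the same local segment and can later be merged by the simulated aio
+handler. */
+#if 0
+static ulint
+example_local_segment(ulint offset, ulint n_segments)
+{
+	return((offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % n_segments);
+}
+#endif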
+
+/*******************************************************************//**
+Frees a slot in the aio array. */
+static
+void
+os_aio_array_free_slot(
+/*===================*/
+ os_aio_array_t* array, /*!< in: aio array */
+ os_aio_slot_t* slot) /*!< in: pointer to slot */
+{
+ ut_ad(array);
+ ut_ad(slot);
+
+ os_mutex_enter(array->mutex);
+
+ ut_ad(slot->reserved);
+
+ slot->reserved = FALSE;
+ slot->status = OS_AIO_NOT_ISSUED;
+
+ array->n_reserved--;
+
+ if (array->n_reserved == array->n_slots - 1) {
+ os_event_set(array->not_full);
+ }
+
+ if (array->n_reserved == 0) {
+ os_event_set(array->is_empty);
+ }
+
+#ifdef WIN_ASYNC_IO
+ os_event_reset(slot->event);
+#endif
+ os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Wakes up a simulated aio i/o-handler thread if it has something to do. */
+static
+void
+os_aio_simulated_wake_handler_thread(
+/*=================================*/
+ ulint global_segment) /*!< in: the number of the segment in the aio
+ arrays */
+{
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint segment;
+ ulint n;
+ ulint i;
+
+ ut_ad(!os_aio_use_native_aio);
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+ n = array->n_slots;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ os_mutex_enter(array->mutex);
+
+ for (i = 0; i < n; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved &&
+ (slot->status == OS_AIO_NOT_ISSUED ||
+ slot->status == OS_AIO_DONE)) {
+ /* Found an i/o request */
+
+ break;
+ }
+ }
+
+ os_mutex_exit(array->mutex);
+
+ if (i < n) {
+ if (array == os_aio_ibuf_array) {
+ os_event_set(os_aio_segment_wait_events[0]);
+
+ } else if (array == os_aio_log_array) {
+ os_event_set(os_aio_segment_wait_events[1]);
+
+ } else if (array == os_aio_read_array) {
+ ulint x;
+ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++)
+ os_event_set(os_aio_segment_wait_events[x]);
+
+ } else if (array == os_aio_write_array) {
+ ulint x;
+ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++)
+ os_event_set(os_aio_segment_wait_events[x]);
+
+ } else {
+ ut_a(0);
+ }
+ }
+}
+
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void)
+/*=======================================*/
+{
+ if (os_aio_use_native_aio) {
+ /* We do not use simulated aio: do nothing */
+
+ return;
+ }
+
+ os_aio_recommend_sleep_for_read_threads = FALSE;
+
+ os_aio_simulated_wake_handler_thread(0);
+ os_aio_simulated_wake_handler_thread(1);
+ os_aio_simulated_wake_handler_thread(os_aio_first_read_segment);
+ os_aio_simulated_wake_handler_thread(os_aio_first_write_segment);
+}
+
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void)
+/*============================================*/
+{
+
+/* The idea of putting background IO threads to sleep is only for
+Windows when using simulated AIO. Windows XP seems to schedule
+background threads too eagerly to allow for coalescing during
+readahead requests. */
+#ifdef __WIN__
+ os_aio_array_t* array;
+ ulint g;
+
+ if (os_aio_use_native_aio) {
+ /* We do not use simulated aio: do nothing */
+
+ return;
+ }
+
+ os_aio_recommend_sleep_for_read_threads = TRUE;
+
+ for (g = 0; g < os_aio_n_segments; g++) {
+ os_aio_get_array_and_local_segment(&array, g);
+
+ if (array == os_aio_read_array) {
+
+ os_event_reset(os_aio_segment_wait_events[g]);
+ }
+ }
+#endif /* __WIN__ */
+}
+
+/*******************************************************************//**
+Requests an asynchronous i/o operation.
+@return TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio(
+/*===*/
+ ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */
+ ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed
+ to OS_AIO_SIMULATED_WAKE_LATER: the
+ last flag advises this function not to wake
+ i/o-handler threads, but the caller will
+ do the waking explicitly later, in this
+ way the caller can post several requests in
+ a batch; NOTE that the batch must not be
+ so big that it exhausts the slots in aio
+ arrays! NOTE that a simulated batch
+ may introduce hidden chances of deadlocks,
+ because i/os are not actually handled until
+ all have been posted: use with great
+ caution! */
+ const char* name, /*!< in: name of the file or path as a
+ null-terminated string */
+ os_file_t file, /*!< in: handle to a file */
+ void* buf, /*!< in: buffer where to read or from which
+ to write */
+ ulint offset, /*!< in: least significant 32 bits of file
+ offset where to read or write */
+ ulint offset_high, /*!< in: most significant 32 bits of
+ offset */
+ ulint n, /*!< in: number of bytes to read or write */
+ fil_node_t* message1,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ void* message2,/*!< in: message for the aio handler
+ (can be used to identify a completed
+ aio operation); ignored if mode is
+ OS_AIO_SYNC */
+ trx_t* trx)
+{
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+#ifdef WIN_ASYNC_IO
+ ibool retval;
+ BOOL ret = TRUE;
+ DWORD len = (DWORD) n;
+ struct fil_node_struct * dummy_mess1;
+ void* dummy_mess2;
+ ulint dummy_type;
+#endif
+ ulint err = 0;
+ ibool retry;
+ ulint wake_later;
+
+ ut_ad(file);
+ ut_ad(buf);
+ ut_ad(n > 0);
+ ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+ ut_ad(os_aio_validate());
+
+ wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+ mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
+
+ if (mode == OS_AIO_SYNC
+#ifdef WIN_ASYNC_IO
+ && !os_aio_use_native_aio
+#endif
+ ) {
+ /* This is actually an ordinary synchronous read or write:
+ no need to use an i/o-handler thread. NOTE that if we use
+ Windows async i/o, Windows does not allow us to use
+ ordinary synchronous os_file_read etc. on the same file,
+ therefore we have built a special mechanism for synchronous
+ wait in the Windows case. */
+
+ if (type == OS_FILE_READ) {
+ return(_os_file_read(file, buf, offset,
+ offset_high, n, trx));
+ }
+
+ ut_a(type == OS_FILE_WRITE);
+
+ return(os_file_write(name, file, buf, offset, offset_high, n));
+ }
+
+try_again:
+ if (mode == OS_AIO_NORMAL) {
+ if (type == OS_FILE_READ) {
+ array = os_aio_read_array;
+ } else {
+ array = os_aio_write_array;
+ }
+ } else if (mode == OS_AIO_IBUF) {
+ ut_ad(type == OS_FILE_READ);
+ /* Reduce probability of deadlock bugs in connection with ibuf:
+ do not let the ibuf i/o handler sleep */
+
+ wake_later = FALSE;
+
+ array = os_aio_ibuf_array;
+ } else if (mode == OS_AIO_LOG) {
+
+ array = os_aio_log_array;
+ } else if (mode == OS_AIO_SYNC) {
+ array = os_aio_sync_array;
+ } else {
+ array = NULL; /* Eliminate compiler warning */
+ ut_error;
+ }
+
+ if (trx && type == OS_FILE_READ)
+ {
+ trx->io_reads++;
+ trx->io_read += n;
+ }
+ slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
+ name, buf, offset, offset_high, n, trx);
+ if (type == OS_FILE_READ) {
+ if (os_aio_use_native_aio) {
+#ifdef WIN_ASYNC_IO
+ os_n_file_reads++;
+ os_bytes_read_since_printout += len;
+
+ ret = ReadFile(file, buf, (DWORD)n, &len,
+ &(slot->control));
+#endif
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else if (type == OS_FILE_WRITE) {
+ if (os_aio_use_native_aio) {
+#ifdef WIN_ASYNC_IO
+ os_n_file_writes++;
+ ret = WriteFile(file, buf, (DWORD)n, &len,
+ &(slot->control));
+#endif
+ } else {
+ if (!wake_later) {
+ os_aio_simulated_wake_handler_thread(
+ os_aio_get_segment_no_from_slot(
+ array, slot));
+ }
+ }
+ } else {
+ ut_error;
+ }
+
+#ifdef WIN_ASYNC_IO
+ if (os_aio_use_native_aio) {
+ if ((ret && len == n)
+ || (!ret && GetLastError() == ERROR_IO_PENDING)) {
+ /* aio was queued successfully! */
+
+ if (mode == OS_AIO_SYNC) {
+ /* We want a synchronous i/o operation on a
+ file where we also use async i/o: in Windows
+ we must use the same wait mechanism as for
+ async i/o */
+
+ retval = os_aio_windows_handle(ULINT_UNDEFINED,
+ slot->pos,
+ &dummy_mess1,
+ &dummy_mess2,
+ &dummy_type);
+
+ return(retval);
+ }
+
+ return(TRUE);
+ }
+
+ err = 1; /* Fall through the next if */
+ }
+#endif
+ if (err == 0) {
+ /* aio was queued successfully! */
+
+ return(TRUE);
+ }
+
+ os_aio_array_free_slot(array, slot);
+
+ retry = os_file_handle_error(name,
+ type == OS_FILE_READ
+ ? "aio read" : "aio write");
+ if (retry) {
+
+ goto try_again;
+ }
+
+ return(FALSE);
+}
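+
+/* Editor's note: illustrative sketch added in editing, not part of the
+original patch. It shows the batching convention described for the mode
+parameter above: post several reads with OS_AIO_SIMULATED_WAKE_LATER OR-ed
+into the mode and wake the handler threads once afterwards. All names other
+than the os_aio functions, flags and types are hypothetical. */
+#if 0
+static void
+example_post_read_batch(
+	os_file_t	file,		/* in: open data file */
+	byte**		bufs,		/* in: one page-sized buffer each */
+	ulint*		page_nos,	/* in: page numbers to read */
+	ulint		n_pages)	/* in: number of pages in the batch */
+{
+	ulint	i;
+
+	for (i = 0; i < n_pages; i++) {
+		os_aio(OS_FILE_READ,
+		       OS_AIO_NORMAL | OS_AIO_SIMULATED_WAKE_LATER,
+		       "example", file, bufs[i],
+		       page_nos[i] * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE,
+		       NULL, NULL, NULL);
+	}
+
+	/* Required: otherwise the simulated i/o handler threads may be left
+	sleeping on the batch */
+	os_aio_simulated_wake_handler_threads();
+}
+#endif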
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+ ulint segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads; if
+ this is ULINT_UNDEFINED, then it means that
+ sync aio is used, and this parameter is
+ ignored */
+ ulint pos, /*!< this parameter is used only in sync aio:
+ wait for the aio slot at this position */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ ulint orig_seg = segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+ ibool ret_val;
+ BOOL ret;
+ DWORD len;
+ BOOL retry = FALSE;
+
+ if (segment == ULINT_UNDEFINED) {
+ array = os_aio_sync_array;
+ segment = 0;
+ } else {
+ segment = os_aio_get_array_and_local_segment(&array, segment);
+ }
+
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ ut_ad(os_aio_validate());
+ ut_ad(segment < array->n_segments);
+
+ n = array->n_slots;
+
+ if (array == os_aio_sync_array) {
+ os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+ i = pos;
+ } else {
+ srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+ i = os_event_wait_multiple(n,
+ (array->native_events)
+ );
+ }
+
+ os_mutex_enter(array->mutex);
+
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ ut_a(slot->reserved);
+
+ if (orig_seg != ULINT_UNDEFINED) {
+ srv_set_io_thread_op_info(orig_seg,
+ "get windows aio return value");
+ }
+
+ ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+
+ *type = slot->type;
+
+ if (ret && len == slot->len) {
+ ret_val = TRUE;
+
+#ifdef UNIV_DO_FLUSH
+ if (slot->type == OS_FILE_WRITE
+ && !os_do_not_call_flush_at_each_write) {
+ ut_a(TRUE == os_file_flush(slot->file));
+ }
+#endif /* UNIV_DO_FLUSH */
+ } else if (os_file_handle_error(slot->name, "Windows aio")) {
+
+ retry = TRUE;
+ } else {
+
+ ret_val = FALSE;
+ }
+
+ os_mutex_exit(array->mutex);
+
+ if (retry) {
+ /* retry failed read/write operation synchronously.
+ No need to hold array->mutex. */
+
+ switch (slot->type) {
+ case OS_FILE_WRITE:
+ ret = WriteFile(slot->file, slot->buf,
+ slot->len, &len,
+ &(slot->control));
+
+ break;
+ case OS_FILE_READ:
+ ret = ReadFile(slot->file, slot->buf,
+ slot->len, &len,
+ &(slot->control));
+
+ break;
+ default:
+ ut_error;
+ }
+
+ if (!ret && GetLastError() == ERROR_IO_PENDING) {
+ /* aio was queued successfully!
+ We want a synchronous i/o operation on a
+ file where we also use async i/o: in Windows
+ we must use the same wait mechanism as for
+ async i/o */
+
+ ret = GetOverlappedResult(slot->file,
+ &(slot->control),
+ &len, TRUE);
+ }
+
+ ret_val = ret && len == slot->len;
+ }
+
+ os_aio_array_free_slot(array, slot);
+
+ return(ret_val);
+}
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+@return TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+ ulint global_segment, /*!< in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+ the last are the non-ibuf write threads */
+ fil_node_t**message1, /*!< out: the messages passed with the aio
+ request; note that also in the case where
+ the aio operation failed, these output
+ parameters are valid and can be used to
+ restart the operation, for example */
+ void** message2,
+ ulint* type) /*!< out: OS_FILE_WRITE or ..._READ */
+{
+ os_aio_array_t* array;
+ ulint segment;
+ os_aio_slot_t* slot;
+ os_aio_slot_t* slot2;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+ os_aio_slot_t* lowest_request;
+ os_aio_slot_t* oldest_request;
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ ulint lowest_offset;
+ ulint oldest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+ byte* combined_buf2;
+ ibool ret;
+ ulint n;
+ ulint i;
+ time_t now;
+
+ /* Fix compiler warning */
+ *consecutive_ios = NULL;
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+restart:
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (a)");
+ ut_ad(os_aio_validate());
+ ut_ad(segment < array->n_segments);
+
+ n = array->n_slots;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ if (array == os_aio_read_array
+ && os_aio_recommend_sleep_for_read_threads) {
+
+ /* Give other threads chance to add several i/os to the array
+ at once. */
+
+ goto recommended_sleep;
+ }
+
+ os_mutex_enter(array->mutex);
+
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (b)");
+
+ /* Check if there is a slot for which the i/o has already been
+ done */
+
+ for (i = 0; i < n; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved && slot->status == OS_AIO_DONE) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+ "InnoDB: i/o for slot %lu"
+ " already done, returning\n",
+ (ulong) i);
+ }
+
+ ret = TRUE;
+
+ goto slot_io_done;
+ }
+ }
+
+ n_consecutive = 0;
+
+ /* If there are at least 2 seconds old requests, then pick the oldest
+ one to prevent starvation. If several requests have the same age,
+ then pick the one at the lowest offset. */
+
+ biggest_age = 0;
+ now = time(NULL);
+ oldest_request = lowest_request = NULL;
+ oldest_offset = lowest_offset = ULINT_MAX;
+
+ /* Find the oldest request and the request with the smallest offset */
+ for (i = 0; i < n; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
+ age = (ulint)difftime(now, slot->reservation_time);
+
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+ && slot->offset < oldest_offset)) {
+
+ /* Found an i/o request */
+ biggest_age = age;
+ oldest_request = slot;
+ oldest_offset = slot->offset;
+ }
+
+ /* Look for an i/o request at the lowest offset in the array
+ * (we ignore the high 32 bits of the offset) */
+ if (slot->offset < lowest_offset) {
+ /* Found an i/o request */
+ lowest_request = slot;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+ if (!lowest_request && !oldest_request) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+ if (oldest_request) {
+ slot = oldest_request;
+ } else {
+ slot = lowest_request;
+ }
+ consecutive_ios[0] = slot;
+ n_consecutive = 1;
+
+ /* Check if there are several consecutive blocks to read or write */
+
+consecutive_loop:
+ for (i = 0; i < n; i++) {
+ slot2 = os_aio_array_get_nth_slot(array, i);
+
+ if (slot2->reserved && slot2 != slot
+ && slot2->offset == slot->offset + slot->len
+ /* check that sum does not wrap over */
+ && slot->offset + slot->len > slot->offset
+ && slot2->offset_high == slot->offset_high
+ && slot2->type == slot->type
+ && slot2->file == slot->file
+ && slot2->status == OS_AIO_NOT_ISSUED) {
+
+ /* Found a consecutive i/o request */
+
+ consecutive_ios[n_consecutive] = slot2;
+ n_consecutive++;
+
+ slot = slot2;
+
+ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+
+ goto consecutive_loop;
+ } else {
+ break;
+ }
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
+
+ /* We have now collected n_consecutive i/o requests in the array;
+ allocate a single buffer which can hold all data, and perform the
+ i/o */
+
+ total_len = 0;
+ slot = consecutive_ios[0];
+
+ for (i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
+ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
+ consecutive_ios[i]->status = OS_AIO_ISSUED;
+ }
+
+ if (n_consecutive == 1) {
+ /* We can use the buffer of the i/o request */
+ combined_buf = slot->buf;
+ combined_buf2 = NULL;
+ } else {
+ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
+ if (os_aio_thread_buffer[global_segment])
+ ut_free(os_aio_thread_buffer[global_segment]);
+
+ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
+ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
+ }
+ combined_buf2 = os_aio_thread_buffer[global_segment];
+
+ ut_a(combined_buf2);
+
+ combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
+ }
+
+ /* We release the array mutex for the time of the i/o: NOTE that
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
+ ut_a(slot->reserved);
+ ut_a(slot->status == OS_AIO_ISSUED);
+
+ os_mutex_exit(array->mutex);
+
+ if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+ /* Copy the buffers to the combined buffer */
+ offs = 0;
+
+ for (i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
+ consecutive_ios[i]->len);
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ srv_set_io_thread_op_info(global_segment, "doing file i/o");
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+ "InnoDB: doing i/o of type %lu at offset %lu %lu,"
+ " length %lu\n",
+ (ulong) slot->type, (ulong) slot->offset_high,
+ (ulong) slot->offset, (ulong) total_len);
+ }
+
+ /* Do the i/o with ordinary, synchronous i/o functions: */
+ if (slot->type == OS_FILE_WRITE) {
+ ret = os_file_write(slot->name, slot->file, combined_buf,
+ slot->offset, slot->offset_high,
+ total_len);
+ } else {
+ ret = os_file_read(slot->file, combined_buf,
+ slot->offset, slot->offset_high, total_len);
+ }
+
+ ut_a(ret);
+ srv_set_io_thread_op_info(global_segment, "file i/o done");
+
+#if 0
+ fprintf(stderr,
+ "aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
+ n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
+#endif
+
+ if (slot->type == OS_FILE_READ && n_consecutive > 1) {
+ /* Copy the combined buffer to individual buffers */
+ offs = 0;
+
+ for (i = 0; i < n_consecutive; i++) {
+
+ ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
+ consecutive_ios[i]->len);
+ offs += consecutive_ios[i]->len;
+ }
+ }
+
+ if (srv_recovery_stats && recv_recovery_is_on() && n_consecutive) {
+ mutex_enter(&(recv_sys->mutex));
+ if (slot->type == OS_FILE_READ) {
+ recv_sys->stats_read_io_pages += n_consecutive;
+ recv_sys->stats_read_io_consecutive[n_consecutive - 1]++;
+ } else if (slot->type == OS_FILE_WRITE) {
+ recv_sys->stats_write_io_pages += n_consecutive;
+ recv_sys->stats_write_io_consecutive[n_consecutive - 1]++;
+ }
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (i = 0; i < n_consecutive; i++) {
+ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
+ consecutive_ios[i]->status = OS_AIO_DONE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+ several slots, the messages will be returned with subsequent calls
+ of this function */
+
+slot_io_done:
+
+ ut_a(slot->reserved);
+ ut_a(slot->status == OS_AIO_DONE);
+ slot->status = OS_AIO_CLAIMED;
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+
+ *type = slot->type;
+
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, slot);
+
+ return(ret);
+
+wait_for_io:
+ srv_set_io_thread_op_info(global_segment, "resetting wait event");
+
+ /* We wait here until there again can be i/os in the segment
+ of this thread */
+
+ os_event_reset(os_aio_segment_wait_events[global_segment]);
+
+ os_mutex_exit(array->mutex);
+
+recommended_sleep:
+ srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
+
+ os_event_wait(os_aio_segment_wait_events[global_segment]);
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+ "InnoDB: i/o handler thread for i/o"
+ " segment %lu wakes up\n",
+ (ulong) global_segment);
+ }
+
+ goto restart;
+}
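+
+/* Editor's note: worked example added in editing, not part of the original
+patch. Suppose four reserved read slots for the same file hold the
+(offset, len) pairs (0, 16384), (16384, 16384), (32768, 16384) and
+(49152, 16384). Starting from the lowest offset, each next slot satisfies
+slot2->offset == slot->offset + slot->len, so n_consecutive becomes 4 and a
+single 64 KiB os_file_read() serves all four slots; the combined buffer is
+then copied back into the individual slot buffers above. */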
+
+/**********************************************************************//**
+Validates the consistency of an aio array.
+@return TRUE if ok */
+static
+ibool
+os_aio_array_validate(
+/*==================*/
+ os_aio_array_t* array) /*!< in: aio wait array */
+{
+ os_aio_slot_t* slot;
+ ulint n_reserved = 0;
+ ulint i;
+
+ ut_a(array);
+
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+ ut_a(array->n_segments > 0);
+
+ for (i = 0; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved) {
+ n_reserved++;
+ ut_a(slot->len > 0);
+ }
+ }
+
+ ut_a(array->n_reserved == n_reserved);
+
+ os_mutex_exit(array->mutex);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void)
+/*=================*/
+{
+ os_aio_array_validate(os_aio_read_array);
+ os_aio_array_validate(os_aio_write_array);
+ os_aio_array_validate(os_aio_ibuf_array);
+ os_aio_array_validate(os_aio_log_array);
+ os_aio_array_validate(os_aio_sync_array);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+ FILE* file) /*!< in: file where to print */
+{
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n_reserved;
+ time_t current_time;
+ double time_elapsed;
+ double avg_bytes_read;
+ ulint i;
+
+ for (i = 0; i < srv_n_file_io_threads; i++) {
+ fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
+ srv_io_thread_op_info[i],
+ srv_io_thread_function[i]);
+
+#ifndef __WIN__
+ if (os_aio_segment_wait_events[i]->is_set) {
+ fprintf(file, " ev set");
+ }
+#endif
+
+ fprintf(file, "\n");
+ }
+
+ fputs("Pending normal aio reads:", file);
+
+ array = os_aio_read_array;
+loop:
+ ut_a(array);
+
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+ ut_a(array->n_segments > 0);
+
+ n_reserved = 0;
+
+ for (i = 0; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved) {
+ n_reserved++;
+#if 0
+ fprintf(stderr, "Reserved slot, messages %p %p\n",
+ (void*) slot->message1,
+ (void*) slot->message2);
+#endif
+ ut_a(slot->len > 0);
+ }
+ }
+
+ ut_a(array->n_reserved == n_reserved);
+
+ fprintf(file, " %lu", (ulong) n_reserved);
+
+ os_mutex_exit(array->mutex);
+
+ if (array == os_aio_read_array) {
+ fputs(", aio writes:", file);
+
+ array = os_aio_write_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_write_array) {
+ fputs(",\n ibuf aio reads:", file);
+ array = os_aio_ibuf_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_ibuf_array) {
+ fputs(", log i/o's:", file);
+ array = os_aio_log_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_log_array) {
+ fputs(", sync i/o's:", file);
+ array = os_aio_sync_array;
+
+ goto loop;
+ }
+
+ putc('\n', file);
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+ fprintf(file,
+ "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
+ "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
+ (ulong) fil_n_pending_log_flushes,
+ (ulong) fil_n_pending_tablespace_flushes,
+ (ulong) os_n_file_reads, (ulong) os_n_file_writes,
+ (ulong) os_n_fsyncs);
+
+ if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
+ fprintf(file,
+ "%lu pending preads, %lu pending pwrites\n",
+ (ulong) os_file_n_pending_preads,
+ (ulong) os_file_n_pending_pwrites);
+ }
+
+ if (os_n_file_reads == os_n_file_reads_old) {
+ avg_bytes_read = 0.0;
+ } else {
+ avg_bytes_read = (double) os_bytes_read_since_printout
+ / (os_n_file_reads - os_n_file_reads_old);
+ }
+
+ fprintf(file,
+ "%.2f reads/s, %lu avg bytes/read,"
+ " %.2f writes/s, %.2f fsyncs/s\n",
+ (os_n_file_reads - os_n_file_reads_old)
+ / time_elapsed,
+ (ulong)avg_bytes_read,
+ (os_n_file_writes - os_n_file_writes_old)
+ / time_elapsed,
+ (os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
+
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = current_time;
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void)
+/*======================*/
+{
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+ os_n_fsyncs_old = os_n_fsyncs;
+ os_bytes_read_since_printout = 0;
+
+ os_last_printout = time(NULL);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations.
+@return TRUE if all free */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void)
+/*=======================*/
+{
+ os_aio_array_t* array;
+ ulint n_res = 0;
+
+ array = os_aio_read_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ array = os_aio_write_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ array = os_aio_ibuf_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ array = os_aio_log_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ array = os_aio_sync_array;
+
+ os_mutex_enter(array->mutex);
+
+ n_res += array->n_reserved;
+
+ os_mutex_exit(array->mutex);
+
+ if (n_res == 0) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/os/os0proc.c b/storage/xtradb/os/os0proc.c
new file mode 100644
index 00000000000..4567d96b6f4
--- /dev/null
+++ b/storage/xtradb/os/os0proc.c
@@ -0,0 +1,401 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0proc.c
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0proc.h"
+#ifdef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#include "ut0mem.h"
+#include "ut0byte.h"
+
+/* FreeBSD, for example, has only MAP_ANON; Linux has both MAP_ANONYMOUS and
+MAP_ANON, but MAP_ANON is marked as deprecated */
+#if defined(MAP_ANONYMOUS)
+#define OS_MAP_ANON MAP_ANONYMOUS
+#elif defined(MAP_ANON)
+#define OS_MAP_ANON MAP_ANON
+#endif
+
+UNIV_INTERN ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+UNIV_INTERN ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. On Linux this returns the 'process number' of the current
+thread, which is the same number one sees in 'top', for example; the thread
+id is not the same as the number shown there.
+@return process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void)
+/*====================*/
+{
+#ifdef __WIN__
+ return((ulint)GetCurrentProcessId());
+#else
+ return((ulint)getpid());
+#endif
+}
+
+/****************************************************************//**
+Allocates large-page memory, if available; otherwise falls back to a normal
+allocation.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+ ulint* n) /*!< in/out: number of bytes */
+{
+ void* ptr;
+ ulint size;
+#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+ int shmid;
+ struct shmid_ds buf;
+
+ if (!os_use_large_pages || !os_large_page_size) {
+ goto skip;
+ }
+
+ /* Align block size to os_large_page_size */
+ ut_ad(ut_is_2pow(os_large_page_size));
+ size = ut_2pow_round(*n + (os_large_page_size - 1),
+ os_large_page_size);
+
+ shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W);
+ if (shmid < 0) {
+ fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate"
+ " %lu bytes. errno %d\n", size, errno);
+ ptr = NULL;
+ } else {
+ ptr = shmat(shmid, NULL, 0);
+ if (ptr == (void *)-1) {
+ fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to"
+ " attach shared memory segment, errno %d\n",
+ errno);
+ ptr = NULL;
+ }
+
+ /* Remove the shared memory segment so that it will be
+ automatically freed after memory is detached or
+ process exits */
+ shmctl(shmid, IPC_RMID, &buf);
+ }
+
+ if (ptr) {
+ *n = size;
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+# ifdef UNIV_SET_MEM_TO_ZERO
+ memset(ptr, '\0', size);
+# endif
+ UNIV_MEM_ALLOC(ptr, size);
+ return(ptr);
+ }
+
+	fprintf(stderr, "InnoDB: HugeTLB: Warning: Using conventional"
+ " memory pool\n");
+skip:
+#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */
+
+#ifdef __WIN__
+ SYSTEM_INFO system_info;
+ GetSystemInfo(&system_info);
+
+ /* Align block size to system page size */
+ ut_ad(ut_is_2pow(system_info.dwPageSize));
+ /* system_info.dwPageSize is only 32-bit. Casting to ulint is required
+ on 64-bit Windows. */
+ size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1),
+ (ulint) system_info.dwPageSize);
+ ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE,
+ PAGE_READWRITE);
+ if (!ptr) {
+ fprintf(stderr, "InnoDB: VirtualAlloc(%lu bytes) failed;"
+ " Windows error %lu\n",
+ (ulong) size, (ulong) GetLastError());
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ }
+#elif defined __NETWARE__ || !defined OS_MAP_ANON
+ size = *n;
+ ptr = ut_malloc_low(size, TRUE, FALSE);
+#else
+# ifdef HAVE_GETPAGESIZE
+ size = getpagesize();
+# else
+ size = UNIV_PAGE_SIZE;
+# endif
+ /* Align block size to system page size */
+ ut_ad(ut_is_2pow(size));
+ size = *n = ut_2pow_round(*n + (size - 1), size);
+ ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | OS_MAP_ANON, -1, 0);
+ if (UNIV_UNLIKELY(ptr == (void*) -1)) {
+ fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;"
+ " errno %lu\n",
+ (ulong) size, (ulong) errno);
+ ptr = NULL;
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ }
+#endif
+ return(ptr);
+}
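+
+/* Editor's note: illustrative usage sketch added in editing, not part of the
+original patch. The requested size is rounded up in place, so the caller must
+keep the adjusted value and later pass the same pointer and size to
+os_mem_free_large(). */
+#if 0
+static void
+example_large_alloc(void)
+{
+	ulint	size = 64 * 1024 * 1024;	/* request 64 MiB */
+	void*	block = os_mem_alloc_large(&size);
+
+	if (block) {
+		/* use the block; size now holds the rounded-up length */
+
+		os_mem_free_large(block, size);
+	}
+}
+#endif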
+
+/****************************************************************//**
+Frees memory allocated with os_mem_alloc_large(). */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+ void *ptr, /*!< in: pointer returned by
+ os_mem_alloc_large() */
+ ulint size) /*!< in: size returned by
+ os_mem_alloc_large() */
+{
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+ if (os_use_large_pages && os_large_page_size && !shmdt(ptr)) {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ return;
+ }
+#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */
+#ifdef __WIN__
+ /* When RELEASE memory, the size parameter must be 0.
+ Do not use MEM_RELEASE with MEM_DECOMMIT. */
+ if (!VirtualFree(ptr, 0, MEM_RELEASE)) {
+ fprintf(stderr, "InnoDB: VirtualFree(%p, %lu) failed;"
+ " Windows error %lu\n",
+ ptr, (ulong) size, (ulong) GetLastError());
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ }
+#elif defined __NETWARE__ || !defined OS_MAP_ANON
+ ut_free(ptr);
+#else
+ if (munmap(ptr, size)) {
+ fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;"
+ " errno %lu\n",
+ ptr, (ulong) size, (ulong) errno);
+ } else {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ }
+#endif
+}
+
+/****************************************************************//**
+Allocates a new shared memory segment, or attaches to and reuses an
+existing one. The content is not cleared automatically.
+@return allocated memory */
+UNIV_INTERN
+void*
+os_shm_alloc(
+/*=========*/
+ ulint* n, /*!< in/out: number of bytes */
+ uint key,
+ ibool* is_new)
+{
+ void* ptr;
+#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
+ ulint size;
+ int shmid;
+
+ *is_new = FALSE;
+ fprintf(stderr,
+ "InnoDB: The shared memory segment containing the buffer pool is: key %#x (%d).\n",
+ key, key);
+# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+ if (!os_use_large_pages || !os_large_page_size) {
+ goto skip;
+ }
+
+ /* Align block size to os_large_page_size */
+ ut_ad(ut_is_2pow(os_large_page_size));
+ size = ut_2pow_round(*n + (os_large_page_size - 1),
+ os_large_page_size);
+
+ shmid = shmget((key_t)key, (size_t)size,
+ IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W);
+ if (shmid < 0) {
+ if (errno == EEXIST) {
+ fprintf(stderr,
+ "InnoDB: HugeTLB: The shared memory segment exists.\n");
+ shmid = shmget((key_t)key, (size_t)size,
+ SHM_HUGETLB | SHM_R | SHM_W);
+ if (shmid < 0) {
+ fprintf(stderr,
+ "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
+ size, errno);
+ goto skip;
+ } else {
+ fprintf(stderr,
+					"InnoDB: HugeTLB: The existing shared memory segment is used.\n");
+ }
+ } else {
+ fprintf(stderr,
+ "InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
+ size, errno);
+ goto skip;
+ }
+ } else {
+ *is_new = TRUE;
+ fprintf(stderr,
+			"InnoDB: HugeTLB: A new shared memory segment has been created.\n");
+ }
+
+ ptr = shmat(shmid, NULL, 0);
+ if (ptr == (void *)-1) {
+ fprintf(stderr,
+ "InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n",
+ errno);
+ ptr = NULL;
+ }
+
+ if (ptr) {
+ *n = size;
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ return(ptr);
+ }
+skip:
+ *is_new = FALSE;
+# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */
+# ifdef HAVE_GETPAGESIZE
+ size = getpagesize();
+# else
+ size = UNIV_PAGE_SIZE;
+# endif
+ /* Align block size to system page size */
+ ut_ad(ut_is_2pow(size));
+ size = *n = ut_2pow_round(*n + (size - 1), size);
+
+ shmid = shmget((key_t)key, (size_t)size,
+ IPC_CREAT | IPC_EXCL | SHM_R | SHM_W);
+ if (shmid < 0) {
+ if (errno == EEXIST) {
+ fprintf(stderr,
+ "InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n");
+ shmid = shmget((key_t)key, (size_t)size,
+ SHM_R | SHM_W);
+ if (shmid < 0) {
+ fprintf(stderr,
+ "InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
+ size, errno);
+ ptr = NULL;
+ goto end;
+ } else {
+ fprintf(stderr,
+					"InnoDB: The existing shared memory segment is used.\n");
+ }
+ } else {
+ fprintf(stderr,
+ "InnoDB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
+ size, errno);
+ ptr = NULL;
+ goto end;
+ }
+ } else {
+ *is_new = TRUE;
+ fprintf(stderr,
+ "InnoDB: A new shared memory segment has been created.\n");
+ }
+
+ ptr = shmat(shmid, NULL, 0);
+ if (ptr == (void *)-1) {
+ fprintf(stderr,
+ "InnoDB: Warning: Failed to attach shared memory segment, errno %d\n",
+ errno);
+ ptr = NULL;
+ }
+
+ if (ptr) {
+ *n = size;
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_total_allocated_memory += size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_ALLOC(ptr, size);
+ }
+end:
+#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+ fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
+ ptr = NULL;
+#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+ return(ptr);
+}
+
+/****************************************************************//**
+Detaches a shared memory segment. */
+UNIV_INTERN
+void
+os_shm_free(
+/*========*/
+ void *ptr, /*!< in: pointer returned by
+ os_shm_alloc() */
+ ulint size) /*!< in: size returned by
+ os_shm_alloc() */
+{
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
+ if (!shmdt(ptr)) {
+ os_fast_mutex_lock(&ut_list_mutex);
+ ut_a(ut_total_allocated_memory >= size);
+ ut_total_allocated_memory -= size;
+ os_fast_mutex_unlock(&ut_list_mutex);
+ UNIV_MEM_FREE(ptr, size);
+ }
+#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+ fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
+#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+}
diff --git a/storage/xtradb/os/os0sync.c b/storage/xtradb/os/os0sync.c
new file mode 100644
index 00000000000..f9ab58c2ee4
--- /dev/null
+++ b/storage/xtradb/os/os0sync.c
@@ -0,0 +1,762 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0sync.c
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0sync.h"
+#ifdef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <time.h>
+#endif
+
+#include "ut0mem.h"
+#include "srv0start.h"
+
+/* Type definition for an operating system mutex struct */
+struct os_mutex_struct{
+	os_event_t	event;	/*!< Used by sync0arr.c for queuing threads */
+ void* handle; /*!< OS handle to mutex */
+ ulint count; /*!< we use this counter to check
+ that the same thread does not
+ recursively lock the mutex: we
+ do not assume that the OS mutex
+ supports recursive locking, though
+ NT seems to do that */
+ UT_LIST_NODE_T(os_mutex_str_t) os_mutex_list;
+ /* list of all 'slow' OS mutexes created */
+};
+
+/** Mutex protecting counts and the lists of OS mutexes and events */
+UNIV_INTERN os_mutex_t os_sync_mutex;
+/** TRUE if os_sync_mutex has been initialized */
+static ibool os_sync_mutex_inited = FALSE;
+/** TRUE when os_sync_free() is being executed */
+static ibool os_sync_free_called = FALSE;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+UNIV_INTERN ulint os_thread_count = 0;
+
+/** The list of all events created */
+static UT_LIST_BASE_NODE_T(os_event_struct_t) os_event_list;
+
+/** The list of all OS 'slow' mutexes */
+static UT_LIST_BASE_NODE_T(os_mutex_str_t) os_mutex_list;
+
+UNIV_INTERN ulint os_event_count = 0;
+UNIV_INTERN ulint os_mutex_count = 0;
+UNIV_INTERN ulint os_fast_mutex_count = 0;
+
+/* Because a mutex is embedded inside an event and there is an
+event embedded inside a mutex, on free, this generates a recursive call.
+This version of the free event function doesn't acquire the global lock */
+static void os_event_free_internal(os_event_t event);
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists. */
+UNIV_INTERN
+void
+os_sync_init(void)
+/*==============*/
+{
+ UT_LIST_INIT(os_event_list);
+ UT_LIST_INIT(os_mutex_list);
+
+ os_sync_mutex = NULL;
+ os_sync_mutex_inited = FALSE;
+
+ os_sync_mutex = os_mutex_create(NULL);
+
+ os_sync_mutex_inited = TRUE;
+}
+
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. */
+UNIV_INTERN
+void
+os_sync_free(void)
+/*==============*/
+{
+ os_event_t event;
+ os_mutex_t mutex;
+
+ os_sync_free_called = TRUE;
+ event = UT_LIST_GET_FIRST(os_event_list);
+
+ while (event) {
+
+ os_event_free(event);
+
+ event = UT_LIST_GET_FIRST(os_event_list);
+ }
+
+ mutex = UT_LIST_GET_FIRST(os_mutex_list);
+
+ while (mutex) {
+ if (mutex == os_sync_mutex) {
+ /* Set the flag to FALSE so that we do not try to
+ reserve os_sync_mutex any more in remaining freeing
+ operations in shutdown */
+ os_sync_mutex_inited = FALSE;
+ }
+
+ os_mutex_free(mutex);
+
+ mutex = UT_LIST_GET_FIRST(os_mutex_list);
+ }
+ os_sync_free_called = FALSE;
+}
+
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which has just two
+states: signaled and nonsignaled. The created event is manual reset: it
+must be reset explicitly by calling os_event_reset().
+@return the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(
+/*============*/
+ const char* name) /*!< in: the name of the event, if NULL
+ the event is created without a name */
+{
+#ifdef __WIN__
+ os_event_t event;
+
+ event = ut_malloc(sizeof(struct os_event_struct));
+
+ event->handle = CreateEvent(NULL, /* No security attributes */
+ TRUE, /* Manual reset */
+ FALSE, /* Initial state nonsignaled */
+ (LPCTSTR) name);
+ if (!event->handle) {
+ fprintf(stderr,
+ "InnoDB: Could not create a Windows event semaphore;"
+ " Windows error %lu\n",
+ (ulong) GetLastError());
+ }
+#else /* Unix */
+ os_event_t event;
+
+ UT_NOT_USED(name);
+
+ event = ut_malloc(sizeof(struct os_event_struct));
+
+ os_fast_mutex_init(&(event->os_mutex));
+
+ ut_a(0 == pthread_cond_init(&(event->cond_var), NULL));
+
+ event->is_set = FALSE;
+
+	/* We return this value in os_event_reset(), which can then be
+	passed to os_event_wait_low(). The value of zero
+ is reserved in os_event_wait_low() for the case when the
+ caller does not want to pass any signal_count value. To
+ distinguish between the two cases we initialize signal_count
+ to 1 here. */
+ event->signal_count = 1;
+#endif /* __WIN__ */
+
+ /* The os_sync_mutex can be NULL because during startup an event
+ can be created [ because it's embedded in the mutex/rwlock ] before
+ this module has been initialized */
+ if (os_sync_mutex != NULL) {
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ /* Put to the list of events */
+ UT_LIST_ADD_FIRST(os_event_list, os_event_list, event);
+
+ os_event_count++;
+
+ if (os_sync_mutex != NULL) {
+ os_mutex_exit(os_sync_mutex);
+ }
+
+ return(event);
+}
+
+/**********************************************************//**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+ os_event_t event) /*!< in: event to set */
+{
+#ifdef __WIN__
+ ut_a(event);
+ ut_a(SetEvent(event->handle));
+#else
+ ut_a(event);
+
+ os_fast_mutex_lock(&(event->os_mutex));
+
+ if (event->is_set) {
+ /* Do nothing */
+ } else {
+ event->is_set = TRUE;
+ event->signal_count += 1;
+ ut_a(0 == pthread_cond_broadcast(&(event->cond_var)));
+ }
+
+ os_fast_mutex_unlock(&(event->os_mutex));
+#endif
+}
+
+/**********************************************************//**
+Resets an event semaphore to the nonsignaled state: threads that wait for
+the event will then block until it is set again.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low().
+@return current signal_count. */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+ os_event_t event) /*!< in: event to reset */
+{
+ ib_int64_t ret = 0;
+
+#ifdef __WIN__
+ ut_a(event);
+
+ ut_a(ResetEvent(event->handle));
+#else
+ ut_a(event);
+
+ os_fast_mutex_lock(&(event->os_mutex));
+
+ if (!event->is_set) {
+ /* Do nothing */
+ } else {
+ event->is_set = FALSE;
+ }
+ ret = event->signal_count;
+
+ os_fast_mutex_unlock(&(event->os_mutex));
+#endif
+ return(ret);
+}
+
+/**********************************************************//**
+Frees an event object, without acquiring the global lock. */
+static
+void
+os_event_free_internal(
+/*===================*/
+ os_event_t event) /*!< in: event to free */
+{
+#ifdef __WIN__
+ ut_a(event);
+
+ ut_a(CloseHandle(event->handle));
+#else
+ ut_a(event);
+
+ /* This is to avoid freeing the mutex twice */
+ os_fast_mutex_free(&(event->os_mutex));
+
+ ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
+#endif
+ /* Remove from the list of events */
+
+ UT_LIST_REMOVE(os_event_list, os_event_list, event);
+
+ os_event_count--;
+
+ ut_free(event);
+}
+
+/**********************************************************//**
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+ os_event_t event) /*!< in: event to free */
+
+{
+#ifdef __WIN__
+ ut_a(event);
+
+ ut_a(CloseHandle(event->handle));
+#else
+ ut_a(event);
+
+ os_fast_mutex_free(&(event->os_mutex));
+ ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
+#endif
+ /* Remove from the list of events */
+
+ os_mutex_enter(os_sync_mutex);
+
+ UT_LIST_REMOVE(os_event_list, os_event_list, event);
+
+ os_event_count--;
+
+ os_mutex_exit(os_sync_mutex);
+
+ ut_free(event);
+}
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state. If
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the
+waiting thread when the event becomes signaled (or immediately if the
+event is already in the signaled state).
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set() [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait() [infinite wait!]
+thread C calls os_event_wait() [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+ os_event_t event, /*!< in: event to wait */
+ ib_int64_t reset_sig_count)/*!< in: zero or the value
+ returned by previous call of
+ os_event_reset(). */
+{
+#ifdef __WIN__
+ DWORD err;
+
+ ut_a(event);
+
+ UT_NOT_USED(reset_sig_count);
+
+ /* Specify an infinite time limit for waiting */
+ err = WaitForSingleObject(event->handle, INFINITE);
+
+ ut_a(err == WAIT_OBJECT_0);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_thread_exit(NULL);
+ }
+#else
+ ib_int64_t old_signal_count;
+
+ os_fast_mutex_lock(&(event->os_mutex));
+
+ if (reset_sig_count) {
+ old_signal_count = reset_sig_count;
+ } else {
+ old_signal_count = event->signal_count;
+ }
+
+ for (;;) {
+ if (event->is_set == TRUE
+ || event->signal_count != old_signal_count) {
+
+ os_fast_mutex_unlock(&(event->os_mutex));
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+
+ os_thread_exit(NULL);
+ }
+ /* Ok, we may return */
+
+ return;
+ }
+
+ pthread_cond_wait(&(event->cond_var), &(event->os_mutex));
+
+		/* The Solaris manual says that spurious wakeups may occur:
+		we have to check whether the event really has been signaled
+		after we wake up here */
+ }
+#endif
+}
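+
+/* A minimal usage sketch of the protocol described above (the function
+example_waiter() and its work_ready flag are hypothetical and only
+illustrate the pattern): capture the count returned by os_event_reset()
+and pass it to os_event_wait_low(), so that an os_event_set() issued
+between the reset and the wait is not lost:
+
+	static void
+	example_waiter(os_event_t ev, volatile ibool* work_ready)
+	{
+		ib_int64_t	sig_count = os_event_reset(ev);
+
+		if (!*work_ready) {
+			os_event_wait_low(ev, sig_count);
+		}
+	}
+*/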
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time(
+/*===============*/
+ os_event_t event, /*!< in: event to wait */
+ ulint wtime) /*!< in: timeout in microseconds, or
+ OS_SYNC_INFINITE_TIME */
+{
+#ifdef __WIN__
+ DWORD err;
+
+ ut_a(event);
+
+ if (wtime != OS_SYNC_INFINITE_TIME) {
+ err = WaitForSingleObject(event->handle, (DWORD) wtime / 1000);
+ } else {
+ err = WaitForSingleObject(event->handle, INFINITE);
+ }
+
+ if (err == WAIT_OBJECT_0) {
+
+ return(0);
+ } else if (err == WAIT_TIMEOUT) {
+
+ return(OS_SYNC_TIME_EXCEEDED);
+ } else {
+ ut_error;
+		return(1000000); /* dummy value to eliminate a compiler warning */
+ }
+#else
+ int err;
+ int ret = 0;
+ ulint tmp;
+ ib_int64_t old_count;
+ struct timeval tv_start;
+ struct timespec timeout;
+
+ if (wtime == OS_SYNC_INFINITE_TIME) {
+ os_event_wait(event);
+ return 0;
+ }
+
+ /* Compute the absolute point in time at which to time out. */
+ gettimeofday(&tv_start, NULL);
+ tmp = tv_start.tv_usec + wtime;
+ timeout.tv_sec = tv_start.tv_sec + (tmp / 1000000);
+ timeout.tv_nsec = (tmp % 1000000) * 1000;
+
+ os_fast_mutex_lock(&(event->os_mutex));
+ old_count = event->signal_count;
+
+ for (;;) {
+ if (event->is_set == TRUE || event->signal_count != old_count)
+ break;
+
+ err = pthread_cond_timedwait(&(event->cond_var),
+ &(event->os_mutex), &timeout);
+ if (err == ETIMEDOUT) {
+ ret = OS_SYNC_TIME_EXCEEDED;
+ break;
+ }
+ }
+
+ os_fast_mutex_unlock(&(event->os_mutex));
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+
+ os_thread_exit(NULL);
+ }
+
+ return ret;
+#endif
+}
+
+#ifdef __WIN__
+/**********************************************************//**
+Waits for any event in an OS native event array. Returns as soon as at
+least one of the events is signaled or becomes signaled.
+@return index of the event which was signaled */
+UNIV_INTERN
+ulint
+os_event_wait_multiple(
+/*===================*/
+ ulint n, /*!< in: number of events in the
+ array */
+ os_native_event_t* native_event_array)
+ /*!< in: pointer to an array of event
+ handles */
+{
+ DWORD index;
+
+ ut_a(native_event_array);
+ ut_a(n > 0);
+
+ index = WaitForMultipleObjects((DWORD) n, native_event_array,
+ FALSE, /* Wait for any 1 event */
+ INFINITE); /* Infinite wait time
+ limit */
+ ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparison */
+ ut_a(index < WAIT_OBJECT_0 + n);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ os_thread_exit(NULL);
+ }
+
+ return(index - WAIT_OBJECT_0);
+}
+#endif
+
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+@return the mutex handle */
+UNIV_INTERN
+os_mutex_t
+os_mutex_create(
+/*============*/
+ const char* name) /*!< in: the name of the mutex, if NULL
+ the mutex is created without a name */
+{
+#ifdef __WIN__
+ HANDLE mutex;
+ os_mutex_t mutex_str;
+
+ mutex = CreateMutex(NULL, /* No security attributes */
+ FALSE, /* Initial state: no owner */
+ (LPCTSTR) name);
+ ut_a(mutex);
+#else
+ os_fast_mutex_t* mutex;
+ os_mutex_t mutex_str;
+
+ UT_NOT_USED(name);
+
+ mutex = ut_malloc(sizeof(os_fast_mutex_t));
+
+ os_fast_mutex_init(mutex);
+#endif
+ mutex_str = ut_malloc(sizeof(os_mutex_str_t));
+
+ mutex_str->handle = mutex;
+ mutex_str->count = 0;
+ mutex_str->event = os_event_create(NULL);
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ /* When creating os_sync_mutex itself we cannot reserve it */
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ UT_LIST_ADD_FIRST(os_mutex_list, os_mutex_list, mutex_str);
+
+ os_mutex_count++;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+
+ return(mutex_str);
+}
+
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+ os_mutex_t mutex) /*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+ DWORD err;
+
+ ut_a(mutex);
+
+ /* Specify infinite time limit for waiting */
+ err = WaitForSingleObject(mutex->handle, INFINITE);
+
+ ut_a(err == WAIT_OBJECT_0);
+
+ (mutex->count)++;
+ ut_a(mutex->count == 1);
+#else
+ os_fast_mutex_lock(mutex->handle);
+
+ (mutex->count)++;
+
+ ut_a(mutex->count == 1);
+#endif
+}
+
+/**********************************************************//**
+Releases ownership of a mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+ os_mutex_t mutex) /*!< in: mutex to release */
+{
+ ut_a(mutex);
+
+ ut_a(mutex->count == 1);
+
+ (mutex->count)--;
+#ifdef __WIN__
+ ut_a(ReleaseMutex(mutex->handle));
+#else
+ os_fast_mutex_unlock(mutex->handle);
+#endif
+}
+
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+ os_mutex_t mutex) /*!< in: mutex to free */
+{
+ ut_a(mutex);
+
+ if (UNIV_LIKELY(!os_sync_free_called)) {
+ os_event_free_internal(mutex->event);
+ }
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ UT_LIST_REMOVE(os_mutex_list, os_mutex_list, mutex);
+
+ os_mutex_count--;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+
+#ifdef __WIN__
+ ut_a(CloseHandle(mutex->handle));
+
+ ut_free(mutex);
+#else
+ os_fast_mutex_free(mutex->handle);
+ ut_free(mutex->handle);
+ ut_free(mutex);
+#endif
+}
+
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init(
+/*===============*/
+ os_fast_mutex_t* fast_mutex) /*!< in: fast mutex */
+{
+#ifdef __WIN__
+ ut_a(fast_mutex);
+
+ InitializeCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+ ut_a(0 == pthread_mutex_init(fast_mutex, MY_MUTEX_INIT_FAST));
+#endif
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ /* When creating os_sync_mutex itself (in Unix) we cannot
+ reserve it */
+
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ os_fast_mutex_count++;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+}
+
+/**********************************************************//**
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock(
+/*===============*/
+ os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+ EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+ pthread_mutex_lock(fast_mutex);
+#endif
+}
+
+/**********************************************************//**
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock(
+/*=================*/
+ os_fast_mutex_t* fast_mutex) /*!< in: mutex to release */
+{
+#ifdef __WIN__
+ LeaveCriticalSection(fast_mutex);
+#else
+ pthread_mutex_unlock(fast_mutex);
+#endif
+}
+
+/**********************************************************//**
+Frees a fast mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free(
+/*===============*/
+ os_fast_mutex_t* fast_mutex) /*!< in: mutex to free */
+{
+#ifdef __WIN__
+ ut_a(fast_mutex);
+
+ DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+ int ret;
+
+ ret = pthread_mutex_destroy(fast_mutex);
+
+ if (UNIV_UNLIKELY(ret != 0)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: error: return value %lu when calling\n"
+ "InnoDB: pthread_mutex_destroy().\n", (ulint)ret);
+ fprintf(stderr,
+ "InnoDB: Byte contents of the pthread mutex at %p:\n",
+ (void*) fast_mutex);
+ ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t));
+ putc('\n', stderr);
+ }
+#endif
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ /* When freeing the last mutexes, we have
+ already freed os_sync_mutex */
+
+ os_mutex_enter(os_sync_mutex);
+ }
+
+ ut_ad(os_fast_mutex_count > 0);
+ os_fast_mutex_count--;
+
+ if (UNIV_LIKELY(os_sync_mutex_inited)) {
+ os_mutex_exit(os_sync_mutex);
+ }
+}
diff --git a/storage/xtradb/os/os0thread.c b/storage/xtradb/os/os0thread.c
new file mode 100644
index 00000000000..34818ada804
--- /dev/null
+++ b/storage/xtradb/os/os0thread.c
@@ -0,0 +1,375 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0thread.c
+The interface to the operating system thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0thread.h"
+#ifdef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "srv0srv.h"
+#include "os0sync.h"
+
+/***************************************************************//**
+Compares two thread ids for equality.
+@return TRUE if equal */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+ os_thread_id_t a, /*!< in: OS thread or thread id */
+ os_thread_id_t b) /*!< in: OS thread or thread id */
+{
+#ifdef __WIN__
+ if (a == b) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+#else
+ if (pthread_equal(a, b)) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+#endif
+}
+
+/****************************************************************//**
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though!
+@return thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+ os_thread_id_t a) /*!< in: OS thread identifier */
+{
+#ifdef UNIV_HPUX10
+ /* In HP-UX-10.20 a pthread_t is a struct of 3 fields: field1, field2,
+ field3. We do not know if field1 determines the thread uniquely. */
+
+ return((ulint)(a.field1));
+#else
+ return((ulint)a);
+#endif
+}
+
+/*****************************************************************//**
+Returns the thread identifier of current thread. Currently the thread
+identifier in Unix is the thread handle itself. Note that in HP-UX
+pthread_t is a struct of 3 fields.
+@return current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void)
+/*=======================*/
+{
+#ifdef __WIN__
+ return(GetCurrentThreadId());
+#else
+ return(pthread_self());
+#endif
+}
+
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter
+and returns a ulint.
+@return handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create(
+/*=============*/
+#ifndef __WIN__
+ os_posix_f_t start_f,
+#else
+ ulint (*start_f)(void*), /*!< in: pointer to function
+ from which to start */
+#endif
+ void* arg, /*!< in: argument to start
+ function */
+ os_thread_id_t* thread_id) /*!< out: id of the created
+ thread, or NULL */
+{
+#ifdef __WIN__
+ os_thread_t thread;
+ DWORD win_thread_id;
+
+ os_mutex_enter(os_sync_mutex);
+ os_thread_count++;
+ os_mutex_exit(os_sync_mutex);
+
+ thread = CreateThread(NULL, /* no security attributes */
+ 0, /* default size stack */
+ (LPTHREAD_START_ROUTINE)start_f,
+ arg,
+ 0, /* thread runs immediately */
+ &win_thread_id);
+
+ if (srv_set_thread_priorities) {
+
+		/* Set the created thread's priority to the same as a normal
+		query in MySQL: we try to prevent starvation of threads by
+		assigning the same priority QUERY_PRIOR to all of them */
+
+ ut_a(SetThreadPriority(thread, srv_query_thread_priority));
+ }
+
+ if (thread_id) {
+ *thread_id = win_thread_id;
+ }
+
+ return(thread);
+#else
+ int ret;
+ os_thread_t pthread;
+ pthread_attr_t attr;
+
+#ifndef UNIV_HPUX10
+ pthread_attr_init(&attr);
+#endif
+
+#ifdef UNIV_AIX
+ /* We must make sure a thread stack is at least 32 kB, otherwise
+ InnoDB might crash; we do not know if the default stack size on
+ AIX is always big enough. An empirical test on AIX-4.3 suggested
+ the size was 96 kB, though. */
+
+ ret = pthread_attr_setstacksize(&attr,
+ (size_t)(PTHREAD_STACK_MIN
+ + 32 * 1024));
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_attr_setstacksize"
+ " returned %d\n", ret);
+ exit(1);
+ }
+#endif
+#ifdef __NETWARE__
+ ret = pthread_attr_setstacksize(&attr,
+ (size_t) NW_THD_STACKSIZE);
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_attr_setstacksize"
+ " returned %d\n", ret);
+ exit(1);
+ }
+#endif
+ os_mutex_enter(os_sync_mutex);
+ os_thread_count++;
+ os_mutex_exit(os_sync_mutex);
+
+#ifdef UNIV_HPUX10
+ ret = pthread_create(&pthread, pthread_attr_default, start_f, arg);
+#else
+ ret = pthread_create(&pthread, &attr, start_f, arg);
+#endif
+ if (ret) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_create returned %d\n", ret);
+ exit(1);
+ }
+
+#ifndef UNIV_HPUX10
+ pthread_attr_destroy(&attr);
+#endif
+ if (srv_set_thread_priorities) {
+
+ my_pthread_setprio(pthread, srv_query_thread_priority);
+ }
+
+ if (thread_id) {
+ *thread_id = pthread;
+ }
+
+ return(pthread);
+#endif
+}
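+
+/* A typical caller (my_worker_thread and my_arg are hypothetical names)
+creates a thread and optionally collects its id:
+
+	os_thread_id_t	id;
+
+	os_thread_create(&my_worker_thread, my_arg, &id);
+
+where my_worker_thread() follows the signature described above and ends
+by calling os_thread_exit(). */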
+
+/*****************************************************************//**
+Exits the current thread. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+ void* exit_value) /*!< in: exit value; in Windows this void*
+ is cast as a DWORD */
+{
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Thread exits, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ os_mutex_enter(os_sync_mutex);
+ os_thread_count--;
+ os_mutex_exit(os_sync_mutex);
+
+#ifdef __WIN__
+ ExitThread((DWORD)exit_value);
+#else
+ pthread_detach(pthread_self());
+ pthread_exit(exit_value);
+#endif
+}
+
+/*****************************************************************//**
+Returns handle to the current thread.
+@return current thread handle */
+UNIV_INTERN
+os_thread_t
+os_thread_get_curr(void)
+/*====================*/
+{
+#ifdef __WIN__
+ return(GetCurrentThread());
+#else
+ return(pthread_self());
+#endif
+}
+
+/*****************************************************************//**
+Advises the OS to give up the remainder of the thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void)
+/*=================*/
+{
+#if defined(__WIN__)
+ Sleep(0);
+#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H))
+ sched_yield();
+#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG)
+ pthread_yield();
+#elif defined(HAVE_PTHREAD_YIELD_ONE_ARG)
+ pthread_yield(0);
+#else
+ os_thread_sleep(0);
+#endif
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+ ulint tm) /*!< in: time in microseconds */
+{
+#ifdef __WIN__
+ Sleep((DWORD) tm / 1000);
+#elif defined(__NETWARE__)
+ delay(tm / 1000);
+#else
+ struct timeval t;
+
+ t.tv_sec = tm / 1000000;
+ t.tv_usec = tm % 1000000;
+
+ select(0, NULL, NULL, NULL, &t);
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Sets a thread priority. */
+UNIV_INTERN
+void
+os_thread_set_priority(
+/*===================*/
+ os_thread_t handle, /*!< in: OS handle to the thread */
+ ulint pri) /*!< in: priority */
+{
+#ifdef __WIN__
+ int os_pri;
+
+ if (pri == OS_THREAD_PRIORITY_BACKGROUND) {
+ os_pri = THREAD_PRIORITY_BELOW_NORMAL;
+ } else if (pri == OS_THREAD_PRIORITY_NORMAL) {
+ os_pri = THREAD_PRIORITY_NORMAL;
+ } else if (pri == OS_THREAD_PRIORITY_ABOVE_NORMAL) {
+ os_pri = THREAD_PRIORITY_HIGHEST;
+ } else {
+ ut_error;
+ }
+
+ ut_a(SetThreadPriority(handle, os_pri));
+#else
+ UT_NOT_USED(handle);
+ UT_NOT_USED(pri);
+#endif
+}
+
+/******************************************************************//**
+Gets a thread priority.
+@return priority */
+UNIV_INTERN
+ulint
+os_thread_get_priority(
+/*===================*/
+ os_thread_t handle __attribute__((unused)))
+ /*!< in: OS handle to the thread */
+{
+#ifdef __WIN__
+ int os_pri;
+ ulint pri;
+
+ os_pri = GetThreadPriority(handle);
+
+ if (os_pri == THREAD_PRIORITY_BELOW_NORMAL) {
+ pri = OS_THREAD_PRIORITY_BACKGROUND;
+ } else if (os_pri == THREAD_PRIORITY_NORMAL) {
+ pri = OS_THREAD_PRIORITY_NORMAL;
+ } else if (os_pri == THREAD_PRIORITY_HIGHEST) {
+ pri = OS_THREAD_PRIORITY_ABOVE_NORMAL;
+ } else {
+ ut_error;
+ }
+
+ return(pri);
+#else
+ return(0);
+#endif
+}
+
+/******************************************************************//**
+Gets the last operating system error code for the calling thread.
+@return last error on Windows, 0 otherwise */
+UNIV_INTERN
+ulint
+os_thread_get_last_error(void)
+/*==========================*/
+{
+#ifdef __WIN__
+ return(GetLastError());
+#else
+ return(0);
+#endif
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.c
new file mode 100644
index 00000000000..fa3d2532deb
--- /dev/null
+++ b/storage/xtradb/page/page0cur.c
@@ -0,0 +1,2055 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file page/page0cur.c
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0cur.h"
+#ifdef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#include "page0zip.h"
+#include "mtr0log.h"
+#include "log0recv.h"
+#include "ut0ut.h"
+#ifndef UNIV_HOTBACKUP
+#include "rem0cmp.h"
+
+#ifdef PAGE_CUR_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+static ulint page_cur_short_succ = 0;
+# endif /* UNIV_SEARCH_PERF_STAT */
+
+/*******************************************************************//**
+This is a linear congruential generator PRNG. Returns a pseudo random
+number between 0 and 2^64-1 inclusive. The formula and the constants
+being used are:
+X[n+1] = (a * X[n] + c) mod m
+where:
+X[0] = ut_time_us(NULL)
+a = 1103515245 (3^5 * 5 * 7 * 129749)
+c = 12345 (3 * 5 * 823)
+m = 18446744073709551616 (2^64)
+
+@return number between 0 and 2^64-1 */
+static
+ib_uint64_t
+page_cur_lcg_prng(void)
+/*===================*/
+{
+#define LCG_a 1103515245
+#define LCG_c 12345
+ static ib_uint64_t lcg_current = 0;
+ static ibool initialized = FALSE;
+
+ if (!initialized) {
+ lcg_current = (ib_uint64_t) ut_time_us(NULL);
+ initialized = TRUE;
+ }
+
+ /* no need to "% 2^64" explicitly because lcg_current is
+ 64 bit and this will be done anyway */
+ lcg_current = LCG_a * lcg_current + LCG_c;
+
+ return(lcg_current);
+}
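+
+/* Callers in this file reduce the 64-bit output to a range with a plain
+modulus, e.g. rnd = (ulint) (page_cur_lcg_prng() % n_recs); the small
+modulo bias is acceptable when picking a pseudo-random user record. */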
+
+/****************************************************************//**
+Tries a search shortcut based on the last insert.
+@return TRUE on success */
+UNIV_INLINE
+ibool
+page_cur_try_search_shortcut(
+/*=========================*/
+ const buf_block_t* block, /*!< in: index page */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ const rec_t* rec;
+ const rec_t* next_rec;
+ ulint low_match;
+ ulint low_bytes;
+ ulint up_match;
+ ulint up_bytes;
+#ifdef UNIV_SEARCH_DEBUG
+ page_cur_t cursor2;
+#endif
+ ibool success = FALSE;
+ const page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(dtuple_check_typed(tuple));
+
+ rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ offsets = rec_get_offsets(rec, index, offsets,
+ dtuple_get_n_fields(tuple), &heap);
+
+ ut_ad(rec);
+ ut_ad(page_rec_is_user_rec(rec));
+
+ ut_pair_min(&low_match, &low_bytes,
+ *ilow_matched_fields, *ilow_matched_bytes,
+ *iup_matched_fields, *iup_matched_bytes);
+
+ up_match = low_match;
+ up_bytes = low_bytes;
+
+ if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets,
+ &low_match, &low_bytes) < 0) {
+ goto exit_func;
+ }
+
+ next_rec = page_rec_get_next_const(rec);
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ dtuple_get_n_fields(tuple), &heap);
+
+ if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets,
+ &up_match, &up_bytes) >= 0) {
+ goto exit_func;
+ }
+
+ page_cur_position(rec, block, cursor);
+
+#ifdef UNIV_SEARCH_DEBUG
+ page_cur_search_with_match(block, index, tuple, PAGE_CUR_DBG,
+ iup_matched_fields,
+ iup_matched_bytes,
+ ilow_matched_fields,
+ ilow_matched_bytes,
+ &cursor2);
+ ut_a(cursor2.rec == cursor->rec);
+
+ if (!page_rec_is_supremum(next_rec)) {
+
+ ut_a(*iup_matched_fields == up_match);
+ ut_a(*iup_matched_bytes == up_bytes);
+ }
+
+ ut_a(*ilow_matched_fields == low_match);
+ ut_a(*ilow_matched_bytes == low_bytes);
+#endif
+ if (!page_rec_is_supremum(next_rec)) {
+
+ *iup_matched_fields = up_match;
+ *iup_matched_bytes = up_bytes;
+ }
+
+ *ilow_matched_fields = low_match;
+ *ilow_matched_bytes = low_bytes;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+ page_cur_short_succ++;
+#endif
+ success = TRUE;
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(success);
+}
+
+#endif
+
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+/****************************************************************//**
+Checks if the nth field in a record is a character type field which extends
+the nth field in tuple, i.e., the field is longer or equal in length and has
+common first characters.
+@return TRUE if rec field extends tuple field */
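+/* For example, a tuple field "abc" is extended by a record field "abcde"
+of one of the types checked below: the record field is at least as long
+and its first characters match the tuple field. */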
+static
+ibool
+page_cur_rec_field_extends(
+/*=======================*/
+ const dtuple_t* tuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint n) /*!< in: compare nth field */
+{
+ const dtype_t* type;
+ const dfield_t* dfield;
+ const byte* rec_f;
+ ulint rec_f_len;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ dfield = dtuple_get_nth_field(tuple, n);
+
+ type = dfield_get_type(dfield);
+
+ rec_f = rec_get_nth_field(rec, offsets, n, &rec_f_len);
+
+ if (type->mtype == DATA_VARCHAR
+ || type->mtype == DATA_CHAR
+ || type->mtype == DATA_FIXBINARY
+ || type->mtype == DATA_BINARY
+ || type->mtype == DATA_BLOB
+ || type->mtype == DATA_VARMYSQL
+ || type->mtype == DATA_MYSQL) {
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL
+ && rec_f_len != UNIV_SQL_NULL
+ && rec_f_len >= dfield_get_len(dfield)
+ && !cmp_data_data_slow(type->mtype, type->prtype,
+ dfield_get_data(dfield),
+ dfield_get_len(dfield),
+ rec_f, dfield_get_len(dfield))) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+/****************************************************************//**
+Searches the right position for a page cursor. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+ const buf_block_t* block, /*!< in: buffer block */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* tuple, /*!< in: data tuple */
+ ulint mode, /*!< in: PAGE_CUR_L,
+ PAGE_CUR_LE, PAGE_CUR_G, or
+ PAGE_CUR_GE */
+ ulint* iup_matched_fields,
+ /*!< in/out: already matched
+ fields in upper limit record */
+ ulint* iup_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ ulint* ilow_matched_fields,
+ /*!< in/out: already matched
+ fields in lower limit record */
+ ulint* ilow_matched_bytes,
+ /*!< in/out: already matched
+ bytes in a field not yet
+ completely matched */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint up;
+ ulint low;
+ ulint mid;
+ const page_t* page;
+ const page_dir_slot_t* slot;
+ const rec_t* up_rec;
+ const rec_t* low_rec;
+ const rec_t* mid_rec;
+ ulint up_matched_fields;
+ ulint up_matched_bytes;
+ ulint low_matched_fields;
+ ulint low_matched_bytes;
+ ulint cur_matched_fields;
+ ulint cur_matched_bytes;
+ int cmp;
+#ifdef UNIV_SEARCH_DEBUG
+ int dbg_cmp;
+ ulint dbg_matched_fields;
+ ulint dbg_matched_bytes;
+#endif
+#ifdef UNIV_ZIP_DEBUG
+ const page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes
+ && ilow_matched_fields && ilow_matched_bytes && cursor);
+ ut_ad(dtuple_validate(tuple));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+ if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+ || mode == PAGE_CUR_G || mode == PAGE_CUR_GE);
+#endif /* UNIV_DEBUG */
+ page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_check_dir(page);
+
+#ifdef PAGE_CUR_ADAPT
+ if (page_is_leaf(page)
+ && (mode == PAGE_CUR_LE)
+ && (page_header_get_field(page, PAGE_N_DIRECTION) > 3)
+ && (page_header_get_ptr(page, PAGE_LAST_INSERT))
+ && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) {
+
+ if (page_cur_try_search_shortcut(
+ block, index, tuple,
+ iup_matched_fields, iup_matched_bytes,
+ ilow_matched_fields, ilow_matched_bytes,
+ cursor)) {
+ return;
+ }
+ }
+# ifdef PAGE_CUR_DBG
+ if (mode == PAGE_CUR_DBG) {
+ mode = PAGE_CUR_LE;
+ }
+# endif
+#endif
+
+ /* The following flag does not work for non-latin1 char sets because
+ cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+ /* If mode PAGE_CUR_G is specified, we are trying to position the
+ cursor to answer a query of the form "tuple < X", where tuple is
+ the input parameter, and X denotes an arbitrary physical record on
+ the page. We want to position the cursor on the first X which
+ satisfies the condition. */
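+
+	/* To summarize the search modes (this follows from the choice of
+	up_rec or low_rec at the end of this function): PAGE_CUR_G and
+	PAGE_CUR_GE position the cursor on the first record > tuple and
+	>= tuple, respectively (the page supremum if no such record
+	exists); PAGE_CUR_L and PAGE_CUR_LE position it on the last
+	record < tuple and <= tuple, respectively (the page infimum if
+	no such record exists). */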
+
+ up_matched_fields = *iup_matched_fields;
+ up_matched_bytes = *iup_matched_bytes;
+ low_matched_fields = *ilow_matched_fields;
+ low_matched_bytes = *ilow_matched_bytes;
+
+ /* Perform binary search. First the search is done through the page
+ directory, after that as a linear search in the list of records
+ owned by the upper limit directory slot. */
+
+ low = 0;
+ up = page_dir_get_n_slots(page) - 1;
+
+	/* Perform binary search until the lower and upper limit directory
+	slots are within a distance of 1 from each other */
+
+ while (up - low > 1) {
+ mid = (low + up) / 2;
+ slot = page_dir_get_nth_slot(page, mid);
+ mid_rec = page_dir_slot_get_rec(slot);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ offsets = rec_get_offsets(mid_rec, index, offsets,
+ dtuple_get_n_fields_cmp(tuple),
+ &heap);
+
+ cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
+ &cur_matched_fields,
+ &cur_matched_bytes);
+ if (UNIV_LIKELY(cmp > 0)) {
+low_slot_match:
+ low = mid;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (UNIV_EXPECT(cmp, -1)) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_slot_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+ up = mid;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+
+ goto low_slot_match;
+ } else {
+
+ goto up_slot_match;
+ }
+ }
+
+ slot = page_dir_get_nth_slot(page, low);
+ low_rec = page_dir_slot_get_rec(slot);
+ slot = page_dir_get_nth_slot(page, up);
+ up_rec = page_dir_slot_get_rec(slot);
+
+	/* Perform linear search until the upper and lower records are
+	within a distance of 1 from each other. */
+
+ while (page_rec_get_next_const(low_rec) != up_rec) {
+
+ mid_rec = page_rec_get_next_const(low_rec);
+
+ ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+ low_matched_fields, low_matched_bytes,
+ up_matched_fields, up_matched_bytes);
+
+ offsets = rec_get_offsets(mid_rec, index, offsets,
+ dtuple_get_n_fields_cmp(tuple),
+ &heap);
+
+ cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
+ &cur_matched_fields,
+ &cur_matched_bytes);
+ if (UNIV_LIKELY(cmp > 0)) {
+low_rec_match:
+ low_rec = mid_rec;
+ low_matched_fields = cur_matched_fields;
+ low_matched_bytes = cur_matched_bytes;
+
+ } else if (UNIV_EXPECT(cmp, -1)) {
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ if (mode == PAGE_CUR_LE_OR_EXTENDS
+ && page_cur_rec_field_extends(
+ tuple, mid_rec, offsets,
+ cur_matched_fields)) {
+
+ goto low_rec_match;
+ }
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+ up_rec = mid_rec;
+ up_matched_fields = cur_matched_fields;
+ up_matched_bytes = cur_matched_bytes;
+ } else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+ || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+ ) {
+
+ goto low_rec_match;
+ } else {
+
+ goto up_rec_match;
+ }
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+
+ /* Check that the lower and upper limit records have the
+ right alphabetical order compared to tuple. */
+ dbg_matched_fields = 0;
+ dbg_matched_bytes = 0;
+
+ offsets = rec_get_offsets(low_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets,
+ &dbg_matched_fields,
+ &dbg_matched_bytes);
+ if (mode == PAGE_CUR_G) {
+ ut_a(dbg_cmp >= 0);
+ } else if (mode == PAGE_CUR_GE) {
+ ut_a(dbg_cmp == 1);
+ } else if (mode == PAGE_CUR_L) {
+ ut_a(dbg_cmp == 1);
+ } else if (mode == PAGE_CUR_LE) {
+ ut_a(dbg_cmp >= 0);
+ }
+
+ if (!page_rec_is_infimum(low_rec)) {
+
+ ut_a(low_matched_fields == dbg_matched_fields);
+ ut_a(low_matched_bytes == dbg_matched_bytes);
+ }
+
+ dbg_matched_fields = 0;
+ dbg_matched_bytes = 0;
+
+ offsets = rec_get_offsets(up_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets,
+ &dbg_matched_fields,
+ &dbg_matched_bytes);
+ if (mode == PAGE_CUR_G) {
+ ut_a(dbg_cmp == -1);
+ } else if (mode == PAGE_CUR_GE) {
+ ut_a(dbg_cmp <= 0);
+ } else if (mode == PAGE_CUR_L) {
+ ut_a(dbg_cmp <= 0);
+ } else if (mode == PAGE_CUR_LE) {
+ ut_a(dbg_cmp == -1);
+ }
+
+ if (!page_rec_is_supremum(up_rec)) {
+
+ ut_a(up_matched_fields == dbg_matched_fields);
+ ut_a(up_matched_bytes == dbg_matched_bytes);
+ }
+#endif
+ if (mode <= PAGE_CUR_GE) {
+ page_cur_position(up_rec, block, cursor);
+ } else {
+ page_cur_position(low_rec, block, cursor);
+ }
+
+ *iup_matched_fields = up_matched_fields;
+ *iup_matched_bytes = up_matched_bytes;
+ *ilow_matched_fields = low_matched_fields;
+ *ilow_matched_bytes = low_matched_bytes;
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor) /*!< out: page cursor */
+{
+ ulint rnd;
+ ulint n_recs = page_get_n_recs(buf_block_get_frame(block));
+
+ page_cur_set_before_first(block, cursor);
+
+ if (UNIV_UNLIKELY(n_recs == 0)) {
+
+ return;
+ }
+
+ rnd = (ulint) (page_cur_lcg_prng() % n_recs);
+
+ do {
+ page_cur_move_to_next(cursor);
+ } while (rnd--);
+}
+
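+/***********************************************************//**
+Positions a page cursor on the nth user record on a page (1-based). If
+nth is larger than the number of user records, the cursor is set on the
+last user record; if there are no user records, it is set on the infimum
+record. */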
+UNIV_INTERN
+void
+page_cur_open_on_nth_user_rec(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor, /*!< out: page cursor */
+	ulint		nth)	/*!< in: number of the user record to
+					position the cursor on (1-based) */
+{
+ ulint n_recs = page_get_n_recs(buf_block_get_frame(block));
+
+ page_cur_set_before_first(block, cursor);
+
+ if (UNIV_UNLIKELY(n_recs == 0)) {
+
+ return;
+ }
+
+ nth--;
+
+ if (nth >= n_recs) {
+ nth = n_recs - 1;
+ }
+
+ do {
+ page_cur_move_to_next(cursor);
+ } while (nth--);
+}
+
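+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record whose 1-based
+number on the page is >= nth. If there are no user records, sets the
+cursor on the infimum record.
+@return TRUE if the nth user record itself was chosen */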
+UNIV_INTERN
+ibool
+page_cur_open_on_rnd_user_rec_after_nth(
+/*==========================*/
+ buf_block_t* block, /*!< in: page */
+ page_cur_t* cursor, /*!< out: page cursor */
+	ulint		nth)	/*!< in: smallest user record number
+					that may be chosen (1-based) */
+{
+ ulint rnd;
+ ulint n_recs = page_get_n_recs(buf_block_get_frame(block));
+ ibool ret;
+
+ page_cur_set_before_first(block, cursor);
+
+ if (UNIV_UNLIKELY(n_recs == 0)) {
+
+ return (FALSE);
+ }
+
+ nth--;
+
+ if (nth >= n_recs) {
+ nth = n_recs - 1;
+ }
+
+ rnd = (ulint) (nth + page_cur_lcg_prng() % (n_recs - nth));
+
+ if (rnd == nth) {
+ ret = TRUE;
+ } else {
+ ret = FALSE;
+ }
+
+ do {
+ page_cur_move_to_next(cursor);
+ } while (rnd--);
+
+ return (ret);
+}
+
+/***********************************************************//**
+Writes the log record of a record insert on a page. */
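+/* A sketch of the log record body as assembled below (and parsed back in
+page_cur_parse_insert_rec()), after the initial MLOG_REC_INSERT or
+MLOG_COMP_REC_INSERT header:
+	2 bytes		page offset of the cursor record (omitted when the
+			mtr log mode is MTR_LOG_SHORT_INSERTS)
+	compressed	2 * (length of the end segment that differs from
+			the cursor record) + extra-info flag
+	if the extra-info flag is set:
+	1 byte		info and status bits of the inserted record
+	compressed	origin offset (extra size) of the inserted record
+	compressed	mismatch index, i.e. length of the prefix shared
+			with the cursor record
+	n bytes		the differing end segment of the inserted record */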
+static
+void
+page_cur_insert_rec_write_log(
+/*==========================*/
+ rec_t* insert_rec, /*!< in: inserted physical record */
+ ulint rec_size, /*!< in: insert_rec size */
+ rec_t* cursor_rec, /*!< in: record the
+ cursor is pointing to */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ ulint cur_rec_size;
+ ulint extra_size;
+ ulint cur_extra_size;
+ const byte* ins_ptr;
+ byte* log_ptr;
+ const byte* log_end;
+ ulint i;
+
+ ut_a(rec_size < UNIV_PAGE_SIZE);
+ ut_ad(page_align(insert_rec) == page_align(cursor_rec));
+ ut_ad(!page_rec_is_comp(insert_rec)
+ == !dict_table_is_comp(index->table));
+
+ {
+ mem_heap_t* heap = NULL;
+ ulint cur_offs_[REC_OFFS_NORMAL_SIZE];
+ ulint ins_offs_[REC_OFFS_NORMAL_SIZE];
+
+ ulint* cur_offs;
+ ulint* ins_offs;
+
+ rec_offs_init(cur_offs_);
+ rec_offs_init(ins_offs_);
+
+ cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_,
+ ULINT_UNDEFINED, &heap);
+ ins_offs = rec_get_offsets(insert_rec, index, ins_offs_,
+ ULINT_UNDEFINED, &heap);
+
+ extra_size = rec_offs_extra_size(ins_offs);
+ cur_extra_size = rec_offs_extra_size(cur_offs);
+ ut_ad(rec_size == rec_offs_size(ins_offs));
+ cur_rec_size = rec_offs_size(cur_offs);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ ins_ptr = insert_rec - extra_size;
+
+ i = 0;
+
+ if (cur_extra_size == extra_size) {
+ ulint min_rec_size = ut_min(cur_rec_size, rec_size);
+
+ const byte* cur_ptr = cursor_rec - cur_extra_size;
+
+ /* Find out the first byte in insert_rec which differs from
+ cursor_rec; skip the bytes in the record info */
+
+ do {
+ if (*ins_ptr == *cur_ptr) {
+ i++;
+ ins_ptr++;
+ cur_ptr++;
+ } else if ((i < extra_size)
+ && (i >= extra_size
+ - page_rec_get_base_extra_size
+ (insert_rec))) {
+ i = extra_size;
+ ins_ptr = insert_rec;
+ cur_ptr = cursor_rec;
+ } else {
+ break;
+ }
+ } while (i < min_rec_size);
+ }
+
+ if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) {
+
+ if (page_rec_is_comp(insert_rec)) {
+ log_ptr = mlog_open_and_write_index(
+ mtr, insert_rec, index, MLOG_COMP_REC_INSERT,
+ 2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ /* Logging in mtr is switched off
+ during crash recovery: in that case
+ mlog_open returns NULL */
+ return;
+ }
+ } else {
+ log_ptr = mlog_open(mtr, 11
+ + 2 + 5 + 1 + 5 + 5
+ + MLOG_BUF_MARGIN);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ /* Logging in mtr is switched off
+ during crash recovery: in that case
+ mlog_open returns NULL */
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ insert_rec, MLOG_REC_INSERT, log_ptr, mtr);
+ }
+
+ log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
+ /* Write the cursor rec offset as a 2-byte ulint */
+ mach_write_to_2(log_ptr, page_offset(cursor_rec));
+ log_ptr += 2;
+ } else {
+ log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash
+ recovery: in that case mlog_open returns NULL */
+ return;
+ }
+ log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
+ }
+
+ if (page_rec_is_comp(insert_rec)) {
+ if (UNIV_UNLIKELY
+ (rec_get_info_and_status_bits(insert_rec, TRUE)
+ != rec_get_info_and_status_bits(cursor_rec, TRUE))) {
+
+ goto need_extra_info;
+ }
+ } else {
+ if (UNIV_UNLIKELY
+ (rec_get_info_and_status_bits(insert_rec, FALSE)
+ != rec_get_info_and_status_bits(cursor_rec, FALSE))) {
+
+ goto need_extra_info;
+ }
+ }
+
+ if (extra_size != cur_extra_size || rec_size != cur_rec_size) {
+need_extra_info:
+ /* Write the record end segment length
+ and the extra info storage flag */
+ log_ptr += mach_write_compressed(log_ptr,
+ 2 * (rec_size - i) + 1);
+
+ /* Write the info bits */
+ mach_write_to_1(log_ptr,
+ rec_get_info_and_status_bits(
+ insert_rec,
+ page_rec_is_comp(insert_rec)));
+ log_ptr++;
+
+ /* Write the record origin offset */
+ log_ptr += mach_write_compressed(log_ptr, extra_size);
+
+ /* Write the mismatch index */
+ log_ptr += mach_write_compressed(log_ptr, i);
+
+ ut_a(i < UNIV_PAGE_SIZE);
+ ut_a(extra_size < UNIV_PAGE_SIZE);
+ } else {
+ /* Write the record end segment length
+ and the extra info storage flag */
+ log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i));
+ }
+
+ /* Write to the log the inserted index record end segment which
+ differs from the cursor record */
+
+ rec_size -= i;
+
+ if (log_ptr + rec_size <= log_end) {
+ memcpy(log_ptr, ins_ptr, rec_size);
+ mlog_close(mtr, log_ptr + rec_size);
+ } else {
+ mlog_close(mtr, log_ptr);
+ ut_a(rec_size < UNIV_PAGE_SIZE);
+ mlog_catenate_string(mtr, ins_ptr, rec_size);
+ }
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_cur_insert_rec_write_log(ins_rec,size,cur,index,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a log record of a record insert on a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+ ibool is_short,/*!< in: TRUE if short inserts */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint origin_offset;
+ ulint end_seg_len;
+ ulint mismatch_index;
+ page_t* page;
+ rec_t* cursor_rec;
+ byte buf1[1024];
+ byte* buf;
+ byte* ptr2 = ptr;
+ ulint info_and_status_bits = 0; /* remove warning */
+ page_cur_t cursor;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page = block ? buf_block_get_frame(block) : NULL;
+
+ if (is_short) {
+ cursor_rec = page_rec_get_prev(page_get_supremum_rec(page));
+ } else {
+ ulint offset;
+
+ /* Read the cursor rec offset as a 2-byte ulint */
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + 2)) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ cursor_rec = page + offset;
+
+ if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) {
+
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+ }
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) {
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (end_seg_len & 0x1UL) {
+ /* Read the info bits */
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_and_status_bits = mach_read_from_1(ptr);
+ ptr++;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(origin_offset < UNIV_PAGE_SIZE);
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(mismatch_index < UNIV_PAGE_SIZE);
+ }
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) {
+
+ return(NULL);
+ }
+
+ if (!block) {
+
+ return(ptr + (end_seg_len >> 1));
+ }
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+ ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
+
+ /* Read from the log the inserted index record end segment which
+ differs from the cursor record */
+
+ offsets = rec_get_offsets(cursor_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!(end_seg_len & 0x1UL)) {
+ info_and_status_bits = rec_get_info_and_status_bits(
+ cursor_rec, page_is_comp(page));
+ origin_offset = rec_offs_extra_size(offsets);
+ mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1);
+ }
+
+ end_seg_len >>= 1;
+
+ if (mismatch_index + end_seg_len < sizeof buf1) {
+ buf = buf1;
+ } else {
+ buf = mem_alloc(mismatch_index + end_seg_len);
+ }
+
+ /* Build the inserted record to buf */
+
+ if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "Is short %lu, info_and_status_bits %lu, offset %lu, "
+ "o_offset %lu\n"
+ "mismatch index %lu, end_seg_len %lu\n"
+ "parsed len %lu\n",
+ (ulong) is_short, (ulong) info_and_status_bits,
+ (ulong) page_offset(cursor_rec),
+ (ulong) origin_offset,
+ (ulong) mismatch_index, (ulong) end_seg_len,
+ (ulong) (ptr - ptr2));
+
+ fputs("Dump of 300 bytes of log:\n", stderr);
+ ut_print_buf(stderr, ptr2, 300);
+ putc('\n', stderr);
+
+ buf_page_print(page, 0);
+
+ ut_error;
+ }
+
+ ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index);
+ ut_memcpy(buf + mismatch_index, ptr, end_seg_len);
+
+ if (page_is_comp(page)) {
+ rec_set_info_and_status_bits(buf + origin_offset,
+ info_and_status_bits);
+ } else {
+ rec_set_info_bits_old(buf + origin_offset,
+ info_and_status_bits);
+ }
+
+ page_cur_position(cursor_rec, block, &cursor);
+
+ offsets = rec_get_offsets(buf + origin_offset, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor,
+ buf + origin_offset,
+ index, offsets, mtr))) {
+ /* The redo log record should only have been written
+ after the write was successful. */
+ ut_error;
+ }
+
+ if (buf != buf1) {
+
+ mem_free(buf);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ return(ptr + end_seg_len);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns pointer to inserted record if succeed, i.e., enough
+space available, NULL otherwise. The cursor stays at the same position.
+@return pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+ rec_t* current_rec,/*!< in: pointer to current record after
+ which the new record is inserted */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ byte* insert_buf;
+ ulint rec_size;
+ page_t* page; /*!< the relevant page */
+ rec_t* last_insert; /*!< cursor position at previous
+ insert */
+ rec_t* free_rec; /*!< a free record that was reused,
+ or NULL */
+ rec_t* insert_rec; /*!< inserted record */
+ ulint heap_no; /*!< heap number of the inserted
+ record */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ page = page_align(current_rec);
+ ut_ad(dict_table_is_comp(index->table)
+ == (ibool) !!page_is_comp(page));
+
+ ut_ad(!page_rec_is_supremum(current_rec));
+
+ /* 1. Get the size of the physical record in the page */
+ rec_size = rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ const void* rec_start
+ = rec - rec_offs_extra_size(offsets);
+ ulint extra_size
+ = rec_offs_extra_size(offsets)
+ - (rec_offs_comp(offsets)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+
+ /* All data bytes of the record must be valid. */
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ /* 2. Try to find suitable space from page memory management */
+
+ free_rec = page_header_get_ptr(page, PAGE_FREE);
+ if (UNIV_LIKELY_NULL(free_rec)) {
+ /* Try to allocate from the head of the free list. */
+ ulint foffsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* foffsets = foffsets_;
+ mem_heap_t* heap = NULL;
+
+ rec_offs_init(foffsets_);
+
+ foffsets = rec_get_offsets(free_rec, index, foffsets,
+ ULINT_UNDEFINED, &heap);
+ if (rec_offs_size(foffsets) < rec_size) {
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ goto use_heap;
+ }
+
+ insert_buf = free_rec - rec_offs_extra_size(foffsets);
+
+ if (page_is_comp(page)) {
+ heap_no = rec_get_heap_no_new(free_rec);
+ page_mem_alloc_free(page, NULL,
+ rec_get_next_ptr(free_rec, TRUE),
+ rec_size);
+ } else {
+ heap_no = rec_get_heap_no_old(free_rec);
+ page_mem_alloc_free(page, NULL,
+ rec_get_next_ptr(free_rec, FALSE),
+ rec_size);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+use_heap:
+ free_rec = NULL;
+ insert_buf = page_mem_alloc_heap(page, NULL,
+ rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(insert_buf == NULL)) {
+ return(NULL);
+ }
+ }
+
+ /* 3. Create the record */
+ insert_rec = rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(current_rec != insert_rec);
+
+ {
+ /* next record after current before the insertion */
+ rec_t* next_rec = page_rec_get_next(current_rec);
+#ifdef UNIV_DEBUG
+ if (page_is_comp(page)) {
+ ut_ad(rec_get_status(current_rec)
+ <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+ }
+#endif
+ page_rec_set_next(insert_rec, next_rec);
+ page_rec_set_next(current_rec, insert_rec);
+ }
+
+ page_header_set_field(page, NULL, PAGE_N_RECS,
+ 1 + page_get_n_recs(page));
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ if (page_is_comp(page)) {
+ rec_set_n_owned_new(insert_rec, NULL, 0);
+ rec_set_heap_no_new(insert_rec, heap_no);
+ } else {
+ rec_set_n_owned_old(insert_rec, 0);
+ rec_set_heap_no_old(insert_rec, heap_no);
+ }
+
+ UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+ /* 6. Update the last insertion info in page header */
+
+ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ ut_ad(!last_insert || !page_is_comp(page)
+ || rec_get_node_ptr_flag(last_insert)
+ == rec_get_node_ptr_flag(insert_rec));
+
+ if (UNIV_UNLIKELY(last_insert == NULL)) {
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+
+ } else if ((last_insert == current_rec)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_LEFT)) {
+
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_RIGHT);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+
+ } else if ((page_rec_get_next(insert_rec) == last_insert)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_RIGHT)) {
+
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_LEFT);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+ } else {
+ page_header_set_field(page, NULL, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+ }
+
+ page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec);
+
+ /* 7. It remains to update the owner record. */
+ {
+ rec_t* owner_rec = page_rec_find_owner_rec(insert_rec);
+ ulint n_owned;
+ if (page_is_comp(page)) {
+ n_owned = rec_get_n_owned_new(owner_rec);
+ rec_set_n_owned_new(owner_rec, NULL, n_owned + 1);
+ } else {
+ n_owned = rec_get_n_owned_old(owner_rec);
+ rec_set_n_owned_old(owner_rec, n_owned + 1);
+ }
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+ page_dir_split_slot(
+ page, NULL,
+ page_dir_find_owner_slot(owner_rec));
+ }
+ }
+
+ /* 9. Write log record of the insert */
+ if (UNIV_LIKELY(mtr != NULL)) {
+ page_cur_insert_rec_write_log(insert_rec, rec_size,
+ current_rec, index, mtr);
+ }
+
+ return(insert_rec);
+}
+
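+/* A minimal usage sketch for page_cur_insert_rec_low() (the caller-side
+variables are hypothetical; only the functions named exist in this file).
+The caller supplies the offsets of "rec" and a cursor record on the page:
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	insert_rec = page_cur_insert_rec_low(current_rec, index,
+					     rec, offsets, mtr);
+
+A NULL return means there was not enough space; the caller then reorganizes
+the page or splits the B-tree node and retries. */
+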
+/***********************************************************//**
+Compresses or reorganizes a page after an optimistic insert.
+@return rec on success, NULL otherwise */
+static
+rec_t*
+page_cur_insert_rec_zip_reorg(
+/*==========================*/
+ rec_t** current_rec,/*!< in/out: pointer to current record after
+ which the new record is inserted */
+ buf_block_t* block, /*!< in: buffer block */
+ dict_index_t* index, /*!< in: record descriptor */
+ rec_t* rec, /*!< in: inserted record */
+ page_t* page, /*!< in: uncompressed page */
+ page_zip_des_t* page_zip,/*!< in: compressed page */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ ulint pos;
+
+ /* Recompress or reorganize and recompress the page. */
+ if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) {
+ return(rec);
+ }
+
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ pos = page_rec_get_n_recs_before(rec);
+
+ if (page_zip_reorganize(block, index, mtr)) {
+ /* The page was reorganized: Find rec by seeking to pos,
+ and update *current_rec. */
+ rec = page + PAGE_NEW_INFIMUM;
+
+ while (--pos) {
+ rec = page + rec_get_next_offs(rec, TRUE);
+ }
+
+ *current_rec = rec;
+ rec = page + rec_get_next_offs(rec, TRUE);
+
+ return(rec);
+ }
+
+ /* Out of space: restore the page */
+ if (!page_zip_decompress(page_zip, page, FALSE)) {
+ ut_error; /* Memory corrupted? */
+ }
+ ut_ad(page_validate(page, index));
+ return(NULL);
+}
+
+/***********************************************************//**
+Inserts a record next to the page cursor on a compressed page (and on
+its uncompressed copy). Returns a pointer to the inserted record on
+success, i.e., if enough space is available; NULL otherwise.
+The cursor stays at the same position.
+@return pointer to record on success, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+ rec_t** current_rec,/*!< in/out: pointer to current record after
+ which the new record is inserted */
+ buf_block_t* block, /*!< in: buffer block of *current_rec */
+ dict_index_t* index, /*!< in: record descriptor */
+ const rec_t* rec, /*!< in: pointer to a physical record */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle, or NULL */
+{
+ byte* insert_buf;
+ ulint rec_size;
+ page_t* page; /*!< the relevant page */
+ rec_t* last_insert; /*!< cursor position at previous
+ insert */
+ rec_t* free_rec; /*!< a free record that was reused,
+ or NULL */
+ rec_t* insert_rec; /*!< inserted record */
+ ulint heap_no; /*!< heap number of the inserted
+ record */
+ page_zip_des_t* page_zip;
+
+ page_zip = buf_block_get_page_zip(block);
+ ut_ad(page_zip);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ page = page_align(*current_rec);
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(page_is_comp(page));
+
+ ut_ad(!page_rec_is_supremum(*current_rec));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ /* 1. Get the size of the physical record in the page */
+ rec_size = rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ {
+ const void* rec_start
+ = rec - rec_offs_extra_size(offsets);
+ ulint extra_size
+ = rec_offs_extra_size(offsets)
+ - (rec_offs_comp(offsets)
+ ? REC_N_NEW_EXTRA_BYTES
+ : REC_N_OLD_EXTRA_BYTES);
+
+ /* All data bytes of the record must be valid. */
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ /* The variable-length header must be valid. */
+ UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+ }
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ /* 2. Try to find suitable space from page memory management */
+ if (!page_zip_available(page_zip, dict_index_is_clust(index),
+ rec_size, 1)) {
+
+ /* Try compressing the whole page afterwards. */
+ insert_rec = page_cur_insert_rec_low(*current_rec,
+ index, rec, offsets,
+ NULL);
+
+ if (UNIV_LIKELY(insert_rec != NULL)) {
+ insert_rec = page_cur_insert_rec_zip_reorg(
+ current_rec, block, index, insert_rec,
+ page, page_zip, mtr);
+ }
+
+ return(insert_rec);
+ }
+
+ free_rec = page_header_get_ptr(page, PAGE_FREE);
+ if (UNIV_LIKELY_NULL(free_rec)) {
+ /* Try to allocate from the head of the free list. */
+ lint extra_size_diff;
+ ulint foffsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* foffsets = foffsets_;
+ mem_heap_t* heap = NULL;
+
+ rec_offs_init(foffsets_);
+
+ foffsets = rec_get_offsets(free_rec, index, foffsets,
+ ULINT_UNDEFINED, &heap);
+ if (rec_offs_size(foffsets) < rec_size) {
+too_small:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ goto use_heap;
+ }
+
+ insert_buf = free_rec - rec_offs_extra_size(foffsets);
+
+ /* On compressed pages, do not relocate records from
+ the free list. If extra_size would grow, use the heap. */
+ extra_size_diff
+ = rec_offs_extra_size(offsets)
+ - rec_offs_extra_size(foffsets);
+
+ if (UNIV_UNLIKELY(extra_size_diff < 0)) {
+ /* Add an offset to the extra_size. */
+ if (rec_offs_size(foffsets)
+ < rec_size - extra_size_diff) {
+
+ goto too_small;
+ }
+
+ insert_buf -= extra_size_diff;
+ } else if (UNIV_UNLIKELY(extra_size_diff)) {
+ /* Do not allow extra_size to grow */
+
+ goto too_small;
+ }
+
+ heap_no = rec_get_heap_no_new(free_rec);
+ page_mem_alloc_free(page, page_zip,
+ rec_get_next_ptr(free_rec, TRUE),
+ rec_size);
+
+ if (!page_is_leaf(page)) {
+ /* Zero out the node pointer of free_rec,
+ in case it will not be overwritten by
+ insert_rec. */
+
+ ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+ if (rec_offs_extra_size(foffsets)
+ + rec_offs_data_size(foffsets) > rec_size) {
+
+ memset(rec_get_end(free_rec, foffsets)
+ - REC_NODE_PTR_SIZE, 0,
+ REC_NODE_PTR_SIZE);
+ }
+ } else if (dict_index_is_clust(index)) {
+ /* Zero out the DB_TRX_ID and DB_ROLL_PTR
+ columns of free_rec, in case it will not be
+ overwritten by insert_rec. */
+
+ ulint trx_id_col;
+ ulint trx_id_offs;
+ ulint len;
+
+ trx_id_col = dict_index_get_sys_col_pos(index,
+ DATA_TRX_ID);
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ trx_id_offs = rec_get_nth_field_offs(foffsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs
+ + rec_offs_extra_size(foffsets) > rec_size) {
+ /* We will have to zero out the
+ DB_TRX_ID and DB_ROLL_PTR, because
+ they will not be fully overwritten by
+ insert_rec. */
+
+ memset(free_rec + trx_id_offs, 0,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ }
+
+ ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN
+ == rec_get_nth_field(free_rec, foffsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ } else {
+use_heap:
+ free_rec = NULL;
+ insert_buf = page_mem_alloc_heap(page, page_zip,
+ rec_size, &heap_no);
+
+ if (UNIV_UNLIKELY(insert_buf == NULL)) {
+ return(NULL);
+ }
+
+ page_zip_dir_add_slot(page_zip, dict_index_is_clust(index));
+ }
+
+ /* 3. Create the record */
+ insert_rec = rec_copy(insert_buf, rec, offsets);
+ rec_offs_make_valid(insert_rec, index, offsets);
+
+ /* 4. Insert the record in the linked list of records */
+ ut_ad(*current_rec != insert_rec);
+
+ {
+ /* next record after current before the insertion */
+ rec_t* next_rec = page_rec_get_next(*current_rec);
+ ut_ad(rec_get_status(*current_rec)
+ <= REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+ ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+ page_rec_set_next(insert_rec, next_rec);
+ page_rec_set_next(*current_rec, insert_rec);
+ }
+
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ 1 + page_get_n_recs(page));
+
+ /* 5. Set the n_owned field in the inserted record to zero,
+ and set the heap_no field */
+ rec_set_n_owned_new(insert_rec, NULL, 0);
+ rec_set_heap_no_new(insert_rec, heap_no);
+
+ UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+ rec_offs_size(offsets));
+
+ page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec);
+
+ /* 6. Update the last insertion info in page header */
+
+ last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
+ ut_ad(!last_insert
+ || rec_get_node_ptr_flag(last_insert)
+ == rec_get_node_ptr_flag(insert_rec));
+
+ if (UNIV_UNLIKELY(last_insert == NULL)) {
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+
+ } else if ((last_insert == *current_rec)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_LEFT)) {
+
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_RIGHT);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+
+ } else if ((page_rec_get_next(insert_rec) == last_insert)
+ && (page_header_get_field(page, PAGE_DIRECTION)
+ != PAGE_RIGHT)) {
+
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_LEFT);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION,
+ page_header_get_field(
+ page, PAGE_N_DIRECTION) + 1);
+ } else {
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+ }
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec);
+
+ /* 7. It remains to update the owner record. */
+ {
+ rec_t* owner_rec = page_rec_find_owner_rec(insert_rec);
+ ulint n_owned;
+
+ n_owned = rec_get_n_owned_new(owner_rec);
+ rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1);
+
+ /* 8. Now we have incremented the n_owned field of the owner
+ record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+ we have to split the corresponding directory slot in two. */
+
+ if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+ page_dir_split_slot(
+ page, page_zip,
+ page_dir_find_owner_slot(owner_rec));
+ }
+ }
+
+ page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
+
+ /* 9. Write log record of the insert */
+ if (UNIV_LIKELY(mtr != NULL)) {
+ page_cur_insert_rec_write_log(insert_rec, rec_size,
+ *current_rec, index, mtr);
+ }
+
+ return(insert_rec);
+}
+
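+/* A worked example of the free-list reuse rule in the routine above (the
+byte counts are illustrative, not taken from real data): suppose free_rec
+has extra size 10 and data size 20 (total 30), while the new record has
+extra size 8 and data size 20 (total 28). Then
+
+	extra_size_diff = 8 - 10 = -2,
+
+the guard rec_offs_size(foffsets) < rec_size - extra_size_diff evaluates
+to 30 < 30, i.e. false, so the free record is still usable, and
+insert_buf -= extra_size_diff moves insert_buf forward by 2 bytes so that
+insert_buf + 8 coincides with the old origin of free_rec. The reused
+record thus keeps the same origin, and only the unused bytes in front of
+it are wasted. If the new record needed a larger extra size than free_rec,
+the code falls back to allocating from the heap instead (goto too_small). */
+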
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Writes a log record of copying a record list end to a newly created page.
+@return 4-byte field where to write the log data length, or NULL if
+logging is disabled */
+UNIV_INLINE
+byte*
+page_copy_rec_list_to_created_page_write_log(
+/*=========================================*/
+ page_t* page, /*!< in: index page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, page, index,
+ page_is_comp(page)
+ ? MLOG_COMP_LIST_END_COPY_CREATED
+ : MLOG_LIST_END_COPY_CREATED, 4);
+ if (UNIV_LIKELY(log_ptr != NULL)) {
+ mlog_close(mtr, log_ptr + 4);
+ }
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Parses a log record of copying a record list end to a newly created page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ byte* rec_end;
+ ulint log_data_len;
+ page_t* page;
+ page_zip_des_t* page_zip;
+
+ if (ptr + 4 > end_ptr) {
+
+ return(NULL);
+ }
+
+ log_data_len = mach_read_from_4(ptr);
+ ptr += 4;
+
+ rec_end = ptr + log_data_len;
+
+ if (rec_end > end_ptr) {
+
+ return(NULL);
+ }
+
+ if (!block) {
+
+ return(rec_end);
+ }
+
+ while (ptr < rec_end) {
+ ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr,
+ block, index, mtr);
+ }
+
+ ut_a(ptr == rec_end);
+
+ page = buf_block_get_frame(block);
+ page_zip = buf_block_get_page_zip(block);
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+ page_header_set_field(page, page_zip, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+
+ return(rec_end);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+ page_t* new_page, /*!< in/out: index page to copy to */
+ rec_t* rec, /*!< in: first record to copy */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_dir_slot_t* slot = 0; /* remove warning */
+ byte* heap_top;
+ rec_t* insert_rec = 0; /* remove warning */
+ rec_t* prev_rec;
+ ulint count;
+ ulint n_recs;
+ ulint slot_index;
+ ulint rec_size;
+ ulint log_mode;
+ byte* log_ptr;
+ ulint log_data_len;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
+ ut_ad(page_align(rec) != new_page);
+ ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page));
+
+ if (page_rec_is_infimum(rec)) {
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ /* To pass the debug tests we have to set these dummy values
+ in the debug version */
+ page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2);
+ page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP,
+ new_page + UNIV_PAGE_SIZE - 1);
+#endif
+
+ log_ptr = page_copy_rec_list_to_created_page_write_log(new_page,
+ index, mtr);
+
+ log_data_len = dyn_array_get_data_size(&(mtr->log));
+
+ /* Individual inserts are logged in a shorter form */
+
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS);
+
+ prev_rec = page_get_infimum_rec(new_page);
+ if (page_is_comp(new_page)) {
+ heap_top = new_page + PAGE_NEW_SUPREMUM_END;
+ } else {
+ heap_top = new_page + PAGE_OLD_SUPREMUM_END;
+ }
+ count = 0;
+ slot_index = 0;
+ n_recs = 0;
+
+ do {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ insert_rec = rec_copy(heap_top, rec, offsets);
+
+ if (page_is_comp(new_page)) {
+ rec_set_next_offs_new(prev_rec,
+ page_offset(insert_rec));
+
+ rec_set_n_owned_new(insert_rec, NULL, 0);
+ rec_set_heap_no_new(insert_rec,
+ PAGE_HEAP_NO_USER_LOW + n_recs);
+ } else {
+ rec_set_next_offs_old(prev_rec,
+ page_offset(insert_rec));
+
+ rec_set_n_owned_old(insert_rec, 0);
+ rec_set_heap_no_old(insert_rec,
+ PAGE_HEAP_NO_USER_LOW + n_recs);
+ }
+
+ count++;
+ n_recs++;
+
+ if (UNIV_UNLIKELY
+ (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) {
+
+ slot_index++;
+
+ slot = page_dir_get_nth_slot(new_page, slot_index);
+
+ page_dir_slot_set_rec(slot, insert_rec);
+ page_dir_slot_set_n_owned(slot, NULL, count);
+
+ count = 0;
+ }
+
+ rec_size = rec_offs_size(offsets);
+
+ ut_ad(heap_top < new_page + UNIV_PAGE_SIZE);
+
+ heap_top += rec_size;
+
+ page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
+ index, mtr);
+ prev_rec = insert_rec;
+ rec = page_rec_get_next(rec);
+ } while (!page_rec_is_supremum(rec));
+
+ if ((slot_index > 0) && (count + 1
+ + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2
+ <= PAGE_DIR_SLOT_MAX_N_OWNED)) {
+		/* We can merge the two last dir slots. This operation is
+		here to make this function imitate exactly the equivalent
+		task performed using page_cur_insert_rec, which is used in
+		database recovery to reproduce the work of this function.
+		To be able to check the correctness of recovery, it is good
+		that the imitation is exact. */
+
+ count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+ page_dir_slot_set_n_owned(slot, NULL, 0);
+
+ slot_index--;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len;
+
+ ut_a(log_data_len < 100 * UNIV_PAGE_SIZE);
+
+ if (UNIV_LIKELY(log_ptr != NULL)) {
+ mach_write_to_4(log_ptr, log_data_len);
+ }
+
+ if (page_is_comp(new_page)) {
+ rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM);
+ }
+
+ slot = page_dir_get_nth_slot(new_page, 1 + slot_index);
+
+ page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page));
+ page_dir_slot_set_n_owned(slot, NULL, count + 1);
+
+ page_dir_set_n_slots(new_page, NULL, 2 + slot_index);
+ page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top);
+ page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs);
+ page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs);
+
+ page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL);
+ page_header_set_field(new_page, NULL, PAGE_DIRECTION,
+ PAGE_NO_DIRECTION);
+ page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0);
+
+ /* Restore the log mode */
+
+ mtr_set_log_mode(mtr, log_mode);
+}
+
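+/* A worked example of the directory-slot bookkeeping in the bulk copy
+above (the record counts are illustrative): with PAGE_DIR_SLOT_MAX_N_OWNED
+= 8, a new slot is assigned after every (8 + 1) / 2 = 4 copied records, so
+each interior slot initially owns exactly 4 records. If the loop ends with
+count = 2 unassigned records, the supremum slot would own 2 + 1 = 3
+records; since 2 + 1 + 4 <= 8, the last full slot is merged into it and
+the supremum slot ends up owning 4 + 2 + 1 = 7 records, which is intended
+to match what a sequence of page_cur_insert_rec() calls produces during
+recovery (see the comment above). */
+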
+/***********************************************************//**
+Writes a log record of a record delete on a page. */
+UNIV_INLINE
+void
+page_cur_delete_rec_write_log(
+/*==========================*/
+ rec_t* rec, /*!< in: record to be deleted */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ byte* log_ptr;
+
+ ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index,
+ page_rec_is_comp(rec)
+ ? MLOG_COMP_REC_DELETE
+ : MLOG_REC_DELETE, 2);
+
+ if (!log_ptr) {
+ /* Logging in mtr is switched off during crash recovery:
+ in that case mlog_open returns NULL */
+ return;
+ }
+
+ /* Write the cursor rec offset as a 2-byte ulint */
+ mach_write_to_2(log_ptr, page_offset(rec));
+
+ mlog_close(mtr, log_ptr + 2);
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_cur_delete_rec_write_log(rec,index,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a log record of a record delete on a page.
+@return pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in: page or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint offset;
+ page_cur_t cursor;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ /* Read the cursor rec offset as a 2-byte ulint */
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ ut_a(offset <= UNIV_PAGE_SIZE);
+
+ if (block) {
+ page_t* page = buf_block_get_frame(block);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_t* rec = page + offset;
+ rec_offs_init(offsets_);
+
+ page_cur_position(rec, block, &cursor);
+ ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
+
+ page_cur_delete_rec(&cursor, index,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ return(ptr);
+}
+
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the next
+record after the deleted one. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+ page_cur_t* cursor, /*!< in/out: a page cursor */
+ dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ page_dir_slot_t* cur_dir_slot;
+ page_dir_slot_t* prev_slot;
+ page_t* page;
+ page_zip_des_t* page_zip;
+ rec_t* current_rec;
+ rec_t* prev_rec = NULL;
+ rec_t* next_rec;
+ ulint cur_slot_no;
+ ulint cur_n_owned;
+ rec_t* rec;
+
+ ut_ad(cursor && mtr);
+
+ page = page_cur_get_page(cursor);
+ page_zip = page_cur_get_page_zip(cursor);
+
+ /* page_zip_validate() will fail here when
+ btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
+ Then, both "page_zip" and "page" would have the min-rec-mark
+ set on the smallest user record, but "page" would additionally
+ have it set on the smallest-but-one record. Because sloppy
+ page_zip_validate_low() only ignores min-rec-flag differences
+ in the smallest user record, it cannot be used here either. */
+
+ current_rec = cursor->rec;
+ ut_ad(rec_offs_validate(current_rec, index, offsets));
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ /* The record must not be the supremum or infimum record. */
+ ut_ad(page_rec_is_user_rec(current_rec));
+
+ /* Save to local variables some data associated with current_rec */
+ cur_slot_no = page_dir_find_owner_slot(current_rec);
+ cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no);
+ cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
+
+ /* 0. Write the log record */
+ page_cur_delete_rec_write_log(current_rec, index, mtr);
+
+ /* 1. Reset the last insert info in the page header and increment
+ the modify clock for the frame */
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+
+ /* The page gets invalid for optimistic searches: increment the
+ frame modify clock */
+
+ buf_block_modify_clock_inc(page_cur_get_block(cursor));
+
+ /* 2. Find the next and the previous record. Note that the cursor is
+ left at the next record. */
+
+ ut_ad(cur_slot_no > 0);
+ prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1);
+
+ rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+ /* rec now points to the record of the previous directory slot. Look
+ for the immediate predecessor of current_rec in a loop. */
+
+	while (current_rec != rec) {
+ prev_rec = rec;
+ rec = page_rec_get_next(rec);
+ }
+
+ page_cur_move_to_next(cursor);
+ next_rec = cursor->rec;
+
+ /* 3. Remove the record from the linked list of records */
+
+ page_rec_set_next(prev_rec, next_rec);
+
+ /* 4. If the deleted record is pointed to by a dir slot, update the
+ record pointer in slot. In the following if-clause we assume that
+ prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED
+ >= 2. */
+
+#if PAGE_DIR_SLOT_MIN_N_OWNED < 2
+# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2"
+#endif
+ ut_ad(cur_n_owned > 1);
+
+ if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) {
+ page_dir_slot_set_rec(cur_dir_slot, prev_rec);
+ }
+
+ /* 5. Update the number of owned records of the slot */
+
+ page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
+
+ /* 6. Free the memory occupied by the record */
+ page_mem_free(page, page_zip, current_rec, index, offsets);
+
+ /* 7. Now we have decremented the number of owned records of the slot.
+ If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
+ slots. */
+
+ if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) {
+ page_dir_balance_slot(page, page_zip, cur_slot_no);
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/*******************************************************************//**
+Print the first n numbers generated by page_cur_lcg_prng(), so that one can
+check (visually) that it works properly. */
+void
+test_page_cur_lcg_prng(
+/*===================*/
+ int n) /*!< in: print first n numbers */
+{
+ int i;
+ unsigned long long rnd;
+
+ for (i = 0; i < n; i++) {
+ rnd = page_cur_lcg_prng();
+ printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n",
+ rnd,
+ rnd % 2,
+ rnd % 3,
+ rnd % 5,
+ rnd % 7,
+ rnd % 11);
+ }
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c
new file mode 100644
index 00000000000..10008f9ac25
--- /dev/null
+++ b/storage/xtradb/page/page0page.c
@@ -0,0 +1,2624 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0page.c
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#define THIS_MODULE
+#include "page0page.h"
+#ifdef UNIV_NONINL
+#include "page0page.ic"
+#endif
+#undef THIS_MODULE
+
+#include "page0cur.h"
+#include "page0zip.h"
+#include "buf0buf.h"
+#include "btr0btr.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "lock0lock.h"
+# include "fut0lst.h"
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/* THE INDEX PAGE
+ ==============
+
+The index page consists of a page header which contains the page's
+id and other information. On top of it are the index records
+in a heap, linked into a one-way linear list in alphabetical order.
+
+Just below page end is an array of pointers which we call page directory,
+to about every sixth record in the list. The pointers are placed in
+the directory in the alphabetical order of the records pointed to,
+enabling us to perform a binary search using the array. Each slot number I
+in the directory points to a record, where a 4-bit field contains a count
+of those records which are in the linear list between pointer I and
+the pointer I - 1 in the directory, including the record
+pointed to by pointer I and not including the record pointed to by I - 1.
+We say that the record pointed to by slot I, or that slot I, owns
+these records. The count is always kept in the range 4 to 8, with
+the exception that it is 1 for the first slot, and 1--8 for the last slot.
+
+An essentially binary search can be performed in the list of index
+records, almost as efficiently as if we had a pointer to every record in the
+page directory. The data structure is, however, more efficient when
+we are doing inserts, because most inserts are just pushed on a heap.
+Only every 8th insert requires block move in the directory pointer
+table, which itself is quite small. A record is deleted from the page
+by just taking it off the linear list and updating the number of owned
+records-field of the record which owns it, and updating the page directory,
+if necessary. A special case is the one when the record owns itself.
+Because the overhead of inserts is so small, we may also increase the
+page size from the projected default of 8 kB to 64 kB without too
+much loss of efficiency in inserts. A bigger page becomes practical
+as the disk transfer rate rises relative to seek and rotational latency.
+On the present system, the page size is set so that the page transfer
+time (3 ms) is 20 % of the disk random access time (15 ms).
+
+When the page is split, merged, or becomes full but contains deleted
+records, we have to reorganize the page.
+
+Assuming a page size of 8 kB, a typical index page of a secondary
+index contains 300 index entries, and the size of the page directory
+is 50 x 4 bytes = 200 bytes. */
+
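+/* A worked example of the directory arithmetic described above (the
+numbers are illustrative): with each slot owning between 4 and 8 records,
+a slot owns about 6 records on average, so a page holding roughly 300 user
+records needs about 300 / 6 = 50 directory slots (200 bytes, as noted
+above). A lookup binary-searches those ~50 slots, about log2(50) ~= 6
+comparisons, and then scans at most 8 records along the singly linked
+record list. In outline (cmp() and the slot/record accessors below are
+placeholders, not functions of this file):
+
+	low = 0; up = n_slots - 1;
+	while (up - low > 1) {
+		mid = (low + up) / 2;
+		if (cmp(key, rec_of_slot(mid)) >= 0) {
+			low = mid;
+		} else {
+			up = mid;
+		}
+	}
+	rec = rec_of_slot(low);
+	while (cmp(key, rec) > 0) {
+		rec = next_rec(rec);
+	}
+*/
+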
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ const page_t* page;
+ register uint16 rec_offs_bytes;
+ register const page_dir_slot_t* slot;
+ register const page_dir_slot_t* first_slot;
+ register const rec_t* r = rec;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+ first_slot = page_dir_get_nth_slot(page, 0);
+ slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1);
+
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(r) == 0) {
+ r = rec_get_next_ptr_const(r, TRUE);
+ ut_ad(r >= page + PAGE_NEW_SUPREMUM);
+ ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+ }
+ } else {
+ while (rec_get_n_owned_old(r) == 0) {
+ r = rec_get_next_ptr_const(r, FALSE);
+ ut_ad(r >= page + PAGE_OLD_SUPREMUM);
+ ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+ }
+ }
+
+ rec_offs_bytes = mach_encode_2(r - page);
+
+ while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
+
+ if (UNIV_UNLIKELY(slot == first_slot)) {
+ fprintf(stderr,
+ "InnoDB: Probable data corruption on"
+ " page %lu\n"
+ "InnoDB: Original record ",
+ (ulong) page_get_page_no(page));
+
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, rec);
+ }
+
+ fputs("\n"
+ "InnoDB: on that page.\n"
+ "InnoDB: Cannot find the dir slot for record ",
+ stderr);
+ if (page_is_comp(page)) {
+ fputs("(compact record)", stderr);
+ } else {
+ rec_print_old(stderr, page
+ + mach_decode_2(rec_offs_bytes));
+ }
+ fputs("\n"
+ "InnoDB: on that page!\n", stderr);
+
+ buf_page_print(page, 0);
+
+ ut_error;
+ }
+
+ slot += PAGE_DIR_SLOT_SIZE;
+ }
+
+ return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
+}
+
+/**************************************************************//**
+Used to check the consistency of a directory slot.
+@return TRUE on success */
+static
+ibool
+page_dir_slot_check(
+/*================*/
+ page_dir_slot_t* slot) /*!< in: slot */
+{
+ page_t* page;
+ ulint n_slots;
+ ulint n_owned;
+
+ ut_a(slot);
+
+ page = page_align(slot);
+
+ n_slots = page_dir_get_n_slots(page);
+
+ ut_a(slot <= page_dir_get_nth_slot(page, 0));
+ ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+ ut_a(page_rec_check(page_dir_slot_get_rec(slot)));
+
+ if (page_is_comp(page)) {
+ n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot));
+ } else {
+ n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot));
+ }
+
+ if (slot == page_dir_get_nth_slot(page, 0)) {
+ ut_a(n_owned == 1);
+ } else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+ ut_a(n_owned >= 1);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ } else {
+ ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+ ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ }
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Sets the max trx id field value. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+ buf_block_t* block, /*!< in/out: page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in/out: mini-transaction, or NULL */
+{
+ page_t* page = buf_block_get_frame(block);
+#ifndef UNIV_HOTBACKUP
+ const ibool is_hashed = block->is_hashed;
+
+ if (is_hashed) {
+ rw_lock_x_lock(&btr_search_latch);
+ }
+
+ ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#endif /* !UNIV_HOTBACKUP */
+
+ /* It is not necessary to write this change to the redo log, as
+ during a database recovery we assume that the max trx id of every
+ page is the maximum trx id assigned before the crash. */
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
+ page_zip_write_header(page_zip,
+ page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ 8, mtr);
+#ifndef UNIV_HOTBACKUP
+ } else if (mtr) {
+ mlog_write_dulint(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+ trx_id, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ } else {
+ mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
+ }
+
+#ifndef UNIV_HOTBACKUP
+ if (is_hashed) {
+ rw_lock_x_unlock(&btr_search_latch);
+ }
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/************************************************************//**
+Allocates a block of memory from the heap of an index page.
+@return pointer to start of allocated buffer, or NULL if allocation fails */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page with enough
+ space available for inserting the record,
+ or NULL */
+ ulint need, /*!< in: total number of bytes needed */
+ ulint* heap_no)/*!< out: this contains the heap number
+ of the allocated record
+ if allocation succeeds */
+{
+ byte* block;
+ ulint avl_space;
+
+ ut_ad(page && heap_no);
+
+ avl_space = page_get_max_insert_size(page, 1);
+
+ if (avl_space >= need) {
+ block = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP,
+ block + need);
+ *heap_no = page_dir_get_n_heap(page);
+
+ page_dir_set_n_heap(page, page_zip, 1 + *heap_no);
+
+ return(block);
+ }
+
+ return(NULL);
+}
+
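+/* A minimal usage sketch for page_mem_alloc_heap(), mirroring the way the
+insert routines in page0cur.c call it (variable names as used there):
+
+	insert_buf = page_mem_alloc_heap(page, page_zip, rec_size, &heap_no);
+
+	if (insert_buf == NULL) {
+		return(NULL);
+	}
+
+On success the heap top is advanced by rec_size bytes, heap_no receives
+the heap number of the new record, and the PAGE_N_HEAP count in the page
+header is incremented. */
+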
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Writes a log record of page creation. */
+UNIV_INLINE
+void
+page_create_write_log(
+/*==================*/
+ buf_frame_t* frame, /*!< in: a buffer frame where the page is
+ created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ibool comp) /*!< in: TRUE=compact page format */
+{
+ mlog_write_initial_log_record(frame, comp
+ ? MLOG_COMP_PAGE_CREATE
+ : MLOG_PAGE_CREATE, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_create_write_log(frame,mtr,comp) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of creating a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ ulint comp, /*!< in: nonzero=compact page format */
+ buf_block_t* block, /*!< in: block or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ /* The record is empty, except for the record initial part */
+
+ if (block) {
+ page_create(block, mtr, comp);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************//**
+The index page creation function.
+@return pointer to the page */
+static
+page_t*
+page_create_low(
+/*============*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ page_dir_slot_t* slot;
+ mem_heap_t* heap;
+ dtuple_t* tuple;
+ dfield_t* field;
+ byte* heap_top;
+ rec_t* infimum_rec;
+ rec_t* supremum_rec;
+ page_t* page;
+ dict_index_t* index;
+ ulint* offsets;
+
+ ut_ad(block);
+#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA
+# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA"
+#endif
+#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA
+# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA"
+#endif
+
+ /* The infimum and supremum records use a dummy index. */
+ if (UNIV_LIKELY(comp)) {
+ index = dict_ind_compact;
+ } else {
+ index = dict_ind_redundant;
+ }
+
+ /* 1. INCREMENT MODIFY CLOCK */
+ buf_block_modify_clock_inc(block);
+
+ page = buf_block_get_frame(block);
+
+ fil_page_set_type(page, FIL_PAGE_INDEX);
+
+ heap = mem_heap_create(200);
+
+ /* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */
+
+ /* Create first a data tuple for infimum record */
+ tuple = dtuple_create(heap, 1);
+ dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM);
+ field = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(field, "infimum", 8);
+ dtype_set(dfield_get_type(field),
+ DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8);
+ /* Set the corresponding physical record to its place in the page
+ record heap */
+
+ heap_top = page + PAGE_DATA;
+
+ infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0);
+
+ if (UNIV_LIKELY(comp)) {
+ ut_a(infimum_rec == page + PAGE_NEW_INFIMUM);
+
+ rec_set_n_owned_new(infimum_rec, NULL, 1);
+ rec_set_heap_no_new(infimum_rec, 0);
+ } else {
+ ut_a(infimum_rec == page + PAGE_OLD_INFIMUM);
+
+ rec_set_n_owned_old(infimum_rec, 1);
+ rec_set_heap_no_old(infimum_rec, 0);
+ }
+
+ offsets = rec_get_offsets(infimum_rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ heap_top = rec_get_end(infimum_rec, offsets);
+
+ /* Create then a tuple for supremum */
+
+ tuple = dtuple_create(heap, 1);
+ dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM);
+ field = dtuple_get_nth_field(tuple, 0);
+
+ dfield_set_data(field, "supremum", comp ? 8 : 9);
+ dtype_set(dfield_get_type(field),
+ DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9);
+
+ supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0);
+
+ if (UNIV_LIKELY(comp)) {
+ ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM);
+
+ rec_set_n_owned_new(supremum_rec, NULL, 1);
+ rec_set_heap_no_new(supremum_rec, 1);
+ } else {
+ ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM);
+
+ rec_set_n_owned_old(supremum_rec, 1);
+ rec_set_heap_no_old(supremum_rec, 1);
+ }
+
+ offsets = rec_get_offsets(supremum_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ heap_top = rec_get_end(supremum_rec, offsets);
+
+ ut_ad(heap_top == page
+ + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END));
+
+ mem_heap_free(heap);
+
+ /* 4. INITIALIZE THE PAGE */
+
+ page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2);
+ page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top);
+ page_header_set_field(page, NULL, PAGE_N_HEAP, comp
+ ? 0x8000 | PAGE_HEAP_NO_USER_LOW
+ : PAGE_HEAP_NO_USER_LOW);
+ page_header_set_ptr(page, NULL, PAGE_FREE, NULL);
+ page_header_set_field(page, NULL, PAGE_GARBAGE, 0);
+ page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL);
+ page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION);
+ page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+ page_header_set_field(page, NULL, PAGE_N_RECS, 0);
+ page_set_max_trx_id(block, NULL, ut_dulint_zero, NULL);
+ memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START
+ - page_offset(heap_top));
+
+ /* 5. SET POINTERS IN RECORDS AND DIR SLOTS */
+
+ /* Set the slots to point to infimum and supremum. */
+
+ slot = page_dir_get_nth_slot(page, 0);
+ page_dir_slot_set_rec(slot, infimum_rec);
+
+ slot = page_dir_get_nth_slot(page, 1);
+ page_dir_slot_set_rec(slot, supremum_rec);
+
+ /* Set the next pointers in infimum and supremum */
+
+ if (UNIV_LIKELY(comp)) {
+ rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM);
+ rec_set_next_offs_new(supremum_rec, 0);
+ } else {
+ rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM);
+ rec_set_next_offs_old(supremum_rec, 0);
+ }
+
+ return(page);
+}
+
+/**********************************************************//**
+Create an uncompressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+ buf_block_t* block, /*!< in: a buffer block where the
+ page is created */
+ mtr_t* mtr, /*!< in: mini-transaction handle */
+ ulint comp) /*!< in: nonzero=compact page format */
+{
+ page_create_write_log(buf_block_get_frame(block), mtr, comp);
+ return(page_create_low(block, comp));
+}
+
+/**********************************************************//**
+Create a compressed B-tree index page.
+@return pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+ buf_block_t* block, /*!< in/out: a buffer frame where the
+ page is created */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint level, /*!< in: the B-tree level of the page */
+ mtr_t* mtr) /*!< in: mini-transaction handle */
+{
+ page_t* page;
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+
+ ut_ad(block);
+ ut_ad(page_zip);
+ ut_ad(index);
+ ut_ad(dict_table_is_comp(index->table));
+
+ page = page_create_low(block, TRUE);
+ mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level);
+
+ if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+ /* The compression of a newly created page
+ should always succeed. */
+ ut_error;
+ }
+
+ return(page);
+}
+
+/*************************************************************//**
+Differs from page_copy_rec_list_end in that this function does not
+touch the lock table or the max trx id on the page, nor does it
+compress the page. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+ buf_block_t* new_block, /*!< in: index page to copy to */
+ buf_block_t* block, /*!< in: index page of rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_cur_t cur1;
+ rec_t* cur2;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ page_cur_position(rec, block, &cur1);
+
+ if (page_cur_is_before_first(&cur1)) {
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ ut_a((ibool)!!page_is_comp(new_page)
+ == dict_table_is_comp(index->table));
+ ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
+ ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint)
+ (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
+
+ cur2 = page_get_infimum_rec(buf_block_get_frame(new_block));
+
+ /* Copy records from the original page to the new page */
+
+ while (!page_cur_is_after_last(&cur1)) {
+ rec_t* cur1_rec = page_cur_get_rec(&cur1);
+ rec_t* ins_rec;
+ offsets = rec_get_offsets(cur1_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ins_rec = page_cur_insert_rec_low(cur2, index,
+ cur1_rec, offsets, mtr);
+ if (UNIV_UNLIKELY(!ins_rec)) {
+ /* Track an assertion failure reported on the mailing
+ list on June 18th, 2003 */
+
+ buf_page_print(new_page, 0);
+ buf_page_print(page_align(rec), 0);
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ "InnoDB: rec offset %lu, cur1 offset %lu,"
+ " cur2 offset %lu\n",
+ (ulong) page_offset(rec),
+ (ulong) page_offset(page_cur_get_rec(&cur1)),
+ (ulong) page_offset(cur2));
+ ut_error;
+ }
+
+ page_cur_move_to_next(&cur1);
+ cur2 = ins_rec;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Copies records from page to new_page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_t* page = page_align(rec);
+ rec_t* ret = page_rec_get_next(
+ page_get_infimum_rec(new_page));
+ ulint log_mode = 0; /* remove warning */
+
+#ifdef UNIV_ZIP_DEBUG
+ if (new_page_zip) {
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ ut_a(page_zip);
+
+ /* Strict page_zip_validate() may fail here.
+ Furthermore, btr_compress() may set FIL_PAGE_PREV to
+ FIL_NULL on new_page while leaving it intact on
+ new_page_zip. So, we cannot validate new_page_zip. */
+ ut_a(page_zip_validate_low(page_zip, page, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+ ut_ad(buf_block_get_frame(block) == page);
+ ut_ad(page_is_leaf(page) == page_is_leaf(new_page));
+ ut_ad(page_is_comp(page) == page_is_comp(new_page));
+ /* Here, "ret" may be pointing to a user record or the
+ predefined supremum record. */
+
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) {
+ page_copy_rec_list_end_to_created_page(new_page, rec,
+ index, mtr);
+ } else {
+ page_copy_rec_list_end_no_locks(new_block, block, rec,
+ index, mtr);
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page), mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (UNIV_UNLIKELY
+ (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ulint ret_pos
+ = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the successor of
+ the predefined infimum record. It must still
+ have at least one predecessor (the predefined
+ infimum record, or a freshly copied record
+ that is smaller than "ret"). */
+ ut_a(ret_pos > 0);
+
+ if (UNIV_UNLIKELY
+ (!page_zip_reorganize(new_block, index, mtr))) {
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress(new_page_zip,
+ new_page, FALSE))) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+ return(NULL);
+ } else {
+ /* The page was reorganized:
+ Seek to ret_pos. */
+ ret = new_page + PAGE_NEW_INFIMUM;
+
+ do {
+ ret = rec_get_next_ptr(ret, TRUE);
+ } while (--ret_pos);
+ }
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ lock_move_rec_list_end(new_block, block, rec);
+
+ btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+ return(ret);
+}
+
+/*************************************************************//**
+Copies records from page to new_page, up to the given record,
+NOT including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page to copy to */
+ buf_block_t* block, /*!< in: index page containing rec */
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ page_zip_des_t* new_page_zip = buf_block_get_page_zip(new_block);
+ page_cur_t cur1;
+ rec_t* cur2;
+ ulint log_mode = 0 /* remove warning */;
+ mem_heap_t* heap = NULL;
+ rec_t* ret
+ = page_rec_get_prev(page_get_supremum_rec(new_page));
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ /* Here, "ret" may be pointing to a user record or the
+ predefined infimum record. */
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(ret);
+ }
+
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+ }
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ cur2 = ret;
+
+ /* Copy records from the original page to the new page */
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ rec_t* cur1_rec = page_cur_get_rec(&cur1);
+ offsets = rec_get_offsets(cur1_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ cur2 = page_cur_insert_rec_low(cur2, index,
+ cur1_rec, offsets, mtr);
+ ut_a(cur2);
+
+ page_cur_move_to_next(&cur1);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Update PAGE_MAX_TRX_ID on the uncompressed page.
+ Modifications will be redo logged and copied to the compressed
+ page in page_zip_compress() or page_zip_reorganize() below. */
+ if (dict_index_is_sec_or_ibuf(index)
+ && page_is_leaf(page_align(rec))) {
+ page_update_max_trx_id(new_block, NULL,
+ page_get_max_trx_id(page_align(rec)),
+ mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(new_page_zip)) {
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (UNIV_UNLIKELY
+ (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+ /* Before trying to reorganize the page,
+ store the number of preceding records on the page. */
+ ulint ret_pos
+ = page_rec_get_n_recs_before(ret);
+ /* Before copying, "ret" was the predecessor
+ of the predefined supremum record. If it was
+ the predefined infimum record, then it would
+ still be the infimum. Thus, the assertion
+ ut_a(ret_pos > 0) would fail here. */
+
+ if (UNIV_UNLIKELY
+ (!page_zip_reorganize(new_block, index, mtr))) {
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress(new_page_zip,
+ new_page, FALSE))) {
+ ut_error;
+ }
+ ut_ad(page_validate(new_page, index));
+ return(NULL);
+ } else {
+ /* The page was reorganized:
+ Seek to ret_pos. */
+ ret = new_page + PAGE_NEW_INFIMUM;
+
+ do {
+ ret = rec_get_next_ptr(ret, TRUE);
+ } while (--ret_pos);
+ }
+ }
+ }
+
+ /* Update the lock table and possible hash index */
+
+ lock_move_rec_list_start(new_block, block, rec, ret);
+
+ btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+ return(ret);
+}
+
+/**********************************************************//**
+Writes a log record of a record list end or start deletion. */
+UNIV_INLINE
+void
+page_delete_rec_list_write_log(
+/*===========================*/
+ rec_t* rec, /*!< in: record on page */
+ dict_index_t* index, /*!< in: record descriptor */
+ byte type, /*!< in: operation type:
+ MLOG_LIST_END_DELETE, ... */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ ut_ad(type == MLOG_LIST_END_DELETE
+ || type == MLOG_LIST_START_DELETE
+ || type == MLOG_COMP_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_START_DELETE);
+
+ log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2);
+ if (log_ptr) {
+ /* Write the parameter as a 2-byte ulint */
+ mach_write_to_2(log_ptr, page_offset(rec));
+ mlog_close(mtr, log_ptr + 2);
+ }
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_delete_rec_list_write_log(rec,index,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Parses a log record of a record list end or start deletion.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+ byte type, /*!< in: MLOG_LIST_END_DELETE,
+ MLOG_LIST_START_DELETE,
+ MLOG_COMP_LIST_END_DELETE or
+ MLOG_COMP_LIST_START_DELETE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ buf_block_t* block, /*!< in/out: buffer block or NULL */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ page_t* page;
+ ulint offset;
+
+ ut_ad(type == MLOG_LIST_END_DELETE
+ || type == MLOG_LIST_START_DELETE
+ || type == MLOG_COMP_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_START_DELETE);
+
+ /* Read the record offset as a 2-byte ulint */
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (!block) {
+
+ return(ptr);
+ }
+
+ page = buf_block_get_frame(block);
+
+ ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ if (type == MLOG_LIST_END_DELETE
+ || type == MLOG_COMP_LIST_END_DELETE) {
+ page_delete_rec_list_end(page + offset, block, index,
+ ULINT_UNDEFINED, ULINT_UNDEFINED,
+ mtr);
+ } else {
+ page_delete_rec_list_start(page + offset, block, index, mtr);
+ }
+
+ return(ptr);
+}
+
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+ rec_t* rec, /*!< in: pointer to record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n_recs, /*!< in: number of records to delete,
+ or ULINT_UNDEFINED if not known */
+ ulint size, /*!< in: the sum of the sizes of the
+ records in the end of the chain to
+ delete, or ULINT_UNDEFINED if not known */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_dir_slot_t*slot;
+ ulint slot_index;
+ rec_t* last_rec;
+ rec_t* prev_rec;
+ ulint n_owned;
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ page_t* page = page_align(rec);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE);
+ ut_ad(!page_zip || page_rec_is_comp(rec));
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+ rec = page_rec_get_next(rec);
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ return;
+ }
+
+ /* Reset the last insert info in the page header and increment
+ the modify clock for the frame */
+
+ page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+
+ /* The page gets invalid for optimistic searches: increment the
+ frame modify clock */
+
+ buf_block_modify_clock_inc(block);
+
+ page_delete_rec_list_write_log(rec, index, page_is_comp(page)
+ ? MLOG_COMP_LIST_END_DELETE
+ : MLOG_LIST_END_DELETE, mtr);
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ ulint log_mode;
+
+ ut_a(page_is_comp(page));
+ /* Individual deletes are not logged */
+
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ do {
+ page_cur_t cur;
+ page_cur_position(rec, block, &cur);
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ rec = rec_get_next_ptr(rec, TRUE);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ page_cur_delete_rec(&cur, index, offsets, mtr);
+ } while (page_offset(rec) != PAGE_NEW_SUPREMUM);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Restore log mode */
+
+ mtr_set_log_mode(mtr, log_mode);
+ return;
+ }
+
+ prev_rec = page_rec_get_prev(rec);
+
+ last_rec = page_rec_get_prev(page_get_supremum_rec(page));
+
+ if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) {
+ rec_t* rec2 = rec;
+ /* Calculate the sum of sizes and the number of records */
+ size = 0;
+ n_recs = 0;
+
+ do {
+ ulint s;
+ offsets = rec_get_offsets(rec2, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ s = rec_offs_size(offsets);
+ ut_ad(rec2 - page + s - rec_offs_extra_size(offsets)
+ < UNIV_PAGE_SIZE);
+ ut_ad(size + s < UNIV_PAGE_SIZE);
+ size += s;
+ n_recs++;
+
+ rec2 = page_rec_get_next(rec2);
+ } while (!page_rec_is_supremum(rec2));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ ut_ad(size < UNIV_PAGE_SIZE);
+
+ /* Update the page directory; there is no need to balance the number
+ of the records owned by the supremum record, as it is allowed to be
+ less than PAGE_DIR_SLOT_MIN_N_OWNED */
+
+ if (page_is_comp(page)) {
+ rec_t* rec2 = rec;
+ ulint count = 0;
+
+ while (rec_get_n_owned_new(rec2) == 0) {
+ count++;
+
+ rec2 = rec_get_next_ptr(rec2, TRUE);
+ }
+
+ ut_ad(rec_get_n_owned_new(rec2) > count);
+
+ n_owned = rec_get_n_owned_new(rec2) - count;
+ slot_index = page_dir_find_owner_slot(rec2);
+ slot = page_dir_get_nth_slot(page, slot_index);
+ } else {
+ rec_t* rec2 = rec;
+ ulint count = 0;
+
+ while (rec_get_n_owned_old(rec2) == 0) {
+ count++;
+
+ rec2 = rec_get_next_ptr(rec2, FALSE);
+ }
+
+ ut_ad(rec_get_n_owned_old(rec2) > count);
+
+ n_owned = rec_get_n_owned_old(rec2) - count;
+ slot_index = page_dir_find_owner_slot(rec2);
+ slot = page_dir_get_nth_slot(page, slot_index);
+ }
+
+ page_dir_slot_set_rec(slot, page_get_supremum_rec(page));
+ page_dir_slot_set_n_owned(slot, NULL, n_owned);
+
+ page_dir_set_n_slots(page, NULL, slot_index + 1);
+
+ /* Remove the record chain segment from the record chain */
+ page_rec_set_next(prev_rec, page_get_supremum_rec(page));
+
+ /* Catenate the deleted chain segment to the page free list */
+
+ page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
+ page_header_set_ptr(page, NULL, PAGE_FREE, rec);
+
+ page_header_set_field(page, NULL, PAGE_GARBAGE, size
+ + page_header_get_field(page, PAGE_GARBAGE));
+
+ page_header_set_field(page, NULL, PAGE_N_RECS,
+ (ulint)(page_get_n_recs(page) - n_recs));
+}
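
For the uncompressed case the deleted tail is not freed record by record; it is unlinked in one step and only the page header totals change. A tiny model of that bookkeeping (hypothetical struct and names, for illustration only):

/* Models the PAGE_GARBAGE / PAGE_N_RECS updates at the end of
page_delete_rec_list_end(): the detached chain goes to the free list
and its bytes become reusable garbage. */
struct page_hdr_model {
	unsigned	n_recs;		/* PAGE_N_RECS */
	unsigned	garbage;	/* PAGE_GARBAGE, in bytes */
};

static void
account_deleted_tail(struct page_hdr_model *h,
		     unsigned n_deleted, unsigned bytes_deleted)
{
	h->garbage += bytes_deleted;
	h->n_recs -= n_deleted;
}
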
+
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+ rec_t* rec, /*!< in: record on page */
+ buf_block_t* block, /*!< in: buffer block of the page */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_cur_t cur1;
+ ulint log_mode;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ mem_heap_t* heap = NULL;
+ byte type;
+
+ rec_offs_init(offsets_);
+
+ ut_ad((ibool) !!page_rec_is_comp(rec)
+ == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* page_zip= buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+
+ /* page_zip_validate() would detect a min_rec_mark mismatch
+ in btr_page_split_and_insert()
+ between btr_attach_half_pages() and insert_page = ...
+ when btr_page_get_split_rec_to_left() holds
+ (direction == FSP_DOWN). */
+ ut_a(!page_zip || page_zip_validate_low(page_zip, page, TRUE));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+
+ return;
+ }
+
+ if (page_rec_is_comp(rec)) {
+ type = MLOG_COMP_LIST_START_DELETE;
+ } else {
+ type = MLOG_LIST_START_DELETE;
+ }
+
+ page_delete_rec_list_write_log(rec, index, type, mtr);
+
+ page_cur_set_before_first(block, &cur1);
+ page_cur_move_to_next(&cur1);
+
+ /* Individual deletes are not logged */
+
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+ while (page_cur_get_rec(&cur1) != rec) {
+ offsets = rec_get_offsets(page_cur_get_rec(&cur1), index,
+ offsets, ULINT_UNDEFINED, &heap);
+ page_cur_delete_rec(&cur1, index, offsets, mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Restore log mode */
+
+ mtr_set_log_mode(mtr, log_mode);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in: index page from where to move */
+ rec_t* split_rec, /*!< in: first record to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* new_page = buf_block_get_frame(new_block);
+ ulint old_data_size;
+ ulint new_data_size;
+ ulint old_n_recs;
+ ulint new_n_recs;
+
+ old_data_size = page_get_data_size(new_page);
+ old_n_recs = page_get_n_recs(new_page);
+#ifdef UNIV_ZIP_DEBUG
+ {
+ page_zip_des_t* new_page_zip
+ = buf_block_get_page_zip(new_block);
+ page_zip_des_t* page_zip
+ = buf_block_get_page_zip(block);
+ ut_a(!new_page_zip == !page_zip);
+ ut_a(!new_page_zip
+ || page_zip_validate(new_page_zip, new_page));
+ ut_a(!page_zip
+ || page_zip_validate(page_zip, page_align(split_rec)));
+ }
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ new_data_size = page_get_data_size(new_page);
+ new_n_recs = page_get_n_recs(new_page);
+
+ ut_ad(new_data_size >= old_data_size);
+
+ page_delete_rec_list_end(split_rec, block, index,
+ new_n_recs - old_n_recs,
+ new_data_size - old_data_size, mtr);
+
+ return(TRUE);
+}
+
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+@return TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+ buf_block_t* new_block, /*!< in/out: index page where to move */
+ buf_block_t* block, /*!< in/out: page containing split_rec */
+ rec_t* split_rec, /*!< in: first record not to move */
+ dict_index_t* index, /*!< in: record descriptor */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block,
+ split_rec, index, mtr))) {
+ return(FALSE);
+ }
+
+ page_delete_rec_list_start(split_rec, block, index, mtr);
+
+ return(TRUE);
+}
+
+/***********************************************************************//**
+This is a low-level operation used during database index creation
+to write the page number of a created B-tree into a data dictionary record. */
+UNIV_INTERN
+void
+page_rec_write_index_page_no(
+/*=========================*/
+ rec_t* rec, /*!< in: record to update */
+ ulint i, /*!< in: index of the field to update */
+ ulint page_no,/*!< in: value to write */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* data;
+ ulint len;
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ ut_ad(len == 4);
+
+ mlog_write_ulint(data, page_no, MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Used to delete a slot from the directory. This function also updates
+the n_owned fields in the records, so that the first slot after
+the deleted one inherits the records of the deleted slot. */
+UNIV_INLINE
+void
+page_dir_delete_slot(
+/*=================*/
+ page_t* page, /*!< in/out: the index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: slot to be deleted */
+{
+ page_dir_slot_t* slot;
+ ulint n_owned;
+ ulint i;
+ ulint n_slots;
+
+ ut_ad(!page_zip || page_is_comp(page));
+ ut_ad(slot_no > 0);
+ ut_ad(slot_no + 1 < page_dir_get_n_slots(page));
+
+ n_slots = page_dir_get_n_slots(page);
+
+ /* 1. Reset the n_owned fields of the slots to be
+ deleted */
+ slot = page_dir_get_nth_slot(page, slot_no);
+ n_owned = page_dir_slot_get_n_owned(slot);
+ page_dir_slot_set_n_owned(slot, page_zip, 0);
+
+ /* 2. Update the n_owned value of the first non-deleted slot */
+
+ slot = page_dir_get_nth_slot(page, slot_no + 1);
+ page_dir_slot_set_n_owned(slot, page_zip,
+ n_owned + page_dir_slot_get_n_owned(slot));
+
+ /* 3. Destroy the slot by copying slots */
+ for (i = slot_no + 1; i < n_slots; i++) {
+ rec_t* rec = (rec_t*)
+ page_dir_slot_get_rec(page_dir_get_nth_slot(page, i));
+ page_dir_slot_set_rec(page_dir_get_nth_slot(page, i - 1), rec);
+ }
+
+ /* 4. Zero out the last slot, which will be removed */
+ mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0);
+
+ /* 5. Update the page header */
+ page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1);
+}
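
Steps 3-4 above amount to shifting the remaining slots by one position and clearing the last one; on the real page the directory grows downwards from the page end, so "copying slots up" moves towards higher slot numbers, not higher addresses. A flat-array model of the same operation (illustrative, not InnoDB code):

#include <stdint.h>
#include <string.h>

static void
dir_delete_slot_model(uint16_t *slots, unsigned n_slots, unsigned slot_no)
{
	/* Shift the slots above slot_no down by one position... */
	memmove(&slots[slot_no], &slots[slot_no + 1],
		(n_slots - slot_no - 1) * sizeof slots[0]);
	/* ...and zero out the slot that is no longer in use. */
	slots[n_slots - 1] = 0;
}
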
+
+/**************************************************************//**
+Used to add a slot to the directory. Does not set the record pointer
+in the added slot or update n_owned values: this is the responsibility
+of the caller. */
+UNIV_INLINE
+void
+page_dir_add_slot(
+/*==============*/
+ page_t* page, /*!< in/out: the index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+ ulint start) /*!< in: the slot above which the new slots
+ are added */
+{
+ page_dir_slot_t* slot;
+ ulint n_slots;
+
+ n_slots = page_dir_get_n_slots(page);
+
+ ut_ad(start < n_slots - 1);
+
+ /* Update the page header */
+ page_dir_set_n_slots(page, page_zip, n_slots + 1);
+
+ /* Move slots up */
+ slot = page_dir_get_nth_slot(page, n_slots);
+ memmove(slot, slot + PAGE_DIR_SLOT_SIZE,
+ (n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE);
+}
+
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page whose
+ uncompressed part will be written, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+{
+ rec_t* rec;
+ page_dir_slot_t* new_slot;
+ page_dir_slot_t* prev_slot;
+ page_dir_slot_t* slot;
+ ulint i;
+ ulint n_owned;
+
+ ut_ad(page);
+ ut_ad(!page_zip || page_is_comp(page));
+ ut_ad(slot_no > 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ n_owned = page_dir_slot_get_n_owned(slot);
+ ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);
+
+ /* 1. We loop to find a record approximately in the middle of the
+ records owned by the slot. */
+
+ prev_slot = page_dir_get_nth_slot(page, slot_no - 1);
+ rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+ for (i = 0; i < n_owned / 2; i++) {
+ rec = page_rec_get_next(rec);
+ }
+
+ ut_ad(n_owned / 2 >= PAGE_DIR_SLOT_MIN_N_OWNED);
+
+ /* 2. We add one directory slot immediately below the slot to be
+ split. */
+
+ page_dir_add_slot(page, page_zip, slot_no - 1);
+
+ /* The added slot is now number slot_no, and the old slot is
+ now number slot_no + 1 */
+
+ new_slot = page_dir_get_nth_slot(page, slot_no);
+ slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+ /* 3. We store the appropriate values to the new slot. */
+
+ page_dir_slot_set_rec(new_slot, rec);
+ page_dir_slot_set_n_owned(new_slot, page_zip, n_owned / 2);
+
+ /* 4. Finally, we update the number of records field of the
+ original slot */
+
+ page_dir_slot_set_n_owned(slot, page_zip, n_owned - (n_owned / 2));
+}
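
The split arithmetic keeps both resulting slots within the allowed ownership range; a worked example, assuming the usual limits of 8 records per slot (max) and 4 (min):

#include <assert.h>

static void
split_slot_example(void)
{
	const unsigned max_owned = 8, min_owned = 4;	/* assumed limits */
	unsigned n_owned = max_owned + 1;		/* overfull slot: 9 */
	unsigned new_owned = n_owned / 2;		/* new slot gets 4 */
	unsigned old_owned = n_owned - new_owned;	/* old slot keeps 5 */

	assert(new_owned >= min_owned);
	assert(old_owned >= min_owned && old_owned <= max_owned);
}
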
+
+/*************************************************************//**
+Tries to balance the given directory slot with too few records with the upper
+neighbor, so that there are at least the minimum number of records owned by
+the slot; this may result in the merging of two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+ page_t* page, /*!< in/out: index page */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ ulint slot_no)/*!< in: the directory slot */
+{
+ page_dir_slot_t* slot;
+ page_dir_slot_t* up_slot;
+ ulint n_owned;
+ ulint up_n_owned;
+ rec_t* old_rec;
+ rec_t* new_rec;
+
+ ut_ad(page);
+ ut_ad(!page_zip || page_is_comp(page));
+ ut_ad(slot_no > 0);
+
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ /* The last directory slot cannot be balanced with the upper
+ neighbor, as there is none. */
+
+ if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) {
+
+ return;
+ }
+
+ up_slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+ n_owned = page_dir_slot_get_n_owned(slot);
+ up_n_owned = page_dir_slot_get_n_owned(up_slot);
+
+ ut_ad(n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+ /* If the upper slot has the minimum value of n_owned, we will merge
+ the two slots, therefore we assert: */
+ ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED);
+
+ if (up_n_owned > PAGE_DIR_SLOT_MIN_N_OWNED) {
+
+ /* In this case we can just transfer one record owned
+ by the upper slot to the property of the lower slot */
+ old_rec = (rec_t*) page_dir_slot_get_rec(slot);
+
+ if (page_is_comp(page)) {
+ new_rec = rec_get_next_ptr(old_rec, TRUE);
+
+ rec_set_n_owned_new(old_rec, page_zip, 0);
+ rec_set_n_owned_new(new_rec, page_zip, n_owned + 1);
+ } else {
+ new_rec = rec_get_next_ptr(old_rec, FALSE);
+
+ rec_set_n_owned_old(old_rec, 0);
+ rec_set_n_owned_old(new_rec, n_owned + 1);
+ }
+
+ page_dir_slot_set_rec(slot, new_rec);
+
+		page_dir_slot_set_n_owned(up_slot, page_zip, up_n_owned - 1);
+ } else {
+ /* In this case we may merge the two slots */
+ page_dir_delete_slot(page, page_zip, slot_no);
+ }
+}
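
The assertion above guarantees that the merge in the second branch cannot overflow the upper slot; a numeric illustration with the same assumed limits (min 4, max 8):

#include <assert.h>

static void
balance_slot_example(void)
{
	const unsigned min_owned = 4, max_owned = 8;	/* assumed limits */
	unsigned n_owned = min_owned - 1;		/* underfull slot: 3 */

	/* Branch 1: the upper slot can spare one record. */
	unsigned up_n_owned = 6;
	assert(n_owned + 1 == min_owned);		/* 3 + 1 = 4 */
	assert(up_n_owned - 1 >= min_owned);		/* 6 - 1 = 5 */

	/* Branch 2: the upper slot is at the minimum, so the slots merge. */
	up_n_owned = min_owned;
	assert(n_owned + up_n_owned <= max_owned);	/* 3 + 4 = 7 <= 8 */
}
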
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Returns the middle record of the record list. If there are an even number
+of records in the list, returns the first record of the upper half-list.
+@return middle record */
+UNIV_INTERN
+rec_t*
+page_get_middle_rec(
+/*================*/
+ page_t* page) /*!< in: page */
+{
+ page_dir_slot_t* slot;
+ ulint middle;
+ ulint i;
+ ulint n_owned;
+ ulint count;
+ rec_t* rec;
+
+ /* This many records we must leave behind */
+ middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2;
+
+ count = 0;
+
+ for (i = 0;; i++) {
+
+ slot = page_dir_get_nth_slot(page, i);
+ n_owned = page_dir_slot_get_n_owned(slot);
+
+ if (count + n_owned > middle) {
+ break;
+ } else {
+ count += n_owned;
+ }
+ }
+
+ ut_ad(i > 0);
+ slot = page_dir_get_nth_slot(page, i - 1);
+ rec = (rec_t*) page_dir_slot_get_rec(slot);
+ rec = page_rec_get_next(rec);
+
+ /* There are now count records behind rec */
+
+ for (i = 0; i < middle - count; i++) {
+ rec = page_rec_get_next(rec);
+ }
+
+ return(rec);
+}
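
The "records to leave behind" count includes the infimum and supremum (PAGE_HEAP_NO_USER_LOW is 2), so the midpoint is taken over the whole record chain. A small sketch of the arithmetic:

#include <assert.h>

static unsigned
middle_count(unsigned n_user_recs)
{
	const unsigned heap_no_user_low = 2;	/* infimum + supremum */

	return (n_user_recs + heap_no_user_low) / 2;
}

/* e.g. middle_count(10) == 6: the infimum and the first 5 user records
are left behind, so the 6th user record (the first record of the upper
half-list) is returned. */
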
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Returns the number of records before the given record in the chain.
+The number includes infimum and supremum records.
+@return number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+ const rec_t* rec) /*!< in: the physical record */
+{
+ const page_dir_slot_t* slot;
+ const rec_t* slot_rec;
+ const page_t* page;
+ ulint i;
+ lint n = 0;
+
+ ut_ad(page_rec_check(rec));
+
+ page = page_align(rec);
+ if (page_is_comp(page)) {
+ while (rec_get_n_owned_new(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, TRUE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += rec_get_n_owned_new(slot_rec);
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ } else {
+ while (rec_get_n_owned_old(rec) == 0) {
+
+ rec = rec_get_next_ptr_const(rec, FALSE);
+ n--;
+ }
+
+ for (i = 0; ; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ slot_rec = page_dir_slot_get_rec(slot);
+
+ n += rec_get_n_owned_old(slot_rec);
+
+ if (rec == slot_rec) {
+
+ break;
+ }
+ }
+ }
+
+ n--;
+
+ ut_ad(n >= 0);
+
+ return((ulint) n);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Prints record contents, including the data that is relevant only in
+the index page context. */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: record descriptor */
+{
+ ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+ rec_print_new(stderr, rec, offsets);
+ if (page_rec_is_comp(rec)) {
+ fprintf(stderr,
+ " n_owned: %lu; heap_no: %lu; next rec: %lu\n",
+ (ulong) rec_get_n_owned_new(rec),
+ (ulong) rec_get_heap_no_new(rec),
+ (ulong) rec_get_next_offs(rec, TRUE));
+ } else {
+ fprintf(stderr,
+ " n_owned: %lu; heap_no: %lu; next rec: %lu\n",
+ (ulong) rec_get_n_owned_old(rec),
+ (ulong) rec_get_heap_no_old(rec),
+ (ulong) rec_get_next_offs(rec, TRUE));
+ }
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+ page_t* page, /*!< in: index page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ ulint n;
+ ulint i;
+ page_dir_slot_t* slot;
+
+ n = page_dir_get_n_slots(page);
+
+ fprintf(stderr, "--------------------------------\n"
+ "PAGE DIRECTORY\n"
+ "Page address %p\n"
+ "Directory stack top at offs: %lu; number of slots: %lu\n",
+ page, (ulong) page_offset(page_dir_get_nth_slot(page, n - 1)),
+ (ulong) n);
+ for (i = 0; i < n; i++) {
+ slot = page_dir_get_nth_slot(page, i);
+ if ((i == pr_n) && (i < n - pr_n)) {
+ fputs(" ... \n", stderr);
+ }
+ if ((i < pr_n) || (i >= n - pr_n)) {
+ fprintf(stderr,
+ "Contents of slot: %lu: n_owned: %lu,"
+ " rec offs: %lu\n",
+ (ulong) i,
+ (ulong) page_dir_slot_get_n_owned(slot),
+ (ulong)
+ page_offset(page_dir_slot_get_rec(slot)));
+ }
+ }
+ fprintf(stderr, "Total of %lu records\n"
+ "--------------------------------\n",
+ (ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page)));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint pr_n) /*!< in: print n first and n last entries */
+{
+ page_t* page = block->frame;
+ page_cur_t cur;
+ ulint count;
+ ulint n_recs;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE RECORD LIST\n"
+ "Page address %p\n", page);
+
+ n_recs = page_get_n_recs(page);
+
+ page_cur_set_before_first(block, &cur);
+ count = 0;
+ for (;;) {
+ offsets = rec_get_offsets(cur.rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+
+ if (count == pr_n) {
+ break;
+ }
+ if (page_cur_is_after_last(&cur)) {
+ break;
+ }
+ page_cur_move_to_next(&cur);
+ count++;
+ }
+
+ if (n_recs > 2 * pr_n) {
+ fputs(" ... \n", stderr);
+ }
+
+ while (!page_cur_is_after_last(&cur)) {
+ page_cur_move_to_next(&cur);
+
+ if (count + pr_n >= n_recs) {
+ offsets = rec_get_offsets(cur.rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ page_rec_print(cur.rec, offsets);
+ }
+ count++;
+ }
+
+ fprintf(stderr,
+ "Total of %lu records \n"
+ "--------------------------------\n",
+ (ulong) (count + 1));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Prints the info in a page header. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+ const page_t* page)
+{
+ fprintf(stderr,
+ "--------------------------------\n"
+ "PAGE HEADER INFO\n"
+ "Page address %p, n records %lu (%s)\n"
+ "n dir slots %lu, heap top %lu\n"
+ "Page n heap %lu, free %lu, garbage %lu\n"
+ "Page last insert %lu, direction %lu, n direction %lu\n",
+ page, (ulong) page_header_get_field(page, PAGE_N_RECS),
+ page_is_comp(page) ? "compact format" : "original format",
+ (ulong) page_header_get_field(page, PAGE_N_DIR_SLOTS),
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) page_header_get_field(page, PAGE_FREE),
+ (ulong) page_header_get_field(page, PAGE_GARBAGE),
+ (ulong) page_header_get_field(page, PAGE_LAST_INSERT),
+ (ulong) page_header_get_field(page, PAGE_DIRECTION),
+ (ulong) page_header_get_field(page, PAGE_N_DIRECTION));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+ buf_block_t* block, /*!< in: index page */
+ dict_index_t* index, /*!< in: dictionary index of the page */
+ ulint dn, /*!< in: print dn first and last entries
+ in directory */
+ ulint rn) /*!< in: print rn first and last records
+				in the record list */
+{
+ page_t* page = block->frame;
+
+ page_header_print(page);
+ page_dir_print(page, dn);
+ page_print_list(block, index, rn);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+ rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_owned;
+ ulint heap_no;
+ page_t* page;
+
+ page = page_align(rec);
+ ut_a(!page_is_comp(page) == !rec_offs_comp(offsets));
+
+ page_rec_check(rec);
+ rec_validate(rec, offsets);
+
+ if (page_rec_is_comp(rec)) {
+ n_owned = rec_get_n_owned_new(rec);
+ heap_no = rec_get_heap_no_new(rec);
+ } else {
+ n_owned = rec_get_n_owned_old(rec);
+ heap_no = rec_get_heap_no_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(!(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED))) {
+ fprintf(stderr,
+ "InnoDB: Dir slot of rec %lu, n owned too big %lu\n",
+ (ulong) page_offset(rec), (ulong) n_owned);
+ return(FALSE);
+ }
+
+ if (UNIV_UNLIKELY(!(heap_no < page_dir_get_n_heap(page)))) {
+ fprintf(stderr,
+ "InnoDB: Heap no of rec %lu too big %lu %lu\n",
+ (ulong) page_offset(rec), (ulong) heap_no,
+ (ulong) page_dir_get_n_heap(page));
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+ const page_t* page) /*!< in: index page */
+{
+ ulint n_slots;
+ ulint infimum_offs;
+ ulint supremum_offs;
+
+ n_slots = page_dir_get_n_slots(page);
+ infimum_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0));
+ supremum_offs = mach_read_from_2(page_dir_get_nth_slot(page,
+ n_slots - 1));
+
+ if (UNIV_UNLIKELY(!page_rec_is_infimum_low(infimum_offs))) {
+
+ fprintf(stderr,
+ "InnoDB: Page directory corruption:"
+ " infimum not pointed to\n");
+ buf_page_print(page, 0);
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_is_supremum_low(supremum_offs))) {
+
+ fprintf(stderr,
+ "InnoDB: Page directory corruption:"
+ " supremum not pointed to\n");
+ buf_page_print(page, 0);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. It is also written defensively, so that it should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+ page_t* page) /*!< in: old-style index page */
+{
+ page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ rec_t* rec;
+ byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(!page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) {
+ fprintf(stderr,
+ "InnoDB: Nonsensical number %lu of page dir slots\n",
+ (ulong) n_slots);
+
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+ fprintf(stderr,
+ "InnoDB: Record heap and dir overlap on a page,"
+ " heap top %lu, dir %lu\n",
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+ (ulong)
+ page_offset(page_dir_get_nth_slot(page, n_slots - 1)));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Record %lu is above"
+ " rec heap top %lu\n",
+ (ulong)(rec - page),
+ (ulong)(rec_heap_top - page));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec)
+ != own_count)) {
+
+ fprintf(stderr,
+ "InnoDB: Wrong owned count %lu, %lu,"
+ " rec %lu\n",
+ (ulong) rec_get_n_owned_old(rec),
+ (ulong) own_count,
+ (ulong)(rec - page));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ fprintf(stderr,
+ "InnoDB: Dir slot does not point"
+ " to right rec %lu\n",
+ (ulong)(rec - page));
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Next record offset"
+ " nonsensical %lu for rec %lu\n",
+ (ulong) rec_get_next_offs(rec, FALSE),
+ (ulong) (rec - page));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page record list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+ fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n");
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
+ (ulong) slot_no, (ulong) (n_slots - 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+ (ulong) page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW,
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+	/* Then check the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Free list record has"
+ " a nonsensical offset %lu\n",
+ (ulong) (rec - page));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Free list record %lu"
+ " is above rec heap top %lu\n",
+ (ulong) (rec - page),
+ (ulong) (rec_heap_top - page));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page free list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. It is also written defensively, so that it should never crash
+even if the page is total garbage.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+ page_t* page) /*!< in: new-style index page */
+{
+ page_dir_slot_t* slot;
+ ulint slot_no;
+ ulint n_slots;
+ rec_t* rec;
+ byte* rec_heap_top;
+ ulint count;
+ ulint own_count;
+ ibool ret = FALSE;
+
+ ut_a(page_is_comp(page));
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) {
+ fprintf(stderr,
+ "InnoDB: Nonsensical number %lu"
+ " of page dir slots\n", (ulong) n_slots);
+
+ goto func_exit;
+ }
+
+ rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+ if (UNIV_UNLIKELY(rec_heap_top
+ > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+ fprintf(stderr,
+ "InnoDB: Record heap and dir overlap on a page,"
+ " heap top %lu, dir %lu\n",
+ (ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+ (ulong)
+ page_offset(page_dir_get_nth_slot(page, n_slots - 1)));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that it is
+ consistent with the page record directory. */
+
+ count = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Record %lu is above rec"
+ " heap top %lu\n",
+ (ulong) page_offset(rec),
+ (ulong) page_offset(rec_heap_top));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec)
+ != own_count)) {
+
+ fprintf(stderr,
+ "InnoDB: Wrong owned count %lu, %lu,"
+ " rec %lu\n",
+ (ulong) rec_get_n_owned_new(rec),
+ (ulong) own_count,
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_dir_slot_get_rec(slot) != rec)) {
+ fprintf(stderr,
+ "InnoDB: Dir slot does not point"
+ " to right rec %lu\n",
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ own_count = 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ break;
+ }
+
+ if (UNIV_UNLIKELY
+ (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA
+ || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Next record offset nonsensical %lu"
+ " for rec %lu\n",
+ (ulong) rec_get_next_offs(rec, TRUE),
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page record list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next(rec);
+ own_count++;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+ fprintf(stderr, "InnoDB: n owned is zero"
+ " in a supremum rec\n");
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
+ (ulong) slot_no, (ulong) (n_slots - 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+ (ulong) page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW,
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+	/* Then check the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+ || rec >= page + UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Free list record has"
+ " a nonsensical offset %lu\n",
+ (ulong) page_offset(rec));
+
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+ fprintf(stderr,
+ "InnoDB: Free list record %lu"
+ " is above rec heap top %lu\n",
+ (ulong) page_offset(rec),
+ (ulong) page_offset(rec_heap_top));
+
+ goto func_exit;
+ }
+
+ count++;
+
+ if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Page free list appears"
+ " to be circular %lu\n",
+ (ulong) count);
+ goto func_exit;
+ }
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+ fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) (count + 1));
+
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+ page_t* page, /*!< in: index page */
+ dict_index_t* index) /*!< in: data dictionary index containing
+ the page record type definition */
+{
+ page_dir_slot_t*slot;
+ mem_heap_t* heap;
+ byte* buf;
+ ulint count;
+ ulint own_count;
+ ulint rec_own_count;
+ ulint slot_no;
+ ulint data_size;
+ rec_t* rec;
+ rec_t* old_rec = NULL;
+ ulint offs;
+ ulint n_slots;
+ ibool ret = FALSE;
+ ulint i;
+ ulint* offsets = NULL;
+ ulint* old_offsets = NULL;
+
+ if (UNIV_UNLIKELY((ibool) !!page_is_comp(page)
+ != dict_table_is_comp(index->table))) {
+ fputs("InnoDB: 'compact format' flag mismatch\n", stderr);
+ goto func_exit2;
+ }
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(!page_simple_validate_new(page))) {
+ goto func_exit2;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!page_simple_validate_old(page))) {
+ goto func_exit2;
+ }
+ }
+
+ heap = mem_heap_create(UNIV_PAGE_SIZE + 200);
+
+ /* The following buffer is used to check that the
+ records in the page record heap do not overlap */
+
+ buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE);
+
+ /* Check first that the record heap and the directory do not
+ overlap. */
+
+ n_slots = page_dir_get_n_slots(page);
+
+ if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
+ <= page_dir_get_nth_slot(page, n_slots - 1)))) {
+
+ fprintf(stderr,
+ "InnoDB: Record heap and dir overlap"
+ " on space %lu page %lu index %s, %p, %p\n",
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page), index->name,
+ page_header_get_ptr(page, PAGE_HEAP_TOP),
+ page_dir_get_nth_slot(page, n_slots - 1));
+
+ goto func_exit;
+ }
+
+ /* Validate the record list in a loop checking also that
+ it is consistent with the directory. */
+ count = 0;
+ data_size = 0;
+ own_count = 1;
+ slot_no = 0;
+ slot = page_dir_get_nth_slot(page, slot_no);
+
+ rec = page_get_infimum_rec(page);
+
+ for (;;) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (page_is_comp(page) && page_rec_is_user_rec(rec)
+ && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec)
+ == page_is_leaf(page))) {
+ fputs("InnoDB: node_ptr flag mismatch\n", stderr);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+ goto func_exit;
+ }
+
+#ifndef UNIV_HOTBACKUP
+		/* Check that the records are in ascending order */
+ if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW)
+ && !page_rec_is_supremum(rec)) {
+ if (UNIV_UNLIKELY
+ (1 != cmp_rec_rec(rec, old_rec,
+ offsets, old_offsets, index))) {
+ fprintf(stderr,
+ "InnoDB: Records in wrong order"
+ " on space %lu page %lu index %s\n",
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page),
+ index->name);
+ fputs("\nInnoDB: previous record ", stderr);
+ rec_print_new(stderr, old_rec, old_offsets);
+ fputs("\nInnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+
+ goto func_exit;
+ }
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ if (page_rec_is_user_rec(rec)) {
+
+ data_size += rec_offs_size(offsets);
+ }
+
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
+
+ while (i--) {
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ /* No other record may overlap this */
+
+ fputs("InnoDB: Record overlaps another\n",
+ stderr);
+ goto func_exit;
+ }
+
+ buf[offs + i] = 1;
+ }
+
+ if (page_is_comp(page)) {
+ rec_own_count = rec_get_n_owned_new(rec);
+ } else {
+ rec_own_count = rec_get_n_owned_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(rec_own_count)) {
+ /* This is a record pointed to by a dir slot */
+ if (UNIV_UNLIKELY(rec_own_count != own_count)) {
+ fprintf(stderr,
+ "InnoDB: Wrong owned count %lu, %lu\n",
+ (ulong) rec_own_count,
+ (ulong) own_count);
+ goto func_exit;
+ }
+
+ if (page_dir_slot_get_rec(slot) != rec) {
+ fputs("InnoDB: Dir slot does not"
+ " point to right rec\n",
+ stderr);
+ goto func_exit;
+ }
+
+ page_dir_slot_check(slot);
+
+ own_count = 0;
+ if (!page_rec_is_supremum(rec)) {
+ slot_no++;
+ slot = page_dir_get_nth_slot(page, slot_no);
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+ break;
+ }
+
+ count++;
+ own_count++;
+ old_rec = rec;
+ rec = page_rec_get_next(rec);
+
+ /* set old_offsets to offsets; recycle offsets */
+ {
+ ulint* offs = old_offsets;
+ old_offsets = offsets;
+ offsets = offs;
+ }
+ }
+
+ if (page_is_comp(page)) {
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+
+ goto n_owned_zero;
+ }
+ } else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+n_owned_zero:
+ fputs("InnoDB: n owned is zero\n", stderr);
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+ fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n",
+ (ulong) slot_no, (ulong) (n_slots - 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW
+ != count + 1)) {
+ fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+ (ulong) page_header_get_field(page, PAGE_N_RECS)
+ + PAGE_HEAP_NO_USER_LOW,
+ (ulong) (count + 1));
+ goto func_exit;
+ }
+
+ if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) {
+ fprintf(stderr,
+ "InnoDB: Summed data size %lu, returned by func %lu\n",
+ (ulong) data_size, (ulong) page_get_data_size(page));
+ goto func_exit;
+ }
+
+	/* Then check the free list */
+ rec = page_header_get_ptr(page, PAGE_FREE);
+
+ while (rec != NULL) {
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+
+ goto func_exit;
+ }
+
+ count++;
+ offs = page_offset(rec_get_start(rec, offsets));
+ i = rec_offs_size(offsets);
+ if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+ fputs("InnoDB: record offset out of bounds\n", stderr);
+ goto func_exit;
+ }
+
+ while (i--) {
+
+ if (UNIV_UNLIKELY(buf[offs + i])) {
+ fputs("InnoDB: Record overlaps another"
+ " in free list\n", stderr);
+ goto func_exit;
+ }
+
+ buf[offs + i] = 1;
+ }
+
+ rec = page_rec_get_next(rec);
+ }
+
+ if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+ fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n",
+ (ulong) page_dir_get_n_heap(page),
+ (ulong) count + 1);
+ goto func_exit;
+ }
+
+ ret = TRUE;
+
+func_exit:
+ mem_heap_free(heap);
+
+ if (UNIV_UNLIKELY(ret == FALSE)) {
+func_exit2:
+ fprintf(stderr,
+ "InnoDB: Apparent corruption"
+ " in space %lu page %lu index %s\n",
+ (ulong) page_get_space_id(page),
+ (ulong) page_get_page_no(page),
+ index->name);
+ buf_page_print(page, 0);
+ }
+
+ return(ret);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return record, NULL if not found */
+UNIV_INTERN
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+ const page_t* page, /*!< in: index page */
+ ulint heap_no)/*!< in: heap number */
+{
+ const rec_t* rec;
+
+ if (page_is_comp(page)) {
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for(;;) {
+ ulint rec_heap_no = rec_get_heap_no_new(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, TRUE);
+ }
+ } else {
+ rec = page + PAGE_OLD_INFIMUM;
+
+ for (;;) {
+ ulint rec_heap_no = rec_get_heap_no_old(rec);
+
+ if (rec_heap_no == heap_no) {
+
+ return(rec);
+ } else if (rec_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+ return(NULL);
+ }
+
+ rec = page + rec_get_next_offs(rec, FALSE);
+ }
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c
new file mode 100644
index 00000000000..a94d2d54417
--- /dev/null
+++ b/storage/xtradb/page/page0zip.c
@@ -0,0 +1,4677 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0zip.c
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "page0zip.h"
+#ifdef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+#undef THIS_MODULE
+#include "page0page.h"
+#include "mtr0log.h"
+#include "ut0sort.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "page0types.h"
+#include "log0recv.h"
+#include "zlib.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "btr0sea.h"
+# include "dict0boot.h"
+# include "lock0lock.h"
+#else /* !UNIV_HOTBACKUP */
+# define lock_move_reorganize_page(block, temp_block) ((void) 0)
+# define buf_LRU_stat_inc_unzip() ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
+UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+#endif /* !UNIV_HOTBACKUP */
+
+/* Please refer to ../include/page0zip.ic for a description of the
+compressed page format. */
+
+/* The infimum and supremum records are omitted from the compressed page.
+On compress, we verify that these records are present, and on uncompress we
+restore them. */
+/** Extra bytes of an infimum record */
+static const byte infimum_extra[] = {
+ 0x01, /* info_bits=0, n_owned=1 */
+ 0x00, 0x02 /* heap_no=0, status=2 */
+ /* ?, ? */ /* next=(first user rec, or supremum) */
+};
+/** Data bytes of an infimum record */
+static const byte infimum_data[] = {
+ 0x69, 0x6e, 0x66, 0x69,
+ 0x6d, 0x75, 0x6d, 0x00 /* "infimum\0" */
+};
+/** Extra bytes and data bytes of a supremum record */
+static const byte supremum_extra_data[] = {
+ /* 0x0?, */ /* info_bits=0, n_owned=1..8 */
+ 0x00, 0x0b, /* heap_no=1, status=3 */
+ 0x00, 0x00, /* next=0 */
+ 0x73, 0x75, 0x70, 0x72,
+ 0x65, 0x6d, 0x75, 0x6d /* "supremum" */
+};
+
+/** Assert that a block of memory is filled with zero bytes.
+Compare at most sizeof(field_ref_zero) bytes.
+@param b in: memory block
+@param s in: size of the memory block, in bytes */
+#define ASSERT_ZERO(b, s) \
+ ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero)))
+/** Assert that a BLOB pointer is filled with zero bytes.
+@param b in: BLOB pointer */
+#define ASSERT_ZERO_BLOB(b) \
+ ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero))
+
+/* Enable some extra debugging output. This code can be enabled
+independently of any UNIV_ debugging conditions. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+# include <stdarg.h>
+__attribute__((format (printf, 1, 2)))
+/**********************************************************************//**
+Report a failure to decompress or compress.
+@return number of characters printed */
+static
+int
+page_zip_fail_func(
+/*===============*/
+ const char* fmt, /*!< in: printf(3) format string */
+ ...) /*!< in: arguments corresponding to fmt */
+{
+ int res;
+ va_list ap;
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: ", stderr);
+ va_start(ap, fmt);
+ res = vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ return(res);
+}
+/** Wrapper for page_zip_fail_func()
+@param fmt_args in: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
+#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/** Dummy wrapper for page_zip_fail_func()
+@param fmt_args ignored: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) /* empty */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return minimum payload size on the page */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+ ulint n_fields, /*!< in: number of columns in the index */
+ ulint zip_size) /*!< in: compressed page size in bytes */
+{
+ lint size = zip_size
+ /* subtract the page header and the longest
+ uncompressed data needed for one record */
+ - (PAGE_DATA
+ + PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+ + 1/* encoded heap_no==2 in page_zip_write_rec() */
+ + 1/* end of modification log */
+ - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
+ /* subtract the space for page_zip_fields_encode() */
+ - compressBound(2 * (n_fields + 1));
+ return(size > 0 ? (ulint) size : 0);
+}
+#endif /* !UNIV_HOTBACKUP */
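
The last term reserves room for the serialized index description written by page_zip_fields_encode(); zlib's compressBound() gives a safe upper bound on the deflate output for an input of 2 * (n_fields + 1) bytes. A minimal sketch of that term in isolation:

#include <zlib.h>

/* Worst-case space needed on the compressed page for the encoded
index field list of n_fields columns. */
static unsigned long
fields_encode_bound(unsigned long n_fields)
{
	return compressBound(2 * (n_fields + 1));
}
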
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+including deleted records (the free list).
+@return length of dense page directory, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_size(
+/*==============*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ /* Exclude the page infimum and supremum from the record count. */
+ ulint size = PAGE_ZIP_DIR_SLOT_SIZE
+ * (page_dir_get_n_heap(page_zip->data)
+ - PAGE_HEAP_NO_USER_LOW);
+ return(size);
+}
+
+/*************************************************************//**
+Gets the size of the compressed page trailer (the dense page directory),
+only including user records (excluding the free list).
+@return length of dense page directory comprising existing records, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_user_size(
+/*===================*/
+ const page_zip_des_t* page_zip) /*!< in: compressed page */
+{
+ ulint size = PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page_zip->data);
+ ut_ad(size <= page_zip_dir_size(page_zip));
+ return(size);
+}
+
+/*************************************************************//**
+Find the slot of the given record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_low(
+/*==================*/
+ byte* slot, /*!< in: start of records */
+ byte* end, /*!< in: end of records */
+ ulint offset) /*!< in: offset of user record */
+{
+ ut_ad(slot <= end);
+
+ for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
+ if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
+ == offset) {
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/*************************************************************//**
+Find the slot of the given non-free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find(
+/*==============*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
+ end,
+ offset));
+}
+
+/*************************************************************//**
+Find the slot of the given free record in the dense page directory.
+@return dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint offset) /*!< in: offset of user record */
+{
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
+ end - page_zip_dir_user_size(page_zip),
+ offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+ const page_zip_des_t* page_zip, /*!< in: compressed page */
+ ulint slot) /*!< in: slot
+ (0=first user record) */
+{
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+ return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
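
The dense directory read by these helpers sits at the very end of the compressed page image and grows downwards, two bytes per slot. A standalone model of the slot lookup, assuming the on-disk big-endian byte order (illustrative names only):

#include <stddef.h>
#include <stdint.h>

static uint16_t
dense_dir_get_model(const unsigned char *zip_data, size_t zip_size,
		    unsigned slot)
{
	/* Slot 0 occupies the last two bytes of the page image. */
	const unsigned char *p = zip_data + zip_size - 2 * (slot + 1);

	return (uint16_t) (((unsigned) p[0] << 8) | p[1]);
}
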
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of compressing an index page. */
+static
+void
+page_zip_compress_write_log(
+/*========================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ byte* log_ptr;
+ ulint trailer_size;
+
+ ut_ad(!dict_index_is_ibuf(index));
+
+ log_ptr = mlog_open(mtr, 11 + 2 + 2);
+
+ if (!log_ptr) {
+
+ return;
+ }
+
+ /* Read the number of user records. */
+ trailer_size = page_dir_get_n_heap(page_zip->data)
+ - PAGE_HEAP_NO_USER_LOW;
+	/* Multiply by the uncompressed size stored per record */
+ if (!page_is_leaf(page)) {
+ trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ } else if (dict_index_is_clust(index)) {
+ trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ /* Add the space occupied by BLOB pointers. */
+ trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(page_zip->m_end > PAGE_DATA);
+#if FIL_PAGE_DATA > PAGE_DATA
+# error "FIL_PAGE_DATA > PAGE_DATA"
+#endif
+ ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+ log_ptr = mlog_write_initial_log_record_fast((page_t*) page,
+ MLOG_ZIP_PAGE_COMPRESS,
+ log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE);
+ log_ptr += 2;
+ mach_write_to_2(log_ptr, trailer_size);
+ log_ptr += 2;
+ mlog_close(mtr, log_ptr);
+
+ /* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */
+ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4);
+ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4);
+ /* Write most of the page header, the compressed stream and
+ the modification log. */
+ mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE,
+ page_zip->m_end - FIL_PAGE_TYPE);
+ /* Write the uncompressed trailer of the compressed page. */
+ mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip)
+ - trailer_size, trailer_size);
+}
+#endif /* !UNIV_HOTBACKUP */
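
Laid out end to end, the redo record produced above looks as follows (a sketch derived from the writes in this function; sizes in bytes):

/*
   MLOG_ZIP_PAGE_COMPRESS body:
     2   length of the compressed data = page_zip->m_end - FIL_PAGE_TYPE
     2   trailer_size (dense directory + BLOB pointers)
     4   FIL_PAGE_PREV
     4   FIL_PAGE_NEXT
     N   page header from FIL_PAGE_TYPE onward, the compressed stream
         and the modification log (N = m_end - FIL_PAGE_TYPE)
     T   uncompressed trailer copied from the page end (T = trailer_size)
*/
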
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec. */
+static
+ulint
+page_zip_get_n_prev_extern(
+/*=======================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ const rec_t* rec, /*!< in: compact physical record
+ on a B-tree leaf page */
+ dict_index_t* index) /*!< in: record descriptor */
+{
+ const page_t* page = page_align(rec);
+ ulint n_ext = 0;
+ ulint i;
+ ulint left;
+ ulint heap_no;
+ ulint n_recs = page_get_n_recs(page_zip->data);
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(page_is_comp(page));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ left = heap_no - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(!left)) {
+ return(0);
+ }
+
+ for (i = 0; i < n_recs; i++) {
+ const rec_t* r = page + (page_zip_dir_get(page_zip, i)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+
+ if (rec_get_heap_no_new(r) < heap_no) {
+ n_ext += rec_get_n_extern_new(r, index,
+ ULINT_UNDEFINED);
+ if (!--left) {
+ break;
+ }
+ }
+ }
+
+ return(n_ext);
+}
+
+/**********************************************************************//**
+Encode the length of a fixed-length column.
+@return buf + length of encoded val */
+static
+byte*
+page_zip_fixed_field_encode(
+/*========================*/
+ byte* buf, /*!< in: pointer to buffer where to write */
+ ulint val) /*!< in: value to write */
+{
+ ut_ad(val >= 2);
+
+ if (UNIV_LIKELY(val < 126)) {
+ /*
+ 0 = nullable variable field of at most 255 bytes length;
+ 1 = not null variable field of at most 255 bytes length;
+ 126 = nullable variable field with maximum length >255;
+ 127 = not null variable field with maximum length >255
+ */
+ *buf++ = (byte) val;
+ } else {
+ *buf++ = (byte) (0x80 | val >> 8);
+ *buf++ = (byte) val;
+ }
+
+ return(buf);
+}
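
A first byte with the 0x80 bit set therefore announces a two-byte length; a sketch of the matching decode step (hypothetical helper, not part of the InnoDB sources):

#include <stdint.h>

static const unsigned char *
fixed_field_decode_model(const unsigned char *buf, unsigned *val)
{
	if (*buf & 0x80) {
		/* two-byte form: high bits carried in the first byte */
		*val = ((unsigned) (buf[0] & 0x7f) << 8) | buf[1];
		return buf + 2;
	}

	/* one-byte form: the values 0, 1, 126 and 127 are the reserved
	variable-length markers listed in the encoder above */
	*val = buf[0];
	return buf + 1;
}
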
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+@return used size of buf */
+static
+ulint
+page_zip_fields_encode(
+/*===================*/
+ ulint n, /*!< in: number of fields to compress */
+ dict_index_t* index, /*!< in: index comprising at least n fields */
+ ulint trx_id_pos,/*!< in: position of the trx_id column
+ in the index, or ULINT_UNDEFINED if
+ this is a non-leaf page */
+ byte* buf) /*!< out: buffer of (n + 1) * 2 bytes */
+{
+ const byte* buf_start = buf;
+ ulint i;
+ ulint col;
+ ulint trx_id_col = 0;
+ /* sum of lengths of preceding non-nullable fixed fields, or 0 */
+ ulint fixed_sum = 0;
+
+ ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
+
+ for (i = col = 0; i < n; i++) {
+ dict_field_t* field = dict_index_get_nth_field(index, i);
+ ulint val;
+
+ if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
+ val = 1; /* set the "not nullable" flag */
+ } else {
+ val = 0; /* nullable field */
+ }
+
+ if (!field->fixed_len) {
+ /* variable-length field */
+ const dict_col_t* column
+ = dict_field_get_col(field);
+
+ if (UNIV_UNLIKELY(column->len > 255)
+ || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) {
+ val |= 0x7e; /* max > 255 bytes */
+ }
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ *buf++ = (byte) val;
+ col++;
+ } else if (val) {
+ /* fixed-length non-nullable field */
+
+ if (fixed_sum && UNIV_UNLIKELY
+ (fixed_sum + field->fixed_len
+ > DICT_MAX_INDEX_COL_LEN)) {
+ /* Write out the length of the
+ preceding non-nullable fields,
+ to avoid exceeding the maximum
+ length of a fixed-length column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
+ if (fixed_sum) {
+ /* Write out the length of any
+ preceding non-nullable fields,
+ and start a new trx_id column. */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ col++;
+ }
+
+ trx_id_col = col;
+ fixed_sum = field->fixed_len;
+ } else {
+ /* add to the sum */
+ fixed_sum += field->fixed_len;
+ }
+ } else {
+ /* fixed-length nullable field */
+
+ if (fixed_sum) {
+ /* write out the length of any
+ preceding non-nullable fields */
+ buf = page_zip_fixed_field_encode(
+ buf, fixed_sum << 1 | 1);
+ fixed_sum = 0;
+ col++;
+ }
+
+ buf = page_zip_fixed_field_encode(
+ buf, field->fixed_len << 1);
+ col++;
+ }
+ }
+
+ if (fixed_sum) {
+		/* Write out the combined length of the trailing
+		non-nullable fixed-length columns. */
+ buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
+ }
+
+ if (trx_id_pos != ULINT_UNDEFINED) {
+ /* Write out the position of the trx_id column */
+ i = trx_id_col;
+ } else {
+ /* Write out the number of nullable fields */
+ i = index->n_nullable;
+ }
+
+ if (i < 128) {
+ *buf++ = (byte) i;
+ } else {
+ *buf++ = (byte) (0x80 | i >> 8);
+ *buf++ = (byte) i;
+ }
+
+ ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
+ return((ulint) (buf - buf_start));
+}
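+
+/* Worked example (a hypothetical index, not taken from the original
+source): encoding n = 2 NOT NULL fixed-length fields of 4 and 8 bytes with
+trx_id_pos == ULINT_UNDEFINED (a node-pointer page).  The two consecutive
+non-nullable fixed-length fields are summed into fixed_sum = 12, which is
+written after the loop as 12 << 1 | 1 = 0x19; the trailing byte is
+index->n_nullable = 0.  The buffer thus contains {0x19, 0x00} and the
+function returns 2. */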
+
+/**********************************************************************//**
+Populate the dense page directory from the sparse directory. */
+static
+void
+page_zip_dir_encode(
+/*================*/
+ const page_t* page, /*!< in: compact page */
+ byte* buf, /*!< in: pointer to dense page directory[-1];
+ out: dense directory on compressed page */
+	const rec_t**	recs)	/*!< in: zero-initialized array, or NULL;
+ out: dense page directory sorted by ascending
+ address (and heap_no) */
+{
+ const byte* rec;
+ ulint status;
+ ulint min_mark;
+ ulint heap_no;
+ ulint i;
+ ulint n_heap;
+ ulint offs;
+
+ min_mark = 0;
+
+ if (page_is_leaf(page)) {
+ status = REC_STATUS_ORDINARY;
+ } else {
+ status = REC_STATUS_NODE_PTR;
+ if (UNIV_UNLIKELY
+ (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) {
+ min_mark = REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+ n_heap = page_dir_get_n_heap(page);
+
+ /* Traverse the list of stored records in the collation order,
+ starting from the first user record. */
+
+ rec = page + PAGE_NEW_INFIMUM;
+
+ i = 0;
+
+ for (;;) {
+ ulint info_bits;
+ offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
+ break;
+ }
+ rec = page + offs;
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+ ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR);
+ ut_a(offs >= PAGE_ZIP_START);
+#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1)
+# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2"
+#endif
+#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1
+# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1"
+#endif
+ if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
+ offs |= PAGE_ZIP_DIR_SLOT_OWNED;
+ }
+
+ info_bits = rec_get_info_bits(rec, TRUE);
+ if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) {
+ info_bits &= ~REC_INFO_DELETED_FLAG;
+ offs |= PAGE_ZIP_DIR_SLOT_DEL;
+ }
+ ut_a(info_bits == min_mark);
+ /* Only the smallest user record can have
+ REC_INFO_MIN_REC_FLAG set. */
+ min_mark = 0;
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ ut_a(rec_get_status(rec) == status);
+ }
+
+ offs = page_header_get_field(page, PAGE_FREE);
+
+ /* Traverse the free list (of deleted records). */
+ while (offs) {
+ ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
+ rec = page + offs;
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+ ut_a(heap_no < n_heap);
+
+ ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
+ ut_a(rec_get_status(rec) == status);
+
+ mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+ if (UNIV_LIKELY_NULL(recs)) {
+ /* Ensure that each heap_no occurs at most once. */
+ ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+ /* exclude infimum and supremum */
+ recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+ }
+
+ offs = rec_get_next_offs(rec, TRUE);
+ }
+
+ /* Ensure that each heap no occurs at least once. */
+ ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
+}
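+
+/* Illustrative fragment (not part of the original source): reading back a
+dense directory slot written by page_zip_dir_encode().  Each 2-byte slot
+holds the record offset in PAGE_ZIP_DIR_SLOT_MASK, with
+PAGE_ZIP_DIR_SLOT_OWNED set for records that own a sparse directory slot
+and PAGE_ZIP_DIR_SLOT_DEL set for delete-marked records. */
+#if 0
+static const rec_t*
+page_zip_dir_slot_example(
+	const page_zip_des_t*	page_zip,	/* compressed page */
+	const page_t*		page,		/* uncompressed page frame */
+	ulint			i)		/* zero-based dense slot */
+{
+	ulint	slot = page_zip_dir_get(page_zip, i);
+
+	if (slot & PAGE_ZIP_DIR_SLOT_DEL) {
+		/* the record is delete-marked */
+	}
+
+	return(page + (slot & PAGE_ZIP_DIR_SLOT_MASK));
+}
+#endif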
+
+/**********************************************************************//**
+Allocate memory for zlib. */
+static
+void*
+page_zip_malloc(
+/*============*/
+ void* opaque, /*!< in/out: memory heap */
+ uInt items, /*!< in: number of items to allocate */
+ uInt size) /*!< in: size of an item in bytes */
+{
+ return(mem_heap_alloc(opaque, items * size));
+}
+
+/**********************************************************************//**
+Deallocate memory for zlib. */
+static
+void
+page_zip_free(
+/*==========*/
+ void* opaque __attribute__((unused)), /*!< in: memory heap */
+ void* address __attribute__((unused)))/*!< in: object to free */
+{
+}
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+ void* stream, /*!< in/out: zlib stream */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ z_stream* strm = stream;
+
+ strm->zalloc = page_zip_malloc;
+ strm->zfree = page_zip_free;
+ strm->opaque = heap;
+}
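+
+/* Illustrative fragment (not part of the original source; the heap size is
+an arbitrary assumption): wiring a zlib stream to an InnoDB memory heap,
+mirroring what page_zip_compress() does below. */
+#if 0
+static void
+page_zip_set_alloc_example(void)
+{
+	z_stream	c_stream;
+	mem_heap_t*	heap = mem_heap_create(UNIV_PAGE_SIZE);
+
+	page_zip_set_alloc(&c_stream, heap);
+	/* zlib would now draw its internal state from "heap" through
+	page_zip_malloc(); page_zip_free() is a no-op, because
+	everything is released when the heap is freed. */
+	mem_heap_free(heap);
+}
+#endif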
+
+#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/** Symbol for enabling compression and decompression diagnostics */
+# define PAGE_ZIP_COMPRESS_DBG
+#endif
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+/** Set this variable in a debugger to enable
+excessive logging in page_zip_compress(). */
+UNIV_INTERN ibool page_zip_compress_dbg;
+/** Set this variable in a debugger to enable
+binary logging of the data passed to deflate().
+When this variable is nonzero, it will act
+as a log file name generator. */
+UNIV_INTERN unsigned page_zip_compress_log;
+
+/**********************************************************************//**
+Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set.
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+static
+int
+page_zip_compress_deflate(
+/*======================*/
+ FILE* logfile,/*!< in: log file, or NULL */
+ z_streamp strm, /*!< in/out: compressed stream for deflate() */
+ int flush) /*!< in: deflate() flushing method */
+{
+ int status;
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ ut_print_buf(stderr, strm->next_in, strm->avail_in);
+ }
+ if (UNIV_LIKELY_NULL(logfile)) {
+ fwrite(strm->next_in, 1, strm->avail_in, logfile);
+ }
+ status = deflate(strm, flush);
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, " -> %d\n", status);
+ }
+ return(status);
+}
+
+/* Redefine deflate(). */
+# undef deflate
+/** Debug wrapper for the zlib compression routine deflate().
+Log the operation if page_zip_compress_dbg is set.
+@param strm in/out: compressed stream
+@param flush in: flushing method
+@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
+# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
+/** Declaration of the logfile parameter */
+# define FILE_LOGFILE FILE* logfile,
+/** The logfile parameter */
+# define LOGFILE logfile,
+#else /* PAGE_ZIP_COMPRESS_DBG */
+/** Empty declaration of the logfile parameter */
+# define FILE_LOGFILE
+/** Missing logfile parameter */
+# define LOGFILE
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+
+/**********************************************************************//**
+Compress the records of a node pointer page.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_node_ptrs(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ ulint* offsets = NULL;
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Only leaf nodes may contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in;
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+
+ /* Compress the data bytes, except node_ptr. */
+ c_stream->next_in = (byte*) rec;
+ c_stream->avail_in = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+ ut_ad(c_stream->avail_in);
+
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+
+ ut_ad(!c_stream->avail_in);
+
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in, REC_NODE_PTR_SIZE);
+ c_stream->next_in += REC_NODE_PTR_SIZE;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a secondary index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_sec(
+/*==================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense) /*!< in: size of recs[] */
+{
+ int err = Z_OK;
+
+ ut_ad(n_dense > 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ /* Compress everything up to this record. */
+ c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in;
+
+ if (UNIV_LIKELY(c_stream->avail_in)) {
+ UNIV_MEM_ASSERT_RW(c_stream->next_in,
+ c_stream->avail_in);
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ break;
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+ c_stream->next_in = (byte*) rec;
+ } while (--n_dense);
+
+ return(err);
+}
+
+/**********************************************************************//**
+Compress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ byte** externs, /*!< in/out: pointer to the next
+ available BLOB pointer */
+ ulint* n_blobs) /*!< in/out: number of
+ externally stored columns */
+{
+ int err;
+ ulint i;
+
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ /* Store trx_id and roll_ptr
+ in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in
+ = src - c_stream->next_in;
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+			/* Also skip the roll_ptr */
+ i++;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ c_stream->avail_in = src
+ - c_stream->next_in;
+ if (UNIV_LIKELY(c_stream->avail_in)) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ /* Reserve space for the data at
+ the end of the space reserved for
+ the compressed data and the page
+ modification log. */
+
+ if (UNIV_UNLIKELY
+ (c_stream->avail_out
+ <= BTR_EXTERN_FIELD_REF_SIZE)) {
+ /* out of space */
+ return(Z_BUF_ERROR);
+ }
+
+ ut_ad(*externs == c_stream->next_out
+ + c_stream->avail_out
+ + 1/* end of modif. log */);
+
+ c_stream->next_in
+ += BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Skip deleted records. */
+ if (UNIV_LIKELY_NULL
+ (page_zip_dir_find_low(
+ storage, deleted,
+ page_offset(rec)))) {
+ continue;
+ }
+
+ (*n_blobs)++;
+ c_stream->avail_out
+ -= BTR_EXTERN_FIELD_REF_SIZE;
+ *externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Copy the BLOB pointer */
+ memcpy(*externs, c_stream->next_in
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ return(Z_OK);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust(
+/*====================*/
+ FILE_LOGFILE
+ z_stream* c_stream, /*!< in/out: compressed page stream */
+ const rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* n_blobs, /*!< in: 0; out: number of
+ externally stored columns */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ byte* deleted, /*!< in: dense directory entry pointing
+ to the head of the free list */
+ byte* storage, /*!< in: end of dense page directory */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err = Z_OK;
+ ulint* offsets = NULL;
+ /* BTR_EXTERN_FIELD_REF storage */
+ byte* externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ut_ad(*n_blobs == 0);
+
+ do {
+ const rec_t* rec = *recs++;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ ut_ad(rec_offs_n_fields(offsets)
+ == dict_index_get_n_fields(index));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress the extra bytes. */
+ c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
+ - c_stream->next_in;
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Compress the data bytes. */
+
+ c_stream->next_in = (byte*) rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, store the
+ BTR_EXTERN_FIELD_REF separately. */
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ ut_ad(dict_index_is_clust(index));
+
+ err = page_zip_compress_clust_ext(
+ LOGFILE
+ c_stream, rec, offsets, trx_id_col,
+ deleted, storage, &externs, n_blobs);
+
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ } else {
+ ulint len;
+ const byte* src;
+
+ /* Store trx_id and roll_ptr in uncompressed form. */
+ src = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ /* Compress any preceding bytes. */
+ c_stream->avail_in = src - c_stream->next_in;
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ return(err);
+ }
+ }
+
+ ut_ad(!c_stream->avail_in);
+ ut_ad(c_stream->next_in == src);
+
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (rec_get_heap_no_new(rec) - 1),
+ c_stream->next_in,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ c_stream->next_in
+ += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+			/* Also skip the roll_ptr */
+ ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
+ }
+
+ /* Compress the last bytes of the record. */
+ c_stream->avail_in = rec + rec_offs_data_size(offsets)
+ - c_stream->next_in;
+
+ if (c_stream->avail_in) {
+ err = deflate(c_stream, Z_NO_FLUSH);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+
+ goto func_exit;
+ }
+ }
+ ut_ad(!c_stream->avail_in);
+ } while (--n_dense);
+
+func_exit:
+ return(err);
+}
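+
+/* Layout sketch (descriptive only, not part of the original source) of the
+uncompressed trailer that the compression routines above maintain at the
+end of the compressed block, growing downwards from the end:
+
+	...	compressed stream and modification log
+	externs	one BTR_EXTERN_FIELD_REF_SIZE entry per BLOB pointer
+		(clustered index leaf pages only)
+	storage	DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN bytes per record on
+		clustered index leaf pages, or REC_NODE_PTR_SIZE bytes per
+		record on node-pointer pages, addressed by heap_no
+	dense directory
+		n_dense slots of PAGE_ZIP_DIR_SLOT_SIZE bytes
+	end of page_zip->data */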
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+ page_zip_des_t* page_zip,/*!< in: size; out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ const page_t* page, /*!< in: uncompressed page */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ z_stream c_stream;
+ int err;
+ ulint n_fields;/* number of index fields needed */
+ byte* fields; /*!< index field information */
+ byte* buf; /*!< compressed payload of the page */
+ byte* buf_end;/* end of buf */
+ ulint n_dense;
+ ulint slot_size;/* amount of uncompressed bytes per record */
+ const rec_t** recs; /*!< dense page directory, sorted by address */
+ mem_heap_t* heap;
+ ulint trx_id_col;
+ ulint* offsets = NULL;
+ ulint n_blobs = 0;
+ byte* storage;/* storage of uncompressed columns */
+#ifndef UNIV_HOTBACKUP
+ ullint usec = ut_time_us(NULL);
+#endif /* !UNIV_HOTBACKUP */
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ FILE* logfile = NULL;
+#endif
+
+ if (!page) {
+ return(FALSE);
+ }
+
+ ut_a(page_is_comp(page));
+ ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(!dict_index_is_ibuf(index));
+
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+
+ /* Check the data that will be omitted. */
+ ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra));
+ ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
+ infimum_data, sizeof infimum_data));
+ ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
+ /* info_bits == 0, n_owned <= max */
+ <= PAGE_DIR_SLOT_MAX_N_OWNED);
+ ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data));
+
+ if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+ ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
+ == PAGE_NEW_SUPREMUM);
+ }
+
+ if (page_is_leaf(page)) {
+ n_fields = dict_index_get_n_fields(index);
+ } else {
+ n_fields = dict_index_get_n_unique_in_tree(index);
+ }
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+ fprintf(stderr, "compress %p %p %lu %lu %lu\n",
+ (void*) page_zip, (void*) page,
+ page_is_leaf(page),
+ n_fields, n_dense);
+ }
+ if (UNIV_UNLIKELY(page_zip_compress_log)) {
+ /* Create a log file for every compression attempt. */
+ char logfilename[9];
+ ut_snprintf(logfilename, sizeof logfilename,
+ "%08x", page_zip_compress_log++);
+ logfile = fopen(logfilename, "wb");
+
+ if (logfile) {
+ /* Write the uncompressed page to the log. */
+ fwrite(page, 1, UNIV_PAGE_SIZE, logfile);
+ /* Record the compressed size as zero.
+ This will be overwritten at successful exit. */
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ putc(0, logfile);
+ }
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+ page_zip_stat[page_zip->ssize - 1].compressed++;
+#endif /* !UNIV_HOTBACKUP */
+
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+
+ goto err_exit;
+ }
+
+ heap = mem_heap_create(page_zip_get_size(page_zip)
+ + n_fields * (2 + sizeof *offsets)
+ + n_dense * ((sizeof *recs)
+ - PAGE_ZIP_DIR_SLOT_SIZE)
+ + UNIV_PAGE_SIZE * 4
+ + (512 << MAX_MEM_LEVEL));
+
+ recs = mem_heap_zalloc(heap, n_dense * sizeof *recs);
+
+ fields = mem_heap_alloc(heap, (n_fields + 1) * 2);
+
+ buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA);
+ buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
+
+ /* Compress the data payload. */
+ page_zip_set_alloc(&c_stream, heap);
+
+ err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT,
+ MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ ut_a(err == Z_OK);
+
+ c_stream.next_out = buf;
+ /* Subtract the space reserved for uncompressed data. */
+ /* Page header and the end marker of the modification log */
+ c_stream.avail_out = buf_end - buf - 1;
+ /* Dense page directory and uncompressed columns, if any */
+ if (page_is_leaf(page)) {
+ if (dict_index_is_clust(index)) {
+ trx_id_col = dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID);
+ ut_ad(trx_id_col > 0);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ } else {
+ /* Signal the absence of trx_id
+ in page_zip_fields_encode() */
+ ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID)
+ == ULINT_UNDEFINED);
+ trx_id_col = 0;
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+ } else {
+ slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
+ + 6/* sizeof(zlib header and footer) */)) {
+ goto zlib_error;
+ }
+
+ c_stream.avail_out -= n_dense * slot_size;
+ c_stream.avail_in = page_zip_fields_encode(n_fields, index,
+ trx_id_col, fields);
+ c_stream.next_in = fields;
+ if (UNIV_LIKELY(!trx_id_col)) {
+ trx_id_col = ULINT_UNDEFINED;
+ }
+
+ UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FULL_FLUSH);
+ if (err != Z_OK) {
+ goto zlib_error;
+ }
+
+ ut_ad(!c_stream.avail_in);
+
+ page_zip_dir_encode(page, buf_end, recs);
+
+ c_stream.next_in = (byte*) page + PAGE_ZIP_START;
+
+ storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ /* Compress the records in heap_no order. */
+ if (UNIV_UNLIKELY(!n_dense)) {
+ } else if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ err = page_zip_compress_node_ptrs(LOGFILE
+ &c_stream, recs, n_dense,
+ index, storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ err = page_zip_compress_sec(LOGFILE
+ &c_stream, recs, n_dense);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ err = page_zip_compress_clust(LOGFILE
+ &c_stream, recs, n_dense,
+ index, &n_blobs, trx_id_col,
+ buf_end - PAGE_ZIP_DIR_SLOT_SIZE
+ * page_get_n_recs(page),
+ storage, heap);
+ if (UNIV_UNLIKELY(err != Z_OK)) {
+ goto zlib_error;
+ }
+ }
+
+ /* Finish the compression. */
+ ut_ad(!c_stream.avail_in);
+ /* Compress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list,
+ or the data of the last record from page_zip_compress_sec(). */
+ c_stream.avail_in
+ = page_header_get_field(page, PAGE_HEAP_TOP)
+ - (c_stream.next_in - page);
+ ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR);
+
+ UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in);
+ err = deflate(&c_stream, Z_FINISH);
+
+ if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
+zlib_error:
+ deflateEnd(&c_stream);
+ mem_heap_free(heap);
+err_exit:
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+ page_zip_stat[page_zip->ssize - 1].compressed_usec
+ += ut_time_us(NULL) - usec;
+#endif /* !UNIV_HOTBACKUP */
+ return(FALSE);
+ }
+
+ err = deflateEnd(&c_stream);
+ ut_a(err == Z_OK);
+
+ ut_ad(buf + c_stream.total_out == c_stream.next_out);
+ ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
+
+ /* Valgrind believes that zlib does not initialize some bits
+ in the last 7 or 8 bytes of the stream. Make Valgrind happy. */
+ UNIV_MEM_VALID(buf, c_stream.total_out);
+
+ /* Zero out the area reserved for the modification log.
+ Space for the end marker of the modification log is not
+ included in avail_out. */
+ memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start =
+#endif /* UNIV_DEBUG */
+ page_zip->m_end = PAGE_DATA + c_stream.total_out;
+ page_zip->m_nonempty = FALSE;
+ page_zip->n_blobs = n_blobs;
+ /* Copy those header fields that will not be written
+ in buf_flush_init_for_writing() */
+ memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV);
+ memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2);
+ memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA);
+ /* Copy the rest of the compressed page */
+ memcpy(page_zip->data + PAGE_DATA, buf,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+ mem_heap_free(heap);
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ page_zip_compress_write_log(page_zip, page, index, mtr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+ if (logfile) {
+ /* Record the compressed size of the block. */
+ byte sz[4];
+ mach_write_to_4(sz, c_stream.total_out);
+ fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET);
+ fwrite(sz, 1, sizeof sz, logfile);
+ fclose(logfile);
+ }
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+ {
+ page_zip_stat_t* zip_stat
+ = &page_zip_stat[page_zip->ssize - 1];
+ zip_stat->compressed_ok++;
+ zip_stat->compressed_usec += ut_time_us(NULL) - usec;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ return(TRUE);
+}
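+
+/* Illustrative caller fragment (not part of the original source): on
+failure, page_zip is left intact, so the caller can keep using the
+uncompressed page and decide whether to reorganize or split it before
+retrying. */
+#if 0
+	if (!page_zip_compress(page_zip, page, index, mtr)) {
+		/* The records did not fit into
+		page_zip_get_size(page_zip) bytes of compressed space;
+		handle the failure at a higher level. */
+	}
+#endif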
+
+/**********************************************************************//**
+Compare two page directory entries.
+@return positive if rec1 > rec2 */
+UNIV_INLINE
+ibool
+page_zip_dir_cmp(
+/*=============*/
+ const rec_t* rec1, /*!< in: rec1 */
+ const rec_t* rec2) /*!< in: rec2 */
+{
+ return(rec1 > rec2);
+}
+
+/**********************************************************************//**
+Sort the dense page directory by address (heap_no). */
+static
+void
+page_zip_dir_sort(
+/*==============*/
+ rec_t** arr, /*!< in/out: dense page directory */
+ rec_t** aux_arr,/*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the sorting area, exclusive */
+{
+ UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high,
+ page_zip_dir_cmp);
+}
+
+/**********************************************************************//**
+Deallocate the index information initialized by page_zip_fields_decode(). */
+static
+void
+page_zip_fields_free(
+/*=================*/
+ dict_index_t* index) /*!< in: dummy index to be freed */
+{
+ if (index) {
+ dict_table_t* table = index->table;
+ mem_heap_free(index->heap);
+ mutex_free(&(table->autoinc_mutex));
+ ut_free(table->name);
+ mem_heap_free(table->heap);
+ }
+}
+
+/**********************************************************************//**
+Read the index information for the compressed page.
+@return own: dummy index describing the page, or NULL on error */
+static
+dict_index_t*
+page_zip_fields_decode(
+/*===================*/
+ const byte* buf, /*!< in: index information */
+ const byte* end, /*!< in: end of buf */
+ ulint* trx_id_col)/*!< in: NULL for non-leaf pages;
+ for leaf pages, pointer to where to store
+ the position of the trx_id column */
+{
+ const byte* b;
+ ulint n;
+ ulint i;
+ ulint val;
+ dict_table_t* table;
+ dict_index_t* index;
+
+ /* Determine the number of fields. */
+ for (b = buf, n = 0; b < end; n++) {
+ if (*b++ & 0x80) {
+ b++; /* skip the second byte */
+ }
+ }
+
+ n--; /* n_nullable or trx_id */
+
+ if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
+
+ page_zip_fail(("page_zip_fields_decode: n = %lu\n",
+ (ulong) n));
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(b > end)) {
+
+ page_zip_fail(("page_zip_fields_decode: %p > %p\n",
+ (const void*) b, (const void*) end));
+ return(NULL);
+ }
+
+ table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
+ DICT_TF_COMPACT);
+ index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
+ DICT_HDR_SPACE, 0, n);
+ index->table = table;
+ index->n_uniq = n;
+ /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+ index->cached = TRUE;
+
+ /* Initialize the fields. */
+ for (b = buf, i = 0; i < n; i++) {
+ ulint mtype;
+ ulint len;
+
+ val = *b++;
+
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ /* fixed length > 62 bytes */
+ val = (val & 0x7f) << 8 | *b++;
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ } else if (UNIV_UNLIKELY(val >= 126)) {
+ /* variable length with max > 255 bytes */
+ len = 0x7fff;
+ mtype = DATA_BINARY;
+ } else if (val <= 1) {
+ /* variable length with max <= 255 bytes */
+ len = 0;
+ mtype = DATA_BINARY;
+ } else {
+			/* fixed length of at most 62 bytes */
+ len = val >> 1;
+ mtype = DATA_FIXBINARY;
+ }
+
+ dict_mem_table_add_col(table, NULL, NULL, mtype,
+ val & 1 ? DATA_NOT_NULL : 0, len);
+ dict_index_add_col(index, table,
+ dict_table_get_nth_col(table, i), 0);
+ }
+
+ val = *b++;
+ if (UNIV_UNLIKELY(val & 0x80)) {
+ val = (val & 0x7f) << 8 | *b++;
+ }
+
+ /* Decode the position of the trx_id column. */
+ if (trx_id_col) {
+ if (!val) {
+ val = ULINT_UNDEFINED;
+ } else if (UNIV_UNLIKELY(val >= n)) {
+ page_zip_fields_free(index);
+ index = NULL;
+ } else {
+ index->type = DICT_CLUSTERED;
+ }
+
+ *trx_id_col = val;
+ } else {
+ /* Decode the number of nullable fields. */
+ if (UNIV_UNLIKELY(index->n_nullable > val)) {
+ page_zip_fields_free(index);
+ index = NULL;
+ } else {
+ index->n_nullable = val;
+ }
+ }
+
+ ut_ad(b == end);
+
+ return(index);
+}
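+
+/* Worked example (hypothetical input, not taken from the original source),
+matching the encoding example after page_zip_fields_encode() above: the
+bytes {0x19, 0x00} decoded with trx_id_col == NULL (a node-pointer page)
+yield a dummy index with a single NOT NULL DATA_FIXBINARY column of
+length 0x19 >> 1 = 12 and n_nullable = 0.  A run of consecutive
+non-nullable fixed-length columns that was summed on the encoding side
+thus comes back as one merged column, which suffices for parsing the
+compressed records. */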
+
+/**********************************************************************//**
+Populate the sparse page directory from the dense directory.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_dir_decode(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: dense page directory on
+ compressed page */
+ page_t* page, /*!< in: compact page with valid header;
+ out: trailer and sparse page directory
+ filled in */
+ rec_t** recs, /*!< out: dense page directory sorted by
+ ascending address (and heap_no) */
+ rec_t** recs_aux,/*!< in/out: scratch area */
+ ulint n_dense)/*!< in: number of user records, and
+ size of recs[] and recs_aux[] */
+{
+ ulint i;
+ ulint n_recs;
+ byte* slot;
+
+ n_recs = page_get_n_recs(page);
+
+ if (UNIV_UNLIKELY(n_recs > n_dense)) {
+ page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
+ (ulong) n_recs, (ulong) n_dense));
+ return(FALSE);
+ }
+
+ /* Traverse the list of stored records in the sorting order,
+ starting from the first user record. */
+
+ slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
+ UNIV_PREFETCH_RW(slot);
+
+ /* Zero out the page trailer. */
+ memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
+
+ mach_write_to_2(slot, PAGE_NEW_INFIMUM);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+
+ /* Initialize the sparse directory and copy the dense directory. */
+ for (i = 0; i < n_recs; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
+ mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
+ slot -= PAGE_DIR_SLOT_SIZE;
+ UNIV_PREFETCH_RW(slot);
+ }
+
+ if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
+ < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_recs,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+
+ mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
+ {
+ const page_dir_slot_t* last_slot = page_dir_get_nth_slot(
+ page, page_dir_get_n_slots(page) - 1);
+
+ if (UNIV_UNLIKELY(slot != last_slot)) {
+ page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
+ (const void*) slot,
+ (const void*) last_slot));
+ return(FALSE);
+ }
+ }
+
+ /* Copy the rest of the dense directory. */
+ for (; i < n_dense; i++) {
+ ulint offs = page_zip_dir_get(page_zip, i);
+
+ if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+ page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
+ (unsigned) i, (unsigned) n_dense,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ recs[i] = page + offs;
+ }
+
+ if (UNIV_LIKELY(n_dense > 1)) {
+ page_zip_dir_sort(recs, recs_aux, 0, n_dense);
+ }
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Initialize the REC_N_NEW_EXTRA_BYTES of each record.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_set_extra_bytes(
+/*=====================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ page_t* page, /*!< in/out: uncompressed page */
+ ulint info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
+{
+ ulint n;
+ ulint i;
+ ulint n_owned = 1;
+ ulint offs;
+ rec_t* rec;
+
+ n = page_get_n_recs(page);
+ rec = page + PAGE_NEW_INFIMUM;
+
+ for (i = 0; i < n; i++) {
+ offs = page_zip_dir_get(page_zip, i);
+
+ if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) {
+ info_bits |= REC_INFO_DELETED_FLAG;
+ }
+ if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
+ info_bits |= n_owned;
+ n_owned = 1;
+ } else {
+ n_owned++;
+ }
+ offs &= PAGE_ZIP_DIR_SLOT_MASK;
+ if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
+ + REC_N_NEW_EXTRA_BYTES)) {
+ page_zip_fail(("page_zip_set_extra_bytes 1:"
+ " %u %u %lx\n",
+ (unsigned) i, (unsigned) n,
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec_set_next_offs_new(rec, offs);
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
+ info_bits = 0;
+ }
+
+ /* Set the next pointer of the last user record. */
+ rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
+
+ /* Set n_owned of the supremum record. */
+ page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+
+ if (i >= n) {
+ if (UNIV_LIKELY(i == n)) {
+ return(TRUE);
+ }
+
+ page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
+ (unsigned) i, (unsigned) n));
+ return(FALSE);
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+
+ /* Set the extra bytes of deleted records on the free list. */
+ for (;;) {
+ if (UNIV_UNLIKELY(!offs)
+ || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+ page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+ (ulong) offs));
+ return(FALSE);
+ }
+
+ rec = page + offs;
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ if (++i == n) {
+ break;
+ }
+
+ offs = page_zip_dir_get(page_zip, i);
+ rec_set_next_offs_new(rec, offs);
+ }
+
+ /* Terminate the free list. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+ rec_set_next_offs_new(rec, 0);
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns. Do not copy the fields that are stored separately.
+@return pointer to modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+ const byte* data, /*!< in: modification log */
+ const byte* end) /*!< in: end of modification log */
+{
+ ulint i;
+ ulint len;
+ byte* next_out = rec;
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, skip the
+ BTR_EXTERN_FIELD_REF. */
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ if (UNIV_UNLIKELY(dst - next_out >= end - data)
+ || UNIV_UNLIKELY
+ (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+ || rec_offs_nth_extern(offsets, i)) {
+ page_zip_fail(("page_zip_apply_log_ext:"
+ " trx_id len %lu,"
+ " %p - %p >= %p - %p\n",
+ (ulong) len,
+ (const void*) dst,
+ (const void*) next_out,
+ (const void*) end,
+ (const void*) data));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, dst - next_out);
+ data += dst - next_out;
+ next_out = dst + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ len += dst - next_out
+ - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext: "
+ "ext %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(next_out, data, len);
+ data += len;
+ next_out += len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ /* Copy the last bytes of the record. */
+ len = rec_get_end(rec, offsets) - next_out;
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log_ext: "
+ "last %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(next_out, data, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
+@return pointer to end of modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log(
+/*===============*/
+ const byte* data, /*!< in: modification log */
+ ulint size, /*!< in: maximum length of the log, in bytes */
+ rec_t** recs, /*!< in: dense page directory,
+ sorted by address (indexed by
+ heap_no - PAGE_HEAP_NO_USER_LOW) */
+ ulint n_dense,/*!< in: size of recs[] */
+ ulint trx_id_col,/*!< in: column number of trx_id in the index,
+ or ULINT_UNDEFINED if none */
+ ulint heap_status,
+ /*!< in: heap_no and status bits for
+ the next record to uncompress */
+ dict_index_t* index, /*!< in: index of the page */
+ ulint* offsets)/*!< in/out: work area for
+ rec_get_offsets_reverse() */
+{
+ const byte* const end = data + size;
+
+ for (;;) {
+ ulint val;
+ rec_t* rec;
+ ulint len;
+ ulint hs;
+
+ val = *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ return(data - 1);
+ }
+ if (val & 0x80) {
+ val = (val & 0x7f) << 8 | *data++;
+ if (UNIV_UNLIKELY(!val)) {
+ page_zip_fail(("page_zip_apply_log:"
+ " invalid val %x%x\n",
+ data[-2], data[-1]));
+ return(NULL);
+ }
+ }
+ if (UNIV_UNLIKELY(data >= end)) {
+ page_zip_fail(("page_zip_apply_log: %p >= %p\n",
+ (const void*) data,
+ (const void*) end));
+ return(NULL);
+ }
+ if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
+ page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
+ (ulong) val, (ulong) n_dense));
+ return(NULL);
+ }
+
+ /* Determine the heap number and status bits of the record. */
+ rec = recs[(val >> 1) - 1];
+
+ hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
+ hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
+
+ /* This may either be an old record that is being
+ overwritten (updated in place, or allocated from
+ the free list), or a new record, with the next
+		available heap_no. */
+ if (UNIV_UNLIKELY(hs > heap_status)) {
+ page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
+ (ulong) hs, (ulong) heap_status));
+ return(NULL);
+ } else if (hs == heap_status) {
+ /* A new record was allocated from the heap. */
+ if (UNIV_UNLIKELY(val & 1)) {
+ /* Only existing records may be cleared. */
+ page_zip_fail(("page_zip_apply_log:"
+ " attempting to create"
+ " deleted rec %lu\n",
+ (ulong) hs));
+ return(NULL);
+ }
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ }
+
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
+
+ if (val & 1) {
+ /* Clear the data bytes of the record. */
+ mem_heap_t* heap = NULL;
+ ulint* offs;
+ offs = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ memset(rec, 0, rec_offs_data_size(offs));
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ continue;
+ }
+
+#if REC_STATUS_NODE_PTR != TRUE
+# error "REC_STATUS_NODE_PTR != TRUE"
+#endif
+ rec_get_offsets_reverse(data, index,
+ hs & REC_STATUS_NODE_PTR,
+ offsets);
+ rec_offs_make_valid(rec, index, offsets);
+
+ /* Copy the extra bytes (backwards). */
+ {
+ byte* start = rec_get_start(rec, offsets);
+ byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+ while (b != start) {
+ *--b = *data++;
+ }
+ }
+
+ /* Copy the data bytes. */
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ /* Non-leaf nodes should not contain any
+ externally stored columns. */
+ if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "%lu&REC_STATUS_NODE_PTR\n",
+ (ulong) hs));
+ return(NULL);
+ }
+
+ data = page_zip_apply_log_ext(
+ rec, offsets, trx_id_col, data, end);
+
+ if (UNIV_UNLIKELY(!data)) {
+ return(NULL);
+ }
+ } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+ len = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+ /* Copy the data bytes, except node_ptr. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "node_ptr %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(rec, data, len);
+ data += len;
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ len = rec_offs_data_size(offsets);
+
+ /* Copy all data bytes of
+ a record in a secondary index. */
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "sec %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+
+ memcpy(rec, data, len);
+ data += len;
+ } else {
+ /* Skip DB_TRX_ID and DB_ROLL_PTR. */
+ ulint l = rec_get_nth_field_offs(offsets,
+ trx_id_col, &len);
+ byte* b;
+
+ if (UNIV_UNLIKELY(data + l >= end)
+ || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN))) {
+ page_zip_fail(("page_zip_apply_log: "
+ "trx_id %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) l,
+ (const void*) end));
+ return(NULL);
+ }
+
+ /* Copy any preceding data bytes. */
+ memcpy(rec, data, l);
+ data += l;
+
+ /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
+ b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ len = rec_get_end(rec, offsets) - b;
+ if (UNIV_UNLIKELY(data + len >= end)) {
+ page_zip_fail(("page_zip_apply_log: "
+ "clust %p+%lu >= %p\n",
+ (const void*) data,
+ (ulong) len,
+ (const void*) end));
+ return(NULL);
+ }
+ memcpy(b, data, len);
+ data += len;
+ }
+ }
+}
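+
+/* Worked example (hypothetical log bytes, not taken from the original
+source) of the entry header decoded above: a first byte of 0x00 ends the
+log.  A first byte of 0x05 gives val = 5, so the entry refers to
+recs[(5 >> 1) - 1] = recs[1], i.e. the record with
+heap_no == PAGE_HEAP_NO_USER_LOW + 1, and because bit 0 is set the
+record's data bytes are cleared (the record had been purged).  A first
+byte with the high bit set extends val to two bytes, e.g. 0x81 0x04
+decodes to val = (0x01 << 8) | 0x04 = 0x104. */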
+
+/**********************************************************************//**
+Decompress the records of a node pointer page.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_node_ptrs(
+/*==========================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ ulint heap_status = REC_STATUS_NODE_PTR
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+ const byte* storage;
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= n_dense
+ * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE);
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES
+ - d_stream->next_out;
+
+ ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR);
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ /* Apparently, n_dense has grown
+ since the time the page was last compressed. */
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
+ /* Prepare to decompress the data bytes. */
+ d_stream->next_out = rec;
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Decompress the data bytes, except node_ptr. */
+ d_stream->avail_out = rec_offs_data_size(offsets)
+ - REC_NODE_PTR_SIZE;
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ /* Clear the node pointer in case the record
+ will be deleted and the space will be reallocated
+ to a smaller record. */
+ memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
+ d_stream->next_out += REC_NODE_PTR_SIZE;
+
+ ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = page_header_get_field(page_zip->data,
+ PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out);
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY
+ (page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index), NULL)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress_node_ptrs:"
+ " %lu + %lu >= %lu, %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, dict_index_is_clust(index),
+ NULL),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip),
+ (ulong) dict_index_is_clust(index)));
+ return(FALSE);
+ }
+
+ /* Restore the uncompressed columns in heap_no order. */
+ storage = page_zip->data + page_zip_get_size(page_zip)
+ - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ storage -= REC_NODE_PTR_SIZE;
+
+ memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
+ storage, REC_NODE_PTR_SIZE);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a secondary index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_sec(
+/*====================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint* offsets) /*!< in/out: temporary offsets */
+{
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ ulint slot;
+
+ ut_a(!dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ /* Decompress everything up to this record. */
+ d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES
+ - d_stream->next_out;
+
+ if (UNIV_LIKELY(d_stream->avail_out)) {
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ /* Apparently, n_dense has grown
+ since the time the page was last compressed. */
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
+
+ /* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+ d_stream->next_out = rec;
+
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+ }
+
+ /* Decompress the data of the last record and any trailing garbage,
+ in case the last record was allocated from an originally longer space
+ on the free list. */
+ d_stream->avail_out = page_header_get_field(page_zip->data,
+ PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out);
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_sec:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_sec:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ ULINT_UNDEFINED, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE, NULL)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, FALSE, NULL),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
+ /* There are no uncompressed columns on leaf pages of
+ secondary indexes. */
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return TRUE on success */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t* rec, /*!< in/out: record */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col)	/*!< in: position of DB_TRX_ID */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ulint len;
+ byte* dst;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ /* Skip trx_id and roll_ptr */
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " len[%lu] = %lu\n",
+ (ulong) i, (ulong) len));
+ return(FALSE);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " DB_TRX_ID at %lu is ext\n",
+ (ulong) i));
+ return(FALSE);
+ }
+
+ d_stream->avail_out = dst - d_stream->next_out;
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+ ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ d_stream->avail_out = dst - d_stream->next_out;
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust_ext:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ return(FALSE);
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear the BLOB pointer in case
+ the record will be deleted and the
+ space will not be reused. Note that
+ the final initialization of the BLOB
+ pointers (copying from "externs"
+ or clearing) will have to take place
+ only after the page modification log
+ has been applied. Otherwise, we
+ could end up with an uninitialized
+ BLOB pointer when a record is deleted,
+ reallocated and deleted. */
+ memset(d_stream->next_out, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ d_stream->next_out
+ += BTR_EXTERN_FIELD_REF_SIZE;
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
+@return TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ z_stream* d_stream, /*!< in/out: compressed page stream */
+ rec_t** recs, /*!< in: dense page directory
+ sorted by address */
+ ulint n_dense, /*!< in: size of recs[] */
+ dict_index_t* index, /*!< in: the index of the page */
+ ulint trx_id_col, /*!< index of the trx_id column */
+ ulint* offsets, /*!< in/out: temporary offsets */
+ mem_heap_t* heap) /*!< in: temporary memory heap */
+{
+ int err;
+ ulint slot;
+ ulint heap_status = REC_STATUS_ORDINARY
+ | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+ const byte* storage;
+ const byte* externs;
+
+ ut_a(dict_index_is_clust(index));
+
+ /* Subtract the space reserved for uncompressed data. */
+ d_stream->avail_in -= n_dense * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Decompress the records in heap_no order. */
+ for (slot = 0; slot < n_dense; slot++) {
+ rec_t* rec = recs[slot];
+
+ d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES
+ - d_stream->next_out;
+
+ ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR);
+ err = inflate(d_stream, Z_SYNC_FLUSH);
+ switch (err) {
+ case Z_STREAM_END:
+ /* Apparently, n_dense has grown
+ since the time the page was last compressed. */
+ goto zlib_done;
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (UNIV_LIKELY(!d_stream->avail_out)) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
+ /* Prepare to decompress the data bytes. */
+ d_stream->next_out = rec;
+ /* Set heap_no and the status bits. */
+ mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+ heap_status += 1 << REC_HEAP_NO_SHIFT;
+
+ /* Read the offsets. The status bits are needed here. */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* This is a leaf page in a clustered index. */
+
+ /* Check if there are any externally stored columns.
+ For each externally stored column, restore the
+ BTR_EXTERN_FIELD_REF separately. */
+
+ if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_clust_ext(
+ d_stream, rec, offsets, trx_id_col))) {
+
+ goto zlib_error;
+ }
+ } else {
+ /* Skip trx_id and roll_ptr */
+ ulint len;
+ byte* dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " len = %lu\n", (ulong) len));
+ goto zlib_error;
+ }
+
+ d_stream->avail_out = dst - d_stream->next_out;
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+
+ ut_ad(d_stream->next_out == dst);
+
+ /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+ avoid uninitialized bytes in case the record
+ is affected by page_zip_apply_log(). */
+ memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ d_stream->next_out += DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN;
+ }
+
+ /* Decompress the last bytes of the record. */
+ d_stream->avail_out = rec_get_end(rec, offsets)
+ - d_stream->next_out;
+
+ switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+ case Z_STREAM_END:
+ case Z_OK:
+ case Z_BUF_ERROR:
+ if (!d_stream->avail_out) {
+ break;
+ }
+ /* fall through */
+ default:
+ page_zip_fail(("page_zip_decompress_clust:"
+ " 3 inflate(Z_SYNC_FLUSH)=%s\n",
+ d_stream->msg));
+ goto zlib_error;
+ }
+ }
+
+ /* Decompress any trailing garbage, in case the last record was
+ allocated from an originally longer space on the free list. */
+ d_stream->avail_out = page_header_get_field(page_zip->data,
+ PAGE_HEAP_TOP)
+ - page_offset(d_stream->next_out);
+ if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+ - PAGE_ZIP_START - PAGE_DIR)) {
+
+ page_zip_fail(("page_zip_decompress_clust:"
+ " avail_out = %u\n",
+ d_stream->avail_out));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " inflate(Z_FINISH)=%s\n",
+ d_stream->msg));
+zlib_error:
+ inflateEnd(d_stream);
+ return(FALSE);
+ }
+
+ /* Note that d_stream->avail_out > 0 may hold here
+ if the modification log is nonempty. */
+
+zlib_done:
+ if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+ ut_error;
+ }
+
+ {
+ page_t* page = page_align(d_stream->next_out);
+
+ /* Clear the unused heap space on the uncompressed page. */
+ memset(d_stream->next_out, 0,
+ page_dir_get_nth_slot(page,
+ page_dir_get_n_slots(page) - 1)
+ - d_stream->next_out);
+ }
+
+#ifdef UNIV_DEBUG
+ page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+ /* Apply the modification log. */
+ {
+ const byte* mod_log_ptr;
+ mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+ d_stream->avail_in + 1,
+ recs, n_dense,
+ trx_id_col, heap_status,
+ index, offsets);
+
+ if (UNIV_UNLIKELY(!mod_log_ptr)) {
+ return(FALSE);
+ }
+ page_zip->m_end = mod_log_ptr - page_zip->data;
+ page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+ }
+
+ if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE, NULL)
+ + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+ page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
+ (ulong) page_zip_get_trailer_len(
+ page_zip, TRUE, NULL),
+ (ulong) page_zip->m_end,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
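+	/* Layout of the uncompressed trailer, growing downwards from
+	the end of page_zip->data: the dense page directory (n_dense
+	slots), then DB_TRX_ID and DB_ROLL_PTR for each record, then
+	the BLOB pointers of records that contain externally stored
+	columns. */
+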
+ storage = page_zip->data + page_zip_get_size(page_zip)
+ - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ externs = storage - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Restore the uncompressed columns in heap_no order. */
+
+ for (slot = 0; slot < n_dense; slot++) {
+ ulint i;
+ ulint len;
+ byte* dst;
+ rec_t* rec = recs[slot];
+ ibool exists = !page_zip_dir_find_free(
+ page_zip, page_offset(rec));
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ dst = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+ memcpy(dst, storage,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ /* Check if there are any externally stored
+ columns in this record. For each externally
+ stored column, restore or clear the
+ BTR_EXTERN_FIELD_REF. */
+ if (!rec_offs_any_extern(offsets)) {
+ continue;
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ if (!rec_offs_nth_extern(offsets, i)) {
+ continue;
+ }
+ dst = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
+ page_zip_fail(("page_zip_decompress_clust:"
+ " %lu < 20\n",
+ (ulong) len));
+ return(FALSE);
+ }
+
+ dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_LIKELY(exists)) {
+ /* Existing record:
+ restore the BLOB pointer */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (UNIV_UNLIKELY
+ (externs < page_zip->data
+ + page_zip->m_end)) {
+ page_zip_fail(("page_zip_"
+ "decompress_clust: "
+ "%p < %p + %lu\n",
+ (const void*) externs,
+ (const void*)
+ page_zip->data,
+ (ulong)
+ page_zip->m_end));
+ return(FALSE);
+ }
+
+ memcpy(dst, externs,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ page_zip->n_blobs++;
+ } else {
+ /* Deleted record:
+ clear the BLOB pointer */
+ memset(dst, 0,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page. This function should tolerate errors on the compressed
+page. Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in: data, ssize;
+ out: m_start, m_end, m_nonempty, n_blobs */
+ page_t* page, /*!< out: uncompressed page, may be trashed */
+ ibool all) /*!< in: TRUE=decompress the whole page;
+ FALSE=verify but do not copy some
+ page header fields that should not change
+ after page creation */
+{
+ z_stream d_stream;
+ dict_index_t* index = NULL;
+ rec_t** recs; /*!< dense page directory, sorted by address */
+ ulint n_dense;/* number of user records on the page */
+ ulint trx_id_col = ULINT_UNDEFINED;
+ mem_heap_t* heap;
+ ulint* offsets;
+#ifndef UNIV_HOTBACKUP
+ ullint usec = ut_time_us(NULL);
+#endif /* !UNIV_HOTBACKUP */
+
+ ut_ad(page_zip_simple_validate(page_zip));
+ UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ /* The dense directory excludes the infimum and supremum records. */
+ n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
+ if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+ >= page_zip_get_size(page_zip))) {
+ page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
+ (ulong) n_dense,
+ (ulong) page_zip_get_size(page_zip)));
+ return(FALSE);
+ }
+
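+	/* The heap serves the two arrays of n_dense record pointers
+	allocated below and the pre-allocated offsets array; the extra
+	UNIV_PAGE_SIZE leaves slack so that later allocations normally
+	fit in the initial block. */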
+ heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE);
+ recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs));
+
+ if (all) {
+ /* Copy the page header. */
+ memcpy(page, page_zip->data, PAGE_DATA);
+ } else {
+ /* Check that the bytes that we skip are identical. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(FIL_PAGE_TYPE + page,
+ FIL_PAGE_TYPE + page_zip->data,
+ PAGE_HEADER - FIL_PAGE_TYPE));
+ ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
+ PAGE_HEADER + PAGE_LEVEL + page_zip->data,
+ PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+ /* Copy the mutable parts of the page header. */
+ memcpy(page, page_zip->data, FIL_PAGE_TYPE);
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + page_zip->data,
+ PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ /* Check that the page headers match after copying. */
+ ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ /* Clear the uncompressed page, except the header. */
+ memset(PAGE_DATA + page, 0x55, UNIV_PAGE_SIZE - PAGE_DATA);
+#endif /* UNIV_ZIP_DEBUG */
+ UNIV_MEM_INVALID(PAGE_DATA + page, UNIV_PAGE_SIZE - PAGE_DATA);
+
+ /* Copy the page directory. */
+ if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
+ recs + n_dense, n_dense))) {
+zlib_error:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ /* Copy the infimum and supremum records. */
+ memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+ infimum_extra, sizeof infimum_extra);
+ if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ PAGE_NEW_SUPREMUM);
+ } else {
+ rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+ page_zip_dir_get(page_zip, 0)
+ & PAGE_ZIP_DIR_SLOT_MASK);
+ }
+ memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
+ memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+ supremum_extra_data, sizeof supremum_extra_data);
+
+ page_zip_set_alloc(&d_stream, heap);
+
+ if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
+ != Z_OK)) {
+ ut_error;
+ }
+
+ d_stream.next_in = page_zip->data + PAGE_DATA;
+ /* Subtract the space reserved for
+ the page header and the end marker of the modification log. */
+ d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1);
+
+ d_stream.next_out = page + PAGE_ZIP_START;
+ d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START;
+
+ /* Decode the zlib header and the index information. */
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
+ if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+ page_zip_fail(("page_zip_decompress:"
+ " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+ goto zlib_error;
+ }
+
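+	/* inflate(Z_BLOCK) stops at a block boundary: the first call
+	above consumed the zlib stream header, the second the block
+	carrying the encoded index field information, which is now
+	available in [page + PAGE_ZIP_START, d_stream.next_out). */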
+ index = page_zip_fields_decode(
+ page + PAGE_ZIP_START, d_stream.next_out,
+ page_is_leaf(page) ? &trx_id_col : NULL);
+
+ if (UNIV_UNLIKELY(!index)) {
+
+ goto zlib_error;
+ }
+
+ /* Decompress the user records. */
+ page_zip->n_blobs = 0;
+ d_stream.next_out = page + PAGE_ZIP_START;
+
+ {
+ /* Pre-allocate the offsets for rec_get_offsets_reverse(). */
+ ulint n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ offsets = mem_heap_alloc(heap, n * sizeof(ulint));
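+		/* offsets[0] records the allocated size, as expected
+		by rec_get_offsets() and rec_get_offsets_reverse(). */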
+ *offsets = n;
+ }
+
+ /* Decompress the records in heap_no order. */
+ if (!page_is_leaf(page)) {
+ /* This is a node pointer page. */
+ ulint info_bits;
+
+ if (UNIV_UNLIKELY
+ (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
+ recs, n_dense, index,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL
+ ? REC_INFO_MIN_REC_FLAG : 0;
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
+ info_bits))) {
+ goto err_exit;
+ }
+ } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+ /* This is a leaf page in a secondary index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
+ recs, n_dense,
+ index, offsets))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+err_exit:
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+ } else {
+ /* This is a leaf page in a clustered index. */
+ if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
+ &d_stream, recs,
+ n_dense, index,
+ trx_id_col,
+ offsets, heap))) {
+ goto err_exit;
+ }
+
+ if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+ page, 0))) {
+ goto err_exit;
+ }
+ }
+
+ ut_a(page_is_comp(page));
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+
+ page_zip_fields_free(index);
+ mem_heap_free(heap);
+#ifndef UNIV_HOTBACKUP
+ {
+ page_zip_stat_t* zip_stat
+ = &page_zip_stat[page_zip->ssize - 1];
+ zip_stat->decompressed++;
+ zip_stat->decompressed_usec += ut_time_us(NULL) - usec;
+ }
+#endif /* !UNIV_HOTBACKUP */
+
+ /* Update the stat counter for LRU policy. */
+ buf_LRU_stat_inc_unzip();
+
+ return(TRUE);
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Dump a block of memory on the standard error stream. */
+static
+void
+page_zip_hexdump_func(
+/*==================*/
+ const char* name, /*!< in: name of the data structure */
+ const void* buf, /*!< in: data */
+ ulint size) /*!< in: length of the data, in bytes */
+{
+ const byte* s = buf;
+ ulint addr;
+ const ulint width = 32; /* bytes per line */
+
+ fprintf(stderr, "%s:\n", name);
+
+ for (addr = 0; addr < size; addr += width) {
+ ulint i;
+
+ fprintf(stderr, "%04lx ", (ulong) addr);
+
+ i = ut_min(width, size - addr);
+
+ while (i--) {
+ fprintf(stderr, "%02x", *s++);
+ }
+
+ putc('\n', stderr);
+ }
+}
+
+/** Dump a block of memory on the standard error stream.
+@param buf in: data
+@param size in: length of the data, in bytes */
+#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
+
+/** Flag: make page_zip_validate() compare page headers only */
+UNIV_INTERN ibool page_zip_validate_header_only = FALSE;
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page, /*!< in: uncompressed page */
+ ibool sloppy) /*!< in: FALSE=strict,
+ TRUE=ignore the MIN_REC_FLAG */
+{
+ page_zip_des_t temp_page_zip;
+ byte* temp_page_buf;
+ page_t* temp_page;
+ ibool valid;
+
+ if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV)
+ || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
+ || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA)) {
+ page_zip_fail(("page_zip_validate: page header\n"));
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, UNIV_PAGE_SIZE);
+ return(FALSE);
+ }
+
+ ut_a(page_is_comp(page));
+
+ if (page_zip_validate_header_only) {
+ return(TRUE);
+ }
+
+ /* page_zip_decompress() expects the uncompressed page to be
+ UNIV_PAGE_SIZE aligned. */
+ temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+ temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE);
+
+#ifdef UNIV_DEBUG_VALGRIND
+ /* Get detailed information on the valid bits in case the
+ UNIV_MEM_ASSERT_RW() checks fail. The v-bits of page[],
+ page_zip->data[] or page_zip could be viewed at temp_page[] or
+ temp_page_zip in a debugger when running valgrind --db-attach. */
+ VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+# if UNIV_WORD_SIZE == 4
+ VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip);
+ /* On 32-bit systems, there is no padding in page_zip_des_t.
+ On other systems, Valgrind could complain about uninitialized
+ pad bytes. */
+ UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip);
+# endif
+ VALGRIND_GET_VBITS(page_zip->data, temp_page,
+ page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ temp_page_zip = *page_zip;
+ valid = page_zip_decompress(&temp_page_zip, temp_page, TRUE);
+ if (!valid) {
+ fputs("page_zip_validate(): failed to decompress\n", stderr);
+ goto func_exit;
+ }
+ if (page_zip->n_blobs != temp_page_zip.n_blobs) {
+ page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
+ page_zip->n_blobs, temp_page_zip.n_blobs));
+ valid = FALSE;
+ }
+#ifdef UNIV_DEBUG
+ if (page_zip->m_start != temp_page_zip.m_start) {
+ page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
+ page_zip->m_start, temp_page_zip.m_start));
+ valid = FALSE;
+ }
+#endif /* UNIV_DEBUG */
+ if (page_zip->m_end != temp_page_zip.m_end) {
+ page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
+ page_zip->m_end, temp_page_zip.m_end));
+ valid = FALSE;
+ }
+ if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
+ page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
+ page_zip->m_nonempty,
+ temp_page_zip.m_nonempty));
+ valid = FALSE;
+ }
+ if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
+ UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) {
+
+ /* In crash recovery, the "minimum record" flag may be
+ set incorrectly until the mini-transaction is
+ committed. Let us tolerate that difference when we
+ are performing a sloppy validation. */
+
+ if (sloppy) {
+ byte info_bits_diff;
+ ulint offset
+ = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE);
+ ut_a(offset >= PAGE_NEW_SUPREMUM);
+ offset -= 5 /* REC_NEW_INFO_BITS */;
+
+ info_bits_diff = page[offset] ^ temp_page[offset];
+
+ if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
+ temp_page[offset] = page[offset];
+
+ if (!memcmp(page + PAGE_HEADER,
+ temp_page + PAGE_HEADER,
+ UNIV_PAGE_SIZE - PAGE_HEADER
+ - FIL_PAGE_DATA_END)) {
+
+ /* Only the minimum record flag
+ differed. Let us ignore it. */
+ page_zip_fail(("page_zip_validate: "
+ "min_rec_flag "
+ "(ignored, "
+ "%lu,%lu,0x%02lx)\n",
+ page_get_space_id(page),
+ page_get_page_no(page),
+ (ulong) page[offset]));
+ goto func_exit;
+ }
+ }
+ }
+ page_zip_fail(("page_zip_validate: content\n"));
+ valid = FALSE;
+ }
+
+func_exit:
+ if (!valid) {
+ page_zip_hexdump(page_zip, sizeof *page_zip);
+ page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+ page_zip_hexdump(page, UNIV_PAGE_SIZE);
+ page_zip_hexdump(temp_page, UNIV_PAGE_SIZE);
+ }
+ ut_free(temp_page_buf);
+ return(valid);
+}
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const page_t* page) /*!< in: uncompressed page */
+{
+ return(page_zip_validate_low(page_zip, page,
+ recv_recovery_is_on()));
+}
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Assert that the compressed and decompressed page headers match.
+@return TRUE */
+static
+ibool
+page_zip_header_cmp(
+/*================*/
+ const page_zip_des_t* page_zip,/*!< in: compressed page */
+ const byte* page) /*!< in: uncompressed page */
+{
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+ FIL_PAGE_LSN - FIL_PAGE_PREV));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+ 2));
+ ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+ PAGE_DATA - FIL_PAGE_DATA));
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Write a record on the compressed page that contains externally stored
+columns. The data must already have been written to the uncompressed page.
+@return end of modification log */
+static
+byte*
+page_zip_write_rec_ext(
+/*===================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ const page_t* page, /*!< in: page containing rec */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: record descriptor */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ ulint create, /*!< in: nonzero=insert, zero=update */
+ ulint trx_id_col, /*!< in: position of DB_TRX_ID */
+ ulint heap_no, /*!< in: heap number of rec */
+ byte* storage, /*!< in: end of dense page directory */
+ byte* data) /*!< in: end of modification log */
+{
+ const byte* start = rec;
+ ulint i;
+ ulint len;
+ byte* externs = storage;
+ ulint n_ext = rec_offs_n_extern(offsets);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
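+	/* Skip the uncompressed DB_TRX_ID and DB_ROLL_PTR columns that
+	are stored below the dense directory; the BLOB pointer array
+	starts immediately below them and grows downwards. */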
+ externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
+
+ /* Note that this will not take into account
+ the BLOB columns of rec if create==TRUE. */
+ ut_ad(data + rec_offs_data_size(offsets)
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ - n_ext * BTR_EXTERN_FIELD_REF_SIZE
+ < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs);
+
+ {
+ ulint blob_no = page_zip_get_n_prev_extern(
+ page_zip, rec, index);
+ byte* ext_end = externs - page_zip->n_blobs
+ * BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(blob_no <= page_zip->n_blobs);
+ externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (create) {
+ page_zip->n_blobs += n_ext;
+ ASSERT_ZERO_BLOB(ext_end - n_ext
+ * BTR_EXTERN_FIELD_REF_SIZE);
+ memmove(ext_end - n_ext
+ * BTR_EXTERN_FIELD_REF_SIZE,
+ ext_end,
+ externs - ext_end);
+ }
+
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+ }
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* src;
+
+ if (UNIV_UNLIKELY(i == trx_id_col)) {
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i));
+ ut_ad(!rec_offs_nth_extern(offsets,
+ i + 1));
+ /* Locate trx_id and roll_ptr. */
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ i + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, src - start);
+ data += src - start;
+ start = src + (DATA_TRX_ID_LEN
+ + DATA_ROLL_PTR_LEN);
+
+ /* Store trx_id and roll_ptr. */
+ memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (heap_no - 1),
+ src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ i++; /* skip also roll_ptr */
+ } else if (rec_offs_nth_extern(offsets, i)) {
+ src = rec_get_nth_field(rec, offsets,
+ i, &len);
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(len
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ ASSERT_ZERO(data, src - start);
+ memcpy(data, start, src - start);
+ data += src - start;
+ start = src + BTR_EXTERN_FIELD_REF_SIZE;
+
+ /* Store the BLOB pointer. */
+ externs -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_ad(data < externs);
+ memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets) - (start - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, start, len);
+ data += len;
+
+ return(data);
+}
+
+/**********************************************************************//**
+Write an entire record on the compressed page. The data must already
+have been written to the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record being written */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint create) /*!< in: nonzero=insert, zero=update */
+{
+ const page_t* page;
+ byte* data;
+ byte* storage;
+ ulint heap_no;
+ byte* slot;
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+
+ page = page_align(rec);
+
+ ut_ad(page_zip_header_cmp(page_zip, page));
+ ut_ad(page_simple_validate_new((page_t*) page));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ /* Copy the delete mark. */
+ if (rec_get_deleted_flag(rec, TRUE)) {
+ *slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+ } else {
+ *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ }
+
+ ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
+ ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE
+ - PAGE_DIR - PAGE_DIR_SLOT_SIZE
+ * page_dir_get_n_slots(page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
+ ut_ad(heap_no < page_dir_get_n_heap(page));
+
+ /* Append to the modification log. */
+ data = page_zip->data + page_zip->m_end;
+ ut_ad(!*data);
+
+ /* Identify the record by writing its heap number - 1.
+ 0 is reserved to indicate the end of the modification log. */
+
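+	/* Heap numbers with heap_no - 1 >= 64 take two bytes; the high
+	bit of the first byte marks the two-byte form.  The value is
+	shifted left by one because the least significant bit of the
+	last byte distinguishes a record write (0, this function) from
+	a record clear (1, page_zip_clear_rec()). */
+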
+ if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+ *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+ ut_ad(!*data);
+ }
+ *data++ = (byte) ((heap_no - 1) << 1);
+ ut_ad(!*data);
+
+ {
+ const byte* start = rec - rec_offs_extra_size(offsets);
+ const byte* b = rec - REC_N_NEW_EXTRA_BYTES;
+
+ /* Write the extra bytes backwards, so that
+ rec_offs_extra_size() can be easily computed in
+ page_zip_apply_log() by invoking
+ rec_get_offsets_reverse(). */
+
+ while (b != start) {
+ *data++ = *--b;
+ ut_ad(!*data);
+ }
+ }
+
+ /* Write the data bytes. Store the uncompressed bytes separately. */
+ storage = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_DIR_SLOT_SIZE;
+
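+	/* "storage" now points just below the dense page directory.
+	The uncompressed columns are kept there: node pointers on
+	non-leaf pages, or DB_TRX_ID and DB_ROLL_PTR (and, below them,
+	the BLOB pointers) on clustered index leaf pages. */
+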
+ if (page_is_leaf(page)) {
+ ulint len;
+
+ if (dict_index_is_clust(index)) {
+ ulint trx_id_col;
+
+ trx_id_col = dict_index_get_sys_col_pos(index,
+ DATA_TRX_ID);
+ ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+			/* Store trx_id, roll_ptr and the
+			BTR_EXTERN_FIELD_REF of each BLOB column
+			separately. */
+ if (rec_offs_any_extern(offsets)) {
+ data = page_zip_write_rec_ext(
+ page_zip, page,
+ rec, index, offsets, create,
+ trx_id_col, heap_no, storage, data);
+ } else {
+ /* Locate trx_id and roll_ptr. */
+ const byte* src
+ = rec_get_nth_field(rec, offsets,
+ trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(src + DATA_TRX_ID_LEN
+ == rec_get_nth_field(
+ rec, offsets,
+ trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+
+ /* Log the preceding fields. */
+ ASSERT_ZERO(data, src - rec);
+ memcpy(data, rec, src - rec);
+ data += src - rec;
+
+ /* Store trx_id and roll_ptr. */
+ memcpy(storage
+ - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+ * (heap_no - 1),
+ src,
+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+ /* Log the last bytes of the record. */
+ len = rec_offs_data_size(offsets)
+ - (src - rec);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, src, len);
+ data += len;
+ }
+ } else {
+ /* Leaf page of a secondary index:
+ no externally stored columns */
+ ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID)
+ == ULINT_UNDEFINED);
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Log the entire record. */
+ len = rec_offs_data_size(offsets);
+
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+ }
+ } else {
+ /* This is a node pointer page. */
+ ulint len;
+
+ /* Non-leaf nodes should not have any externally
+ stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ /* Copy the data bytes, except node_ptr. */
+ len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+ ut_ad(data + len < storage - REC_NODE_PTR_SIZE
+ * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
+ ASSERT_ZERO(data, len);
+ memcpy(data, rec, len);
+ data += len;
+
+ /* Copy the node pointer to the uncompressed area. */
+ memcpy(storage - REC_NODE_PTR_SIZE
+ * (heap_no - 1),
+ rec + len,
+ REC_NODE_PTR_SIZE);
+ }
+
+ ut_a(!*data);
+ ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
+ page_zip->m_end = data - page_zip->data;
+ page_zip->m_nonempty = TRUE;
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec)));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page */
+{
+ ulint offset;
+ ulint z_offset;
+
+ ut_ad(!page == !page_zip);
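+	/* Either both page and page_zip are supplied, or neither
+	(when the log record is only being parsed, not applied). */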
+
+ if (UNIV_UNLIKELY
+ (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ z_offset = mach_read_from_2(ptr + 2);
+
+ if (UNIV_UNLIKELY(offset < PAGE_ZIP_START)
+ || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+ || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_UNLIKELY(!page_zip)
+ || UNIV_UNLIKELY(!page_is_leaf(page))) {
+
+ goto corrupt;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ memcpy(page + offset,
+ ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
+ memcpy(page_zip->data + z_offset,
+ ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE));
+}
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in/out: record whose data is being
+ written */
+ dict_index_t* index, /*!< in: index of the page */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint n, /*!< in: column index */
+ mtr_t* mtr) /*!< in: mini-transaction handle,
+ or NULL if no logging is needed */
+{
+ const byte* field;
+ byte* externs;
+ const page_t* page = page_align(rec);
+ ulint blob_no;
+ ulint len;
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_simple_validate_new((page_t*) page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_comp(offsets));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_any_extern(offsets));
+ ut_ad(rec_offs_nth_extern(offsets, n));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+ ut_ad(dict_index_is_clust(index));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
+ + rec_get_n_extern_new(rec, index, n);
+ ut_a(blob_no < page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ field = rec_get_nth_field(rec, offsets, n, &len);
+
+ externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
+ field += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+ memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ byte* log_ptr = mlog_open(
+ mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ (byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(field));
+ log_ptr += 2;
+ mach_write_to_2(log_ptr, externs - page_zip->data);
+ log_ptr += 2;
+ memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE);
+ log_ptr += BTR_EXTERN_FIELD_REF_SIZE;
+ mlog_close(mtr, log_ptr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page */
+{
+ ulint offset;
+ ulint z_offset;
+
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) {
+
+ return(NULL);
+ }
+
+ offset = mach_read_from_2(ptr);
+ z_offset = mach_read_from_2(ptr + 2);
+
+ if (UNIV_UNLIKELY(offset < PAGE_ZIP_START)
+ || UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+ || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (page) {
+ byte* storage_end;
+ byte* field;
+ byte* storage;
+ ulint heap_no;
+
+ if (UNIV_UNLIKELY(!page_zip)
+ || UNIV_UNLIKELY(page_is_leaf(page))) {
+
+ goto corrupt;
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ field = page + offset;
+ storage = page_zip->data + z_offset;
+
+ storage_end = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_DIR_SLOT_SIZE;
+
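+		/* Each record's node pointer is stored at
+		storage_end - (heap_no - 1) * REC_NODE_PTR_SIZE, so the
+		heap number can be recovered from the z_offset. */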
+ heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE;
+
+ if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE)
+ || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW)
+ || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) {
+
+ goto corrupt;
+ }
+
+ memcpy(field, ptr + 4, REC_NODE_PTR_SIZE);
+ memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(ptr + (2 + 2 + REC_NODE_PTR_SIZE));
+}
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ ulint size, /*!< in: data size of rec */
+ ulint ptr, /*!< in: node pointer */
+ mtr_t* mtr) /*!< in: mini-transaction, or NULL */
+{
+ byte* field;
+ byte* storage;
+ page_t* page = page_align(rec);
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(page_rec_is_comp(rec));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(!page_is_leaf(page));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, size);
+
+ storage = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_DIR_SLOT_SIZE
+ - (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
+ field = rec + size - REC_NODE_PTR_SIZE;
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#if REC_NODE_PTR_SIZE != 4
+# error "REC_NODE_PTR_SIZE != 4"
+#endif
+ mach_write_to_4(field, ptr);
+ memcpy(storage, field, REC_NODE_PTR_SIZE);
+
+ if (mtr) {
+#ifndef UNIV_HOTBACKUP
+ byte* log_ptr = mlog_open(mtr,
+ 11 + 2 + 2 + REC_NODE_PTR_SIZE);
+ if (UNIV_UNLIKELY(!log_ptr)) {
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr);
+ mach_write_to_2(log_ptr, page_offset(field));
+ log_ptr += 2;
+ mach_write_to_2(log_ptr, storage - page_zip->data);
+ log_ptr += 2;
+ memcpy(log_ptr, field, REC_NODE_PTR_SIZE);
+ log_ptr += REC_NODE_PTR_SIZE;
+ mlog_close(mtr, log_ptr);
+#endif /* !UNIV_HOTBACKUP */
+ }
+}
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in/out: record */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint trx_id_col,/*!< in: column number of TRX_ID in rec */
+ trx_id_t trx_id, /*!< in: transaction identifier */
+ roll_ptr_t roll_ptr)/*!< in: roll_ptr */
+{
+ byte* field;
+ byte* storage;
+ page_t* page = page_align(rec);
+ ulint len;
+
+ ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+ ut_ad(page_simple_validate_new(page));
+ ut_ad(page_zip_simple_validate(page_zip));
+ ut_ad(page_zip_get_size(page_zip)
+ > PAGE_DATA + page_zip_dir_size(page_zip));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ ut_ad(page_zip->m_start >= PAGE_DATA);
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ ut_ad(page_is_leaf(page));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ storage = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_DIR_SLOT_SIZE
+ - (rec_get_heap_no_new(rec) - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+ ut_ad(field + DATA_TRX_ID_LEN
+ == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
+ ut_ad(len == DATA_ROLL_PTR_LEN);
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+ ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+ mach_write_to_6(field, trx_id);
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+ mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
+ memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/** Set this variable in a debugger to disable page_zip_clear_rec().
+The only observable effect should be a worse compression ratio, because
+deleted records are no longer zeroed out.  In rare cases, there can be
+page_zip_validate() failures on the node_ptr, trx_id and roll_ptr
+columns if the space is reallocated for a smaller record. */
+UNIV_INTERN ibool page_zip_clear_rec_disable;
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Clear an area on the uncompressed and compressed page, if possible. */
+static
+void
+page_zip_clear_rec(
+/*===============*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: record to clear */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint heap_no;
+ page_t* page = page_align(rec);
+ /* page_zip_validate() would fail here if a record
+ containing externally stored columns is being deleted. */
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
+ ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
+ ut_ad(page_zip_header_cmp(page_zip, page));
+
+ heap_no = rec_get_heap_no_new(rec);
+ ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
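+	/* Clearing is only possible if the log entry fits in the
+	modification log: one or two bytes for the heap number; the
+	boolean expression (heap_no - 1) >= 64 accounts for the
+	second byte. */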
+ if (
+#ifdef UNIV_ZIP_DEBUG
+ !page_zip_clear_rec_disable &&
+#endif /* UNIV_ZIP_DEBUG */
+ page_zip->m_end
+ + 1 + ((heap_no - 1) >= 64)/* size of the log entry */
+ + page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index), NULL)
+ < page_zip_get_size(page_zip)) {
+ byte* data;
+
+ /* Clear only the data bytes, because the allocator and
+ the decompressor depend on the extra bytes. */
+ memset(rec, 0, rec_offs_data_size(offsets));
+
+ if (!page_is_leaf(page)) {
+ /* Clear node_ptr on the compressed page. */
+ byte* storage = page_zip->data
+ + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page)
+ - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE,
+ 0, REC_NODE_PTR_SIZE);
+ } else if (dict_index_is_clust(index)) {
+ /* Clear trx_id and roll_ptr on the compressed page. */
+ byte* storage = page_zip->data
+ + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page)
+ - PAGE_HEAP_NO_USER_LOW)
+ * PAGE_ZIP_DIR_SLOT_SIZE;
+
+ memset(storage - (heap_no - 1)
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ }
+
+ /* Log that the data was zeroed out. */
+ data = page_zip->data + page_zip->m_end;
+ ut_ad(!*data);
+ if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+ *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+ ut_ad(!*data);
+ }
+ *data++ = (byte) ((heap_no - 1) << 1 | 1);
+ ut_ad(!*data);
+ ut_ad((ulint) (data - page_zip->data)
+ < page_zip_get_size(page_zip));
+ page_zip->m_end = data - page_zip->data;
+ page_zip->m_nonempty = TRUE;
+ } else if (page_is_leaf(page) && dict_index_is_clust(index)) {
+ /* Do not clear the record, because there is not enough space
+ to log the operation. */
+
+ if (rec_offs_any_extern(offsets)) {
+ ulint i;
+
+ for (i = rec_offs_n_fields(offsets); i--; ) {
+ /* Clear all BLOB pointers in order to make
+ page_zip_validate() pass. */
+ if (rec_offs_nth_extern(offsets, i)) {
+ ulint len;
+ byte* field = rec_get_nth_field(
+ rec, offsets, i, &len);
+ memset(field + len
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ 0, BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page. The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the deleted flag (nonzero=TRUE) */
+{
+ byte* slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ if (flag) {
+ *slot |= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ } else {
+ *slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page_align(rec)));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page. The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* rec, /*!< in: record on the uncompressed page */
+ ulint flag) /*!< in: the owned flag (nonzero=TRUE) */
+{
+ byte* slot = page_zip_dir_find(page_zip, page_offset(rec));
+ ut_a(slot);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ if (flag) {
+ *slot |= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ } else {
+ *slot &= ~(PAGE_ZIP_DIR_SLOT_OWNED >> 8);
+ }
+}
+
+/**********************************************************************//**
+Insert a record into the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ const byte* prev_rec,/*!< in: record after which to insert */
+ const byte* free_rec,/*!< in: record from which rec was
+ allocated, or NULL */
+ byte* rec) /*!< in: record to insert */
+{
+ ulint n_dense;
+ byte* slot_rec;
+ byte* slot_free;
+
+ ut_ad(prev_rec != rec);
+ ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec);
+ ut_ad(page_zip_simple_validate(page_zip));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ if (page_rec_is_infimum(prev_rec)) {
+ /* Use the first slot. */
+ slot_rec = page_zip->data + page_zip_get_size(page_zip);
+ } else {
+ byte* end = page_zip->data + page_zip_get_size(page_zip);
+ byte* start = end - page_zip_dir_user_size(page_zip);
+
+ if (UNIV_LIKELY(!free_rec)) {
+ /* PAGE_N_RECS was already incremented
+ in page_cur_insert_rec_zip(), but the
+ dense directory slot at that position
+ contains garbage. Skip it. */
+ start += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ slot_rec = page_zip_dir_find_low(start, end,
+ page_offset(prev_rec));
+ ut_a(slot_rec);
+ }
+
+ /* Read the old n_dense (n_heap may have been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1);
+
+ if (UNIV_LIKELY_NULL(free_rec)) {
+ /* The record was allocated from the free list.
+ Shift the dense directory only up to that slot.
+ Note that in this case, n_dense is actually
+ off by one, because page_cur_insert_rec_zip()
+ did not increment n_heap. */
+ ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+ + PAGE_HEAP_NO_USER_LOW);
+ ut_ad(rec >= free_rec);
+ slot_free = page_zip_dir_find(page_zip, page_offset(free_rec));
+ ut_ad(slot_free);
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ } else {
+ /* The record was allocated from the heap.
+ Shift the entire dense directory. */
+ ut_ad(rec_get_heap_no_new(rec) == n_dense
+ + PAGE_HEAP_NO_USER_LOW);
+
+ /* Shift to the end of the dense page directory. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+ }
+
+	/* Shift the dense directory to make room for rec. */
+ memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+ slot_rec - slot_free);
+
+ /* Write the entry for the inserted record.
+ The "owned" and "deleted" flags must be zero. */
+ mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
+}
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+ page_zip_des_t* page_zip,/*!< in/out: compressed page */
+ byte* rec, /*!< in: record to delete */
+ dict_index_t* index, /*!< in: index of rec */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec) */
+ const byte* free) /*!< in: previous start of the free list */
+{
+ byte* slot_rec;
+ byte* slot_free;
+ ulint n_ext;
+ page_t* page = page_align(rec);
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(rec_offs_comp(offsets));
+
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+ UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+ rec_offs_extra_size(offsets));
+
+ slot_rec = page_zip_dir_find(page_zip, page_offset(rec));
+
+ ut_a(slot_rec);
+
+	/* This could not be done before page_zip_dir_find() above,
+	because the size of the searched directory depends on
+	PAGE_N_RECS. */
+ page_header_set_field(page, page_zip, PAGE_N_RECS,
+ (ulint)(page_get_n_recs(page) - 1));
+
+ if (UNIV_UNLIKELY(!free)) {
+ /* Make the last slot the start of the free list. */
+ slot_free = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE
+ * (page_dir_get_n_heap(page_zip->data)
+ - PAGE_HEAP_NO_USER_LOW);
+ } else {
+ slot_free = page_zip_dir_find_free(page_zip,
+ page_offset(free));
+ ut_a(slot_free < slot_rec);
+ /* Grow the free list by one slot by moving the start. */
+ slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+ }
+
+ if (UNIV_LIKELY(slot_rec > slot_free)) {
+ memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE,
+ slot_free,
+ slot_rec - slot_free);
+ }
+
+ /* Write the entry for the deleted record.
+ The "owned" and "deleted" flags will be cleared. */
+ mach_write_to_2(slot_free, page_offset(rec));
+
+ if (!page_is_leaf(page) || !dict_index_is_clust(index)) {
+ ut_ad(!rec_offs_any_extern(offsets));
+ goto skip_blobs;
+ }
+
+ n_ext = rec_offs_n_extern(offsets);
+ if (UNIV_UNLIKELY(n_ext)) {
+ /* Shift and zero fill the array of BLOB pointers. */
+ ulint blob_no;
+ byte* externs;
+ byte* ext_end;
+
+ blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+ ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
+ externs = page_zip->data + page_zip_get_size(page_zip)
+ - (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+ * (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ ext_end = externs - page_zip->n_blobs
+ * BTR_EXTERN_FIELD_REF_SIZE;
+ externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+
+ page_zip->n_blobs -= n_ext;
+ /* Shift and zero fill the array. */
+ memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end,
+ (page_zip->n_blobs - blob_no)
+ * BTR_EXTERN_FIELD_REF_SIZE);
+ memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE);
+ }
+
+skip_blobs:
+ /* The compression algorithm expects info_bits and n_owned
+ to be 0 for deleted records. */
+ rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+ page_zip_clear_rec(page_zip, rec, index, offsets);
+}
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+ page_zip_des_t* page_zip, /*!< in/out: compressed page */
+ ulint is_clustered) /*!< in: nonzero for clustered index,
+ zero for others */
+{
+ ulint n_dense;
+ byte* dir;
+ byte* stored;
+
+ ut_ad(page_is_comp(page_zip->data));
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ /* Read the old n_dense (n_heap has already been incremented). */
+ n_dense = page_dir_get_n_heap(page_zip->data)
+ - (PAGE_HEAP_NO_USER_LOW + 1);
+
+ dir = page_zip->data + page_zip_get_size(page_zip)
+ - PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+
+ if (!page_is_leaf(page_zip->data)) {
+ ut_ad(!page_zip->n_blobs);
+ stored = dir - n_dense * REC_NODE_PTR_SIZE;
+ } else if (UNIV_UNLIKELY(is_clustered)) {
+ /* Move the BLOB pointer array backwards to make space for the
+ roll_ptr and trx_id columns and the dense directory slot. */
+ byte* externs;
+
+ stored = dir - n_dense
+ * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ externs = stored
+ - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ASSERT_ZERO(externs
+ - (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+ memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE
+ + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+ externs, stored - externs);
+ } else {
+ stored = dir
+ - page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+ ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE,
+ PAGE_ZIP_DIR_SLOT_SIZE);
+ }
+
+ /* Move the uncompressed area backwards to make space
+ for one directory slot. */
+ memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored);
+}
+
+/***********************************************************//**
+Parses a log record of writing to the header of a page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+ byte* ptr, /*!< in: redo log buffer */
+ byte* end_ptr,/*!< in: redo log buffer end */
+ page_t* page, /*!< in/out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< in/out: compressed page */
+{
+ ulint offset;
+ ulint len;
+
+ ut_ad(ptr && end_ptr);
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) {
+
+ return(NULL);
+ }
+
+ offset = (ulint) *ptr++;
+ len = (ulint) *ptr++;
+
+ if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ if (UNIV_UNLIKELY(end_ptr < ptr + len)) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_UNLIKELY(!page_zip)) {
+
+ goto corrupt;
+ }
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ memcpy(page + offset, ptr, len);
+ memcpy(page_zip->data + offset, ptr, len);
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+ }
+
+ return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+ const byte* data, /*!< in: data on the uncompressed page */
+ ulint length, /*!< in: length of the data */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ byte* log_ptr = mlog_open(mtr, 11 + 1 + 1);
+ ulint offset = page_offset(data);
+
+ ut_ad(offset < PAGE_DATA);
+ ut_ad(offset + length < PAGE_DATA);
+#if PAGE_DATA > 255
+# error "PAGE_DATA > 255"
+#endif
+ ut_ad(length < 256);
+
+ /* If no logging is requested, we may return now */
+ if (UNIV_UNLIKELY(!log_ptr)) {
+
+ return;
+ }
+
+ log_ptr = mlog_write_initial_log_record_fast(
+ (byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr);
+ *log_ptr++ = (byte) offset;
+ *log_ptr++ = (byte) length;
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr, data, length);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reorganize and compress a page. This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure, but page will be overwritten. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+ buf_block_t* block, /*!< in/out: page with compressed page;
+ on the compressed page, in: size;
+ out: data, n_blobs,
+ m_start, m_end, m_nonempty */
+ dict_index_t* index, /*!< in: index of the B-tree node */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ page_zip_des_t* page_zip = buf_block_get_page_zip(block);
+ page_t* page = buf_block_get_frame(block);
+ buf_block_t* temp_block;
+ page_t* temp_page;
+ ulint log_mode;
+
+ ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(page_is_comp(page));
+ ut_ad(!dict_index_is_ibuf(index));
+ /* Note that page_zip_validate(page_zip, page) may fail here. */
+ UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+ /* Disable logging */
+ log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+#ifndef UNIV_HOTBACKUP
+ temp_block = buf_block_alloc(0);
+ btr_search_drop_page_hash_index(block);
+ block->check_index_page_at_flush = TRUE;
+#else /* !UNIV_HOTBACKUP */
+ ut_ad(block == back_block1);
+ temp_block = back_block2;
+#endif /* !UNIV_HOTBACKUP */
+ temp_page = temp_block->frame;
+
+ /* Copy the old page to temporary space */
+ buf_frame_copy(temp_page, page);
+
+ /* Recreate the page: note that global data on page (possible
+ segment headers, next page-field, etc.) is preserved intact */
+
+ page_create(block, mtr, TRUE);
+
+ /* Copy the records from the temporary space to the recreated page;
+ do not copy the lock bits yet */
+
+ page_copy_rec_list_end_no_locks(block, temp_block,
+ page_get_infimum_rec(temp_page),
+ index, mtr);
+
+ if (!dict_index_is_clust(index) && page_is_leaf(temp_page)) {
+ /* Copy max trx id to recreated page */
+ trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
+ page_set_max_trx_id(block, NULL, max_trx_id, NULL);
+ ut_ad(!ut_dulint_is_zero(max_trx_id));
+ }
+
+ /* Restore logging. */
+ mtr_set_log_mode(mtr, log_mode);
+
+ if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+
+#ifndef UNIV_HOTBACKUP
+ buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+ return(FALSE);
+ }
+
+ lock_move_reorganize_page(block, temp_block);
+
+#ifndef UNIV_HOTBACKUP
+ buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+ return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte. Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records. Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+ page_zip_des_t* page_zip, /*!< out: copy of src_zip
+ (n_blobs, m_start, m_end,
+ m_nonempty, data[0..size-1]) */
+ page_t* page, /*!< out: copy of src */
+ const page_zip_des_t* src_zip, /*!< in: compressed page */
+ const page_t* src, /*!< in: page */
+ dict_index_t* index, /*!< in: index of the B-tree */
+ mtr_t* mtr) /*!< in: mini-transaction */
+{
+ ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX));
+ ut_ad(!dict_index_is_ibuf(index));
+#ifdef UNIV_ZIP_DEBUG
+ /* The B-tree operations that call this function may set
+ FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
+ mismatch. A strict page_zip_validate() will be executed later
+ during the B-tree operations. */
+ ut_a(page_zip_validate_low(src_zip, src, TRUE));
+#endif /* UNIV_ZIP_DEBUG */
+ ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
+ if (UNIV_UNLIKELY(src_zip->n_blobs)) {
+ ut_a(page_is_leaf(src));
+ ut_a(dict_index_is_clust(index));
+ }
+
+ /* The PAGE_MAX_TRX_ID must be set on leaf pages of secondary
+ indexes. It does not matter on other pages. */
+ ut_a(dict_index_is_clust(index) || !page_is_leaf(src)
+ || !ut_dulint_is_zero(page_get_max_trx_id(src)));
+
+ UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip));
+ UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE);
+ UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip));
+
+ /* Copy those B-tree page header fields that are related to
+ the records stored in the page. Also copy the field
+ PAGE_MAX_TRX_ID. Skip the rest of the page header and
+ trailer. On the compressed page, there is no trailer. */
+#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END
+# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END"
+#endif
+ memcpy(PAGE_HEADER + page, PAGE_HEADER + src,
+ PAGE_HEADER_PRIV_END);
+ memcpy(PAGE_DATA + page, PAGE_DATA + src,
+ UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
+ memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data,
+ PAGE_HEADER_PRIV_END);
+ memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data,
+ page_zip_get_size(page_zip) - PAGE_DATA);
+
+ /* Copy all fields of src_zip to page_zip, except the pointer
+ to the compressed data page. */
+ {
+ page_zip_t* data = page_zip->data;
+ memcpy(page_zip, src_zip, sizeof *page_zip);
+ page_zip->data = data;
+ }
+ ut_ad(page_zip_get_trailer_len(page_zip,
+ dict_index_is_clust(index), NULL)
+ + page_zip->m_end < page_zip_get_size(page_zip));
+
+ if (!page_is_leaf(src)
+ && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL)
+ && UNIV_LIKELY(mach_read_from_4(page
+ + FIL_PAGE_PREV) != FIL_NULL)) {
+ /* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
+ ulint offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+ TRUE);
+ if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
+ rec_t* rec = page + offs;
+ ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
+ & REC_INFO_MIN_REC_FLAG);
+ rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG;
+ }
+ }
+
+#ifdef UNIV_ZIP_DEBUG
+ ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+ page_zip_compress_write_log(page_zip, page, index, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parses a log record of compressing an index page.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< out: uncompressed page */
+ page_zip_des_t* page_zip)/*!< out: compressed page */
+{
+ ulint size;
+ ulint trailer_size;
+
+ ut_ad(ptr && end_ptr);
+ ut_ad(!page == !page_zip);
+
+ if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) {
+
+ return(NULL);
+ }
+
+ size = mach_read_from_2(ptr);
+ ptr += 2;
+ trailer_size = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (UNIV_UNLIKELY(!page_zip)
+ || UNIV_UNLIKELY(page_zip_get_size(page_zip) < size)) {
+corrupt:
+ recv_sys->found_corrupt_log = TRUE;
+
+ return(NULL);
+ }
+
+ memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4);
+ memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4);
+ memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size);
+ memset(page_zip->data + FIL_PAGE_TYPE + size, 0,
+ page_zip_get_size(page_zip) - trailer_size
+ - (FIL_PAGE_TYPE + size));
+ memcpy(page_zip->data + page_zip_get_size(page_zip)
+ - trailer_size, ptr + 8 + size, trailer_size);
+
+ if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page,
+ TRUE))) {
+
+ goto corrupt;
+ }
+ }
+
+ return(ptr + 8 + size + trailer_size);
+}
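+
+/* Layout of the MLOG_ZIP_PAGE_COMPRESS record body, as reconstructed from
+the parser above (the record is written by page_zip_compress_write_log()):
+
+	2 bytes		size of the compressed data that follows
+			FIL_PAGE_TYPE
+	2 bytes		size of the trailer (cf. page_zip_get_trailer_len())
+	4 bytes		FIL_PAGE_PREV
+	4 bytes		FIL_PAGE_NEXT
+	size bytes	compressed page contents from FIL_PAGE_TYPE onwards
+	trailer bytes	trailer, applied to the end of the compressed page
+
+On recovery, the bytes between FIL_PAGE_TYPE + size and the trailer are
+zero-filled before the page is decompressed. */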
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+@return page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+ const void* data, /*!< in: compressed page */
+ ulint size) /*!< in: size of compressed page */
+{
+ /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
+ and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
+
+ const Bytef* s = data;
+ uLong adler;
+
+ ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ adler = adler32(0L, s + FIL_PAGE_OFFSET,
+ FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+ adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
+ adler = adler32(adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+ size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+ return((ulint) adler);
+}
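+
+/* A minimal verification sketch, for illustration only (the function name
+is hypothetical and the function is not part of the sources): compare the
+checksum stored at FIL_PAGE_SPACE_OR_CHKSUM of a compressed page with the
+value computed by page_zip_calc_checksum() above.  Server options that
+disable checksums, and the "no checksum" magic value, are ignored here. */
+#if 0
+static
+ibool
+page_zip_example_verify_checksum(
+/*=============================*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	ulint	stored;
+	ulint	calculated;
+
+	stored = mach_read_from_4(
+		page_zip->data + FIL_PAGE_SPACE_OR_CHKSUM);
+	calculated = page_zip_calc_checksum(
+		page_zip->data, page_zip_get_size(page_zip));
+
+	return(stored == calculated);
+}
+#endif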
diff --git a/storage/xtradb/pars/lexyy.c b/storage/xtradb/pars/lexyy.c
new file mode 100644
index 00000000000..fc6b5102581
--- /dev/null
+++ b/storage/xtradb/pars/lexyy.c
@@ -0,0 +1,2795 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+#include "univ.i"
+#line 2 "lexyy.c"
+
+#line 4 "lexyy.c"
+
+#define YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 31
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t;
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+#endif /* ! C99 */
+
+/* Limits of integral types. */
+#ifndef INT8_MIN
+#define INT8_MIN (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else /* ! __cplusplus */
+
+#if __STDC__
+
+#define YY_USE_CONST
+
+#endif /* __STDC__ */
+#endif /* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index. If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* Enter a start condition. This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN (yy_start) = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state. The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START (((yy_start) - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart(yyin )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#define YY_BUF_SIZE 16384
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+static int yyleng;
+
+static FILE *yyin, *yyout;
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+ #define YY_LESS_LINENO(n)
+
+/* Return all but the first "n" matched characters back to the input stream. */
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ *yy_cp = (yy_hold_char); \
+ YY_RESTORE_YY_MORE_OFFSET \
+ (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+ YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+ } \
+ while ( 0 )
+
+#define unput(c) yyunput( c, (yytext_ptr) )
+
+/* The following is because we cannot portably get our hands on size_t
+ * (without autoconf's help, which isn't available because we want
+ * flex-generated scanners to compile on their own).
+ */
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef unsigned int yy_size_t;
+#endif
+
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+ {
+ FILE *yy_input_file;
+
+ char *yy_ch_buf; /* input buffer */
+ char *yy_buf_pos; /* current position in input buffer */
+
+ /* Size of input buffer in bytes, not including room for EOB
+ * characters.
+ */
+ yy_size_t yy_buf_size;
+
+ /* Number of characters read into yy_ch_buf, not including EOB
+ * characters.
+ */
+ int yy_n_chars;
+
+ /* Whether we "own" the buffer - i.e., we know we created it,
+ * and can realloc() it to grow it, and should free() it to
+ * delete it.
+ */
+ int yy_is_our_buffer;
+
+ /* Whether this is an "interactive" input source; if so, and
+ * if we're using stdio for input, then we want to use getc()
+ * instead of fread(), to make sure we stop fetching input after
+ * each newline.
+ */
+ int yy_is_interactive;
+
+ /* Whether we're considered to be at the beginning of a line.
+ * If so, '^' rules will be active on the next match, otherwise
+ * not.
+ */
+ int yy_at_bol;
+
+ int yy_bs_lineno; /**< The line count. */
+ int yy_bs_column; /**< The column count. */
+
+ /* Whether to try to fill the input buffer when we reach the
+ * end of it.
+ */
+ int yy_fill_buffer;
+
+ int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+ /* When an EOF's been seen but there's still some text to process
+ * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+ * shouldn't try reading from the input source any more. We might
+ * still have a bunch of tokens to match, though, because of
+ * possible backing-up.
+ *
+ * When we actually see the EOF, we change the status to "new"
+ * (via yyrestart()), so that the user can continue scanning by
+ * just pointing yyin at a new input file.
+ */
+#define YY_BUFFER_EOF_PENDING 2
+
+ };
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* Stack of input buffers. */
+static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */
+static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */
+static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \
+ ? (yy_buffer_stack)[(yy_buffer_stack_top)] \
+ : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)]
+
+/* yy_hold_char holds the character lost when yytext is formed. */
+static char yy_hold_char;
+static int yy_n_chars; /* number of characters read into yy_ch_buf */
+static int yyleng;
+
+/* Points to current character in buffer. */
+static char *yy_c_buf_p = (char *) 0;
+static int yy_init = 1; /* whether we need to initialize */
+static int yy_start = 0; /* start state number */
+
+/* Flag which is used to allow yywrap()'s to do buffer switches
+ * instead of setting up a fresh yyin. A bit of a hack ...
+ */
+static int yy_did_buffer_switch_on_eof;
+
+static void yyrestart (FILE *input_file );
+__attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer );
+static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size );
+static void yy_delete_buffer (YY_BUFFER_STATE b );
+static void yy_flush_buffer (YY_BUFFER_STATE b );
+__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer );
+__attribute__((unused)) static void yypop_buffer_state (void );
+
+static void yyensure_buffer_stack (void );
+static void yy_load_buffer_state (void );
+static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file );
+
+#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER )
+
+YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size );
+YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str );
+YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len );
+
+static void *yyalloc (yy_size_t );
+static void *yyrealloc (void *,yy_size_t );
+static void yyfree (void * );
+
+#define yy_new_buffer yy_create_buffer
+
+#define yy_set_interactive(is_interactive) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){ \
+ yyensure_buffer_stack (); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer(yyin,YY_BUF_SIZE ); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+ }
+
+#define yy_set_bol(at_bol) \
+ { \
+ if ( ! YY_CURRENT_BUFFER ){\
+ yyensure_buffer_stack (); \
+ YY_CURRENT_BUFFER_LVALUE = \
+ yy_create_buffer(yyin,YY_BUF_SIZE ); \
+ } \
+ YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+ }
+
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define yywrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+static FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0;
+
+typedef int yy_state_type;
+
+static int yylineno;
+
+static int yylineno = 1;
+
+static char *yytext;
+#define yytext_ptr yytext
+
+static yy_state_type yy_get_previous_state (void );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state );
+static int yy_get_next_buffer (void );
+static void yy_fatal_error (yyconst char msg[] );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ */
+#define YY_DO_BEFORE_ACTION \
+ (yytext_ptr) = yy_bp; \
+ yyleng = (size_t) (yy_cp - yy_bp); \
+ (yy_hold_char) = *yy_cp; \
+ *yy_cp = '\0'; \
+ (yy_c_buf_p) = yy_cp;
+
+#define YY_NUM_RULES 119
+#define YY_END_OF_BUFFER 120
+/* This struct is not used in this scanner,
+ but its presence is necessary. */
+struct yy_trans_info
+ {
+ flex_int32_t yy_verify;
+ flex_int32_t yy_nxt;
+ };
+static yyconst flex_int16_t yy_accept[399] =
+ { 0,
+ 0, 0, 114, 114, 0, 0, 0, 0, 120, 118,
+ 117, 117, 8, 118, 109, 5, 98, 104, 107, 105,
+ 102, 106, 118, 108, 1, 118, 103, 101, 99, 100,
+ 112, 92, 92, 92, 92, 92, 92, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
+ 110, 111, 114, 115, 6, 7, 9, 10, 117, 4,
+ 93, 113, 2, 1, 3, 94, 95, 97, 96, 92,
+ 92, 92, 92, 92, 92, 44, 92, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
+ 92, 92, 28, 17, 25, 92, 92, 92, 92, 92,
+
+ 54, 61, 92, 14, 92, 92, 92, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
+ 92, 92, 114, 115, 115, 116, 6, 7, 9, 10,
+ 2, 13, 45, 92, 92, 92, 92, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
+ 92, 27, 92, 92, 92, 41, 92, 92, 92, 92,
+ 21, 92, 92, 92, 92, 15, 92, 92, 92, 18,
+ 92, 92, 92, 92, 92, 80, 92, 92, 92, 51,
+ 92, 12, 92, 36, 92, 92, 92, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 92, 92, 20, 24,
+
+ 92, 92, 92, 92, 92, 92, 92, 92, 92, 92,
+ 46, 92, 92, 30, 92, 87, 92, 92, 39, 92,
+ 92, 92, 92, 92, 48, 92, 89, 32, 91, 92,
+ 11, 64, 92, 92, 92, 42, 92, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 29, 92, 92, 92,
+ 92, 92, 92, 92, 92, 92, 85, 92, 26, 92,
+ 66, 92, 92, 92, 37, 92, 92, 92, 92, 92,
+ 92, 92, 31, 65, 23, 92, 57, 92, 75, 92,
+ 92, 92, 43, 92, 92, 92, 92, 92, 92, 92,
+ 92, 90, 92, 92, 56, 92, 92, 92, 92, 92,
+
+ 92, 92, 40, 33, 79, 19, 92, 83, 74, 55,
+ 92, 63, 92, 52, 92, 92, 92, 47, 92, 76,
+ 92, 78, 92, 92, 34, 92, 92, 92, 35, 72,
+ 92, 92, 92, 92, 58, 92, 50, 49, 92, 92,
+ 53, 62, 92, 92, 92, 22, 92, 92, 73, 81,
+ 92, 92, 77, 92, 68, 92, 92, 92, 92, 38,
+ 92, 88, 67, 92, 84, 92, 92, 92, 86, 92,
+ 59, 92, 16, 92, 70, 69, 92, 92, 82, 92,
+ 92, 92, 92, 92, 92, 92, 92, 92, 92, 71,
+ 92, 92, 92, 92, 92, 92, 60, 0
+
+ } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 2, 1, 4, 1, 5, 6, 1, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 16, 17, 18, 19,
+ 20, 21, 22, 1, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 32,
+ 1, 1, 1, 1, 48, 1, 32, 32, 32, 32,
+
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 49, 1, 50, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1
+ } ;
+
+static yyconst flex_int32_t yy_meta[51] =
+ { 0,
+ 1, 1, 1, 2, 1, 1, 3, 1, 1, 4,
+ 1, 1, 1, 1, 1, 5, 1, 1, 1, 6,
+ 1, 1, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 1, 1
+ } ;
+
+static yyconst flex_int16_t yy_base[409] =
+ { 0,
+ 0, 0, 437, 436, 438, 437, 439, 438, 441, 448,
+ 49, 51, 448, 0, 448, 448, 448, 448, 448, 448,
+ 448, 448, 426, 429, 41, 418, 448, 38, 448, 417,
+ 448, 20, 33, 32, 46, 40, 44, 0, 54, 52,
+ 399, 48, 60, 395, 65, 67, 81, 27, 411, 75,
+ 448, 448, 0, 98, 0, 426, 0, 428, 113, 0,
+ 448, 448, 415, 54, 410, 448, 448, 448, 448, 0,
+ 403, 68, 399, 391, 389, 0, 402, 80, 84, 397,
+ 383, 96, 381, 394, 379, 393, 387, 375, 379, 375,
+ 377, 377, 0, 98, 0, 376, 97, 385, 368, 375,
+
+ 0, 0, 381, 381, 364, 94, 103, 379, 98, 65,
+ 381, 369, 109, 361, 377, 373, 351, 97, 372, 363,
+ 115, 356, 0, 137, 138, 448, 0, 388, 0, 390,
+ 377, 0, 0, 365, 360, 367, 365, 348, 346, 345,
+ 350, 359, 347, 359, 95, 347, 353, 354, 336, 336,
+ 123, 0, 334, 350, 351, 0, 338, 347, 344, 122,
+ 124, 341, 336, 330, 340, 338, 331, 328, 336, 0,
+ 326, 336, 334, 325, 315, 309, 322, 307, 327, 0,
+ 313, 0, 311, 0, 325, 316, 313, 131, 309, 316,
+ 323, 302, 304, 309, 309, 301, 304, 299, 0, 0,
+
+ 311, 295, 305, 312, 292, 291, 305, 294, 307, 287,
+ 0, 297, 279, 0, 298, 0, 295, 282, 0, 281,
+ 276, 281, 280, 290, 0, 276, 0, 0, 0, 280,
+ 0, 0, 276, 273, 287, 0, 272, 272, 270, 286,
+ 271, 283, 280, 264, 282, 277, 0, 272, 272, 258,
+ 257, 270, 256, 270, 269, 268, 0, 252, 0, 246,
+ 0, 265, 249, 248, 0, 262, 252, 247, 246, 258,
+ 248, 247, 0, 0, 0, 251, 0, 239, 0, 253,
+ 249, 235, 0, 249, 250, 233, 238, 231, 249, 231,
+ 228, 0, 229, 226, 0, 231, 243, 230, 237, 227,
+
+ 235, 220, 0, 0, 0, 212, 219, 0, 0, 0,
+ 216, 0, 230, 0, 231, 218, 217, 0, 213, 0,
+ 216, 0, 208, 210, 0, 209, 223, 216, 0, 0,
+ 219, 222, 204, 219, 0, 215, 0, 0, 199, 213,
+ 0, 0, 197, 196, 201, 0, 210, 195, 0, 0,
+ 201, 197, 0, 192, 0, 204, 204, 192, 202, 0,
+ 179, 0, 0, 199, 0, 183, 177, 183, 0, 174,
+ 0, 193, 0, 192, 0, 0, 183, 187, 0, 174,
+ 174, 180, 166, 189, 181, 180, 166, 151, 118, 0,
+ 130, 136, 127, 123, 119, 111, 0, 448, 167, 173,
+
+ 179, 152, 181, 124, 187, 193, 199, 205
+ } ;
+
+static yyconst flex_int16_t yy_def[409] =
+ { 0,
+ 398, 1, 399, 399, 400, 400, 401, 401, 398, 398,
+ 398, 398, 398, 402, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 403, 398, 398, 398, 398,
+ 398, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 398, 398, 405, 406, 407, 398, 408, 398, 398, 402,
+ 398, 398, 398, 398, 403, 398, 398, 398, 398, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 405, 406, 406, 398, 407, 398, 408, 398,
+ 398, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 404, 404, 404,
+ 404, 404, 404, 404, 404, 404, 404, 0, 398, 398,
+
+ 398, 398, 398, 398, 398, 398, 398, 398
+ } ;
+
+static yyconst flex_int16_t yy_nxt[499] =
+ { 0,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 38,
+ 39, 38, 38, 40, 41, 42, 43, 44, 38, 45,
+ 46, 47, 48, 49, 50, 38, 38, 38, 51, 52,
+ 59, 59, 59, 59, 63, 71, 64, 67, 68, 73,
+ 72, 77, 118, 74, 119, 78, 75, 63, 79, 64,
+ 88, 80, 82, 85, 81, 86, 83, 89, 96, 76,
+ 90, 93, 84, 91, 99, 87, 92, 101, 97, 94,
+ 100, 107, 133, 110, 95, 102, 111, 103, 179, 104,
+
+ 108, 109, 105, 115, 121, 112, 180, 125, 134, 113,
+ 116, 122, 126, 114, 59, 59, 139, 117, 141, 142,
+ 146, 163, 140, 159, 171, 173, 143, 189, 70, 147,
+ 172, 177, 183, 164, 207, 208, 148, 190, 160, 161,
+ 174, 193, 178, 184, 175, 194, 398, 125, 222, 214,
+ 224, 398, 126, 215, 248, 249, 60, 397, 396, 395,
+ 225, 394, 393, 223, 392, 391, 250, 53, 53, 53,
+ 53, 53, 53, 55, 55, 55, 55, 55, 55, 57,
+ 57, 57, 57, 57, 57, 65, 65, 123, 123, 123,
+ 390, 123, 123, 124, 124, 124, 124, 124, 124, 127,
+
+ 127, 389, 127, 127, 127, 129, 388, 129, 129, 129,
+ 129, 387, 386, 385, 384, 383, 382, 381, 380, 379,
+ 378, 377, 376, 375, 374, 373, 372, 371, 370, 369,
+ 368, 367, 366, 365, 364, 363, 362, 361, 360, 359,
+ 358, 357, 356, 355, 354, 353, 352, 351, 350, 349,
+ 348, 347, 346, 345, 344, 343, 342, 341, 340, 339,
+ 338, 337, 336, 335, 334, 333, 332, 331, 330, 329,
+ 328, 327, 326, 325, 324, 323, 322, 321, 320, 319,
+ 318, 317, 316, 315, 314, 313, 312, 311, 310, 309,
+ 308, 307, 306, 305, 304, 303, 302, 301, 300, 299,
+
+ 298, 297, 296, 295, 294, 293, 292, 291, 290, 289,
+ 288, 287, 286, 285, 284, 283, 282, 281, 280, 279,
+ 278, 277, 276, 275, 274, 273, 272, 271, 270, 269,
+ 268, 267, 266, 265, 264, 263, 262, 261, 260, 259,
+ 258, 257, 256, 255, 254, 253, 252, 251, 247, 246,
+ 245, 244, 243, 242, 241, 240, 239, 238, 237, 236,
+ 235, 234, 233, 232, 231, 230, 229, 228, 227, 226,
+ 221, 220, 219, 218, 217, 216, 213, 212, 211, 210,
+ 209, 206, 205, 204, 203, 202, 201, 200, 199, 198,
+ 197, 196, 131, 130, 128, 195, 192, 191, 188, 187,
+
+ 186, 185, 182, 181, 176, 170, 169, 168, 167, 166,
+ 165, 162, 158, 157, 156, 155, 154, 153, 152, 151,
+ 150, 149, 145, 144, 138, 137, 136, 135, 132, 398,
+ 131, 130, 128, 120, 106, 98, 69, 66, 62, 61,
+ 398, 58, 58, 56, 56, 54, 54, 9, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398
+
+ } ;
+
+static yyconst flex_int16_t yy_chk[499] =
+ { 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 11, 11, 12, 12, 25, 32, 25, 28, 28, 33,
+ 32, 34, 48, 33, 48, 34, 33, 64, 34, 64,
+ 37, 34, 35, 36, 34, 36, 35, 37, 40, 33,
+ 37, 39, 35, 37, 42, 36, 37, 43, 40, 39,
+ 42, 45, 72, 46, 39, 43, 46, 43, 110, 43,
+
+ 45, 45, 43, 47, 50, 46, 110, 54, 72, 46,
+ 47, 50, 54, 46, 59, 59, 78, 47, 79, 79,
+ 82, 97, 78, 94, 106, 107, 79, 118, 404, 82,
+ 106, 109, 113, 97, 145, 145, 82, 118, 94, 94,
+ 107, 121, 109, 113, 107, 121, 124, 125, 160, 151,
+ 161, 124, 125, 151, 188, 188, 402, 396, 395, 394,
+ 161, 393, 392, 160, 391, 389, 188, 399, 399, 399,
+ 399, 399, 399, 400, 400, 400, 400, 400, 400, 401,
+ 401, 401, 401, 401, 401, 403, 403, 405, 405, 405,
+ 388, 405, 405, 406, 406, 406, 406, 406, 406, 407,
+
+ 407, 387, 407, 407, 407, 408, 386, 408, 408, 408,
+ 408, 385, 384, 383, 382, 381, 380, 378, 377, 374,
+ 372, 370, 368, 367, 366, 364, 361, 359, 358, 357,
+ 356, 354, 352, 351, 348, 347, 345, 344, 343, 340,
+ 339, 336, 334, 333, 332, 331, 328, 327, 326, 324,
+ 323, 321, 319, 317, 316, 315, 313, 311, 307, 306,
+ 302, 301, 300, 299, 298, 297, 296, 294, 293, 291,
+ 290, 289, 288, 287, 286, 285, 284, 282, 281, 280,
+ 278, 276, 272, 271, 270, 269, 268, 267, 266, 264,
+ 263, 262, 260, 258, 256, 255, 254, 253, 252, 251,
+
+ 250, 249, 248, 246, 245, 244, 243, 242, 241, 240,
+ 239, 238, 237, 235, 234, 233, 230, 226, 224, 223,
+ 222, 221, 220, 218, 217, 215, 213, 212, 210, 209,
+ 208, 207, 206, 205, 204, 203, 202, 201, 198, 197,
+ 196, 195, 194, 193, 192, 191, 190, 189, 187, 186,
+ 185, 183, 181, 179, 178, 177, 176, 175, 174, 173,
+ 172, 171, 169, 168, 167, 166, 165, 164, 163, 162,
+ 159, 158, 157, 155, 154, 153, 150, 149, 148, 147,
+ 146, 144, 143, 142, 141, 140, 139, 138, 137, 136,
+ 135, 134, 131, 130, 128, 122, 120, 119, 117, 116,
+
+ 115, 114, 112, 111, 108, 105, 104, 103, 100, 99,
+ 98, 96, 92, 91, 90, 89, 88, 87, 86, 85,
+ 84, 83, 81, 80, 77, 75, 74, 73, 71, 65,
+ 63, 58, 56, 49, 44, 41, 30, 26, 24, 23,
+ 9, 8, 7, 6, 5, 4, 3, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398, 398, 398,
+ 398, 398, 398, 398, 398, 398, 398, 398
+
+ } ;
+
+static yy_state_type yy_last_accepting_state;
+static char *yy_last_accepting_cpos;
+
+static int yy_flex_debug;
+static int yy_flex_debug = 0;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+static char *yytext;
+#line 1 "pars0lex.l"
+/**************************************************//**
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+(c) 1997 Innobase Oy
+
+Created 12/14/1997 Heikki Tuuri
+Published under the GPL version 2
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+*******************************************************/
+#define YY_NO_INPUT 1
+#define YY_NO_UNISTD_H 1
+#line 38 "pars0lex.l"
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+#define malloc(A) ut_malloc(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
+
+#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/** Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /*!< in: string to be appended */
+ ulint len) /*!< in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = malloc(1);
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+ stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
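+
+/* Worked example of the growth policy above: appending a 5-byte string to
+the initially empty buffer allocates 1 byte and then doubles the allocation
+1 -> 2 -> 4 -> 8 before copying.  The buffer is reused across literals: the
+rules that enter the 'quoted' and 'id' start states below reset
+stringbuf_len to 0 when a new literal starts. */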
+
+
+
+
+#line 759 "lexyy.c"
+
+#define INITIAL 0
+#define comment 1
+#define quoted 2
+#define id 3
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap (void );
+#else
+extern int yywrap (void );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int );
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * );
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (void );
+#else
+static int input (void );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#define YY_READ_BUF_SIZE 8192
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO (void) fwrite( yytext, yyleng, 1, yyout )
+#endif
+
+/* Gets input and stuffs it into "buf".  The number of characters read, or YY_NULL,
+ * is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+ { \
+ int c = '*'; \
+ size_t n; \
+ for ( n = 0; n < max_size && \
+ (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+ buf[n] = (char) c; \
+ if ( c == '\n' ) \
+ buf[n++] = (char) c; \
+ if ( c == EOF && ferror( yyin ) ) \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ result = n; \
+ } \
+ else \
+ { \
+ errno=0; \
+ while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+ { \
+ if( errno != EINTR) \
+ { \
+ YY_FATAL_ERROR( "input in flex scanner failed" ); \
+ break; \
+ } \
+ errno=0; \
+ clearerr(yyin); \
+ } \
+ }\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg )
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+UNIV_INTERN int yylex (void);
+
+#define YY_DECL UNIV_INTERN int yylex (void)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+ YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp, *yy_bp;
+ register int yy_act;
+
+#line 92 "pars0lex.l"
+
+
+#line 914 "lexyy.c"
+
+ if ( (yy_init) )
+ {
+ (yy_init) = 0;
+
+#ifdef YY_USER_INIT
+ YY_USER_INIT;
+#endif
+
+ if ( ! (yy_start) )
+ (yy_start) = 1; /* first start state */
+
+ if ( ! yyin )
+ yyin = stdin;
+
+ if ( ! yyout )
+ yyout = stdout;
+
+ if ( ! YY_CURRENT_BUFFER ) {
+ yyensure_buffer_stack ();
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer(yyin,YY_BUF_SIZE );
+ }
+
+ yy_load_buffer_state( );
+ }
+
+ while ( 1 ) /* loops until end-of-file is reached */
+ {
+ yy_cp = (yy_c_buf_p);
+
+ /* Support of yytext. */
+ *yy_cp = (yy_hold_char);
+
+ /* yy_bp points to the position in yy_ch_buf of the start of
+ * the current run.
+ */
+ yy_bp = yy_cp;
+
+ yy_current_state = (yy_start);
+yy_match:
+ do
+ {
+ register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 399 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ ++yy_cp;
+ }
+ while ( yy_current_state != 398 );
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+
+yy_find_action:
+ yy_act = yy_accept[yy_current_state];
+
+ YY_DO_BEFORE_ACTION;
+
+do_action: /* This label is used only to access EOF actions. */
+
+ switch ( yy_act )
+ { /* beginning of action switch */
+ case 0: /* must back up */
+ /* undo the effects of YY_DO_BEFORE_ACTION */
+ *yy_cp = (yy_hold_char);
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+ goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 94 "pars0lex.l"
+{
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+ YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 100 "pars0lex.l"
+{
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+ YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 106 "pars0lex.l"
+{
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+ YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 115 "pars0lex.l"
+{
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+ YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 122 "pars0lex.l"
+{
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+ YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 131 "pars0lex.l"
+{
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+ YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 136 "pars0lex.l"
+{
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
+ YY_BREAK
+case 8:
+YY_RULE_SETUP
+#line 160 "pars0lex.l"
+{
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+ YY_BREAK
+case 9:
+/* rule 9 can match eol */
+YY_RULE_SETUP
+#line 169 "pars0lex.l"
+{
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+ YY_BREAK
+case 10:
+YY_RULE_SETUP
+#line 174 "pars0lex.l"
+{
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
+ YY_BREAK
+case 11:
+YY_RULE_SETUP
+#line 199 "pars0lex.l"
+{
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+ YY_BREAK
+case 12:
+YY_RULE_SETUP
+#line 205 "pars0lex.l"
+{
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+ YY_BREAK
+case 13:
+YY_RULE_SETUP
+#line 212 "pars0lex.l"
+{
+ return(PARS_AND_TOKEN);
+}
+ YY_BREAK
+case 14:
+YY_RULE_SETUP
+#line 216 "pars0lex.l"
+{
+ return(PARS_OR_TOKEN);
+}
+ YY_BREAK
+case 15:
+YY_RULE_SETUP
+#line 220 "pars0lex.l"
+{
+ return(PARS_NOT_TOKEN);
+}
+ YY_BREAK
+case 16:
+YY_RULE_SETUP
+#line 224 "pars0lex.l"
+{
+ return(PARS_PROCEDURE_TOKEN);
+}
+ YY_BREAK
+case 17:
+YY_RULE_SETUP
+#line 228 "pars0lex.l"
+{
+ return(PARS_IN_TOKEN);
+}
+ YY_BREAK
+case 18:
+YY_RULE_SETUP
+#line 232 "pars0lex.l"
+{
+ return(PARS_OUT_TOKEN);
+}
+ YY_BREAK
+case 19:
+YY_RULE_SETUP
+#line 236 "pars0lex.l"
+{
+ return(PARS_BINARY_TOKEN);
+}
+ YY_BREAK
+case 20:
+YY_RULE_SETUP
+#line 240 "pars0lex.l"
+{
+ return(PARS_BLOB_TOKEN);
+}
+ YY_BREAK
+case 21:
+YY_RULE_SETUP
+#line 244 "pars0lex.l"
+{
+ return(PARS_INT_TOKEN);
+}
+ YY_BREAK
+case 22:
+YY_RULE_SETUP
+#line 248 "pars0lex.l"
+{
+ return(PARS_INT_TOKEN);
+}
+ YY_BREAK
+case 23:
+YY_RULE_SETUP
+#line 252 "pars0lex.l"
+{
+ return(PARS_FLOAT_TOKEN);
+}
+ YY_BREAK
+case 24:
+YY_RULE_SETUP
+#line 256 "pars0lex.l"
+{
+ return(PARS_CHAR_TOKEN);
+}
+ YY_BREAK
+case 25:
+YY_RULE_SETUP
+#line 260 "pars0lex.l"
+{
+ return(PARS_IS_TOKEN);
+}
+ YY_BREAK
+case 26:
+YY_RULE_SETUP
+#line 264 "pars0lex.l"
+{
+ return(PARS_BEGIN_TOKEN);
+}
+ YY_BREAK
+case 27:
+YY_RULE_SETUP
+#line 268 "pars0lex.l"
+{
+ return(PARS_END_TOKEN);
+}
+ YY_BREAK
+case 28:
+YY_RULE_SETUP
+#line 272 "pars0lex.l"
+{
+ return(PARS_IF_TOKEN);
+}
+ YY_BREAK
+case 29:
+YY_RULE_SETUP
+#line 276 "pars0lex.l"
+{
+ return(PARS_THEN_TOKEN);
+}
+ YY_BREAK
+case 30:
+YY_RULE_SETUP
+#line 280 "pars0lex.l"
+{
+ return(PARS_ELSE_TOKEN);
+}
+ YY_BREAK
+case 31:
+YY_RULE_SETUP
+#line 284 "pars0lex.l"
+{
+ return(PARS_ELSIF_TOKEN);
+}
+ YY_BREAK
+case 32:
+YY_RULE_SETUP
+#line 288 "pars0lex.l"
+{
+ return(PARS_LOOP_TOKEN);
+}
+ YY_BREAK
+case 33:
+YY_RULE_SETUP
+#line 292 "pars0lex.l"
+{
+ return(PARS_WHILE_TOKEN);
+}
+ YY_BREAK
+case 34:
+YY_RULE_SETUP
+#line 296 "pars0lex.l"
+{
+ return(PARS_RETURN_TOKEN);
+}
+ YY_BREAK
+case 35:
+YY_RULE_SETUP
+#line 300 "pars0lex.l"
+{
+ return(PARS_SELECT_TOKEN);
+}
+ YY_BREAK
+case 36:
+YY_RULE_SETUP
+#line 304 "pars0lex.l"
+{
+ return(PARS_SUM_TOKEN);
+}
+ YY_BREAK
+case 37:
+YY_RULE_SETUP
+#line 308 "pars0lex.l"
+{
+ return(PARS_COUNT_TOKEN);
+}
+ YY_BREAK
+case 38:
+YY_RULE_SETUP
+#line 312 "pars0lex.l"
+{
+ return(PARS_DISTINCT_TOKEN);
+}
+ YY_BREAK
+case 39:
+YY_RULE_SETUP
+#line 316 "pars0lex.l"
+{
+ return(PARS_FROM_TOKEN);
+}
+ YY_BREAK
+case 40:
+YY_RULE_SETUP
+#line 320 "pars0lex.l"
+{
+ return(PARS_WHERE_TOKEN);
+}
+ YY_BREAK
+case 41:
+YY_RULE_SETUP
+#line 324 "pars0lex.l"
+{
+ return(PARS_FOR_TOKEN);
+}
+ YY_BREAK
+case 42:
+YY_RULE_SETUP
+#line 328 "pars0lex.l"
+{
+ return(PARS_READ_TOKEN);
+}
+ YY_BREAK
+case 43:
+YY_RULE_SETUP
+#line 332 "pars0lex.l"
+{
+ return(PARS_ORDER_TOKEN);
+}
+ YY_BREAK
+case 44:
+YY_RULE_SETUP
+#line 336 "pars0lex.l"
+{
+ return(PARS_BY_TOKEN);
+}
+ YY_BREAK
+case 45:
+YY_RULE_SETUP
+#line 340 "pars0lex.l"
+{
+ return(PARS_ASC_TOKEN);
+}
+ YY_BREAK
+case 46:
+YY_RULE_SETUP
+#line 344 "pars0lex.l"
+{
+ return(PARS_DESC_TOKEN);
+}
+ YY_BREAK
+case 47:
+YY_RULE_SETUP
+#line 348 "pars0lex.l"
+{
+ return(PARS_INSERT_TOKEN);
+}
+ YY_BREAK
+case 48:
+YY_RULE_SETUP
+#line 352 "pars0lex.l"
+{
+ return(PARS_INTO_TOKEN);
+}
+ YY_BREAK
+case 49:
+YY_RULE_SETUP
+#line 356 "pars0lex.l"
+{
+ return(PARS_VALUES_TOKEN);
+}
+ YY_BREAK
+case 50:
+YY_RULE_SETUP
+#line 360 "pars0lex.l"
+{
+ return(PARS_UPDATE_TOKEN);
+}
+ YY_BREAK
+case 51:
+YY_RULE_SETUP
+#line 364 "pars0lex.l"
+{
+ return(PARS_SET_TOKEN);
+}
+ YY_BREAK
+case 52:
+YY_RULE_SETUP
+#line 368 "pars0lex.l"
+{
+ return(PARS_DELETE_TOKEN);
+}
+ YY_BREAK
+case 53:
+YY_RULE_SETUP
+#line 372 "pars0lex.l"
+{
+ return(PARS_CURRENT_TOKEN);
+}
+ YY_BREAK
+case 54:
+YY_RULE_SETUP
+#line 376 "pars0lex.l"
+{
+ return(PARS_OF_TOKEN);
+}
+ YY_BREAK
+case 55:
+YY_RULE_SETUP
+#line 380 "pars0lex.l"
+{
+ return(PARS_CREATE_TOKEN);
+}
+ YY_BREAK
+case 56:
+YY_RULE_SETUP
+#line 384 "pars0lex.l"
+{
+ return(PARS_TABLE_TOKEN);
+}
+ YY_BREAK
+case 57:
+YY_RULE_SETUP
+#line 388 "pars0lex.l"
+{
+ return(PARS_INDEX_TOKEN);
+}
+ YY_BREAK
+case 58:
+YY_RULE_SETUP
+#line 392 "pars0lex.l"
+{
+ return(PARS_UNIQUE_TOKEN);
+}
+ YY_BREAK
+case 59:
+YY_RULE_SETUP
+#line 396 "pars0lex.l"
+{
+ return(PARS_CLUSTERED_TOKEN);
+}
+ YY_BREAK
+case 60:
+YY_RULE_SETUP
+#line 400 "pars0lex.l"
+{
+ return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+ YY_BREAK
+case 61:
+YY_RULE_SETUP
+#line 404 "pars0lex.l"
+{
+ return(PARS_ON_TOKEN);
+}
+ YY_BREAK
+case 62:
+YY_RULE_SETUP
+#line 408 "pars0lex.l"
+{
+ return(PARS_DECLARE_TOKEN);
+}
+ YY_BREAK
+case 63:
+YY_RULE_SETUP
+#line 412 "pars0lex.l"
+{
+ return(PARS_CURSOR_TOKEN);
+}
+ YY_BREAK
+case 64:
+YY_RULE_SETUP
+#line 416 "pars0lex.l"
+{
+ return(PARS_OPEN_TOKEN);
+}
+ YY_BREAK
+case 65:
+YY_RULE_SETUP
+#line 420 "pars0lex.l"
+{
+ return(PARS_FETCH_TOKEN);
+}
+ YY_BREAK
+case 66:
+YY_RULE_SETUP
+#line 424 "pars0lex.l"
+{
+ return(PARS_CLOSE_TOKEN);
+}
+ YY_BREAK
+case 67:
+YY_RULE_SETUP
+#line 428 "pars0lex.l"
+{
+ return(PARS_NOTFOUND_TOKEN);
+}
+ YY_BREAK
+case 68:
+YY_RULE_SETUP
+#line 432 "pars0lex.l"
+{
+ return(PARS_TO_CHAR_TOKEN);
+}
+ YY_BREAK
+case 69:
+YY_RULE_SETUP
+#line 436 "pars0lex.l"
+{
+ return(PARS_TO_NUMBER_TOKEN);
+}
+ YY_BREAK
+case 70:
+YY_RULE_SETUP
+#line 440 "pars0lex.l"
+{
+ return(PARS_TO_BINARY_TOKEN);
+}
+ YY_BREAK
+case 71:
+YY_RULE_SETUP
+#line 444 "pars0lex.l"
+{
+ return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+ YY_BREAK
+case 72:
+YY_RULE_SETUP
+#line 448 "pars0lex.l"
+{
+ return(PARS_SUBSTR_TOKEN);
+}
+ YY_BREAK
+case 73:
+YY_RULE_SETUP
+#line 452 "pars0lex.l"
+{
+ return(PARS_REPLSTR_TOKEN);
+}
+ YY_BREAK
+case 74:
+YY_RULE_SETUP
+#line 456 "pars0lex.l"
+{
+ return(PARS_CONCAT_TOKEN);
+}
+ YY_BREAK
+case 75:
+YY_RULE_SETUP
+#line 460 "pars0lex.l"
+{
+ return(PARS_INSTR_TOKEN);
+}
+ YY_BREAK
+case 76:
+YY_RULE_SETUP
+#line 464 "pars0lex.l"
+{
+ return(PARS_LENGTH_TOKEN);
+}
+ YY_BREAK
+case 77:
+YY_RULE_SETUP
+#line 468 "pars0lex.l"
+{
+ return(PARS_SYSDATE_TOKEN);
+}
+ YY_BREAK
+case 78:
+YY_RULE_SETUP
+#line 472 "pars0lex.l"
+{
+ return(PARS_PRINTF_TOKEN);
+}
+ YY_BREAK
+case 79:
+YY_RULE_SETUP
+#line 476 "pars0lex.l"
+{
+ return(PARS_ASSERT_TOKEN);
+}
+ YY_BREAK
+case 80:
+YY_RULE_SETUP
+#line 480 "pars0lex.l"
+{
+ return(PARS_RND_TOKEN);
+}
+ YY_BREAK
+case 81:
+YY_RULE_SETUP
+#line 484 "pars0lex.l"
+{
+ return(PARS_RND_STR_TOKEN);
+}
+ YY_BREAK
+case 82:
+YY_RULE_SETUP
+#line 488 "pars0lex.l"
+{
+ return(PARS_ROW_PRINTF_TOKEN);
+}
+ YY_BREAK
+case 83:
+YY_RULE_SETUP
+#line 492 "pars0lex.l"
+{
+ return(PARS_COMMIT_TOKEN);
+}
+ YY_BREAK
+case 84:
+YY_RULE_SETUP
+#line 496 "pars0lex.l"
+{
+ return(PARS_ROLLBACK_TOKEN);
+}
+ YY_BREAK
+case 85:
+YY_RULE_SETUP
+#line 500 "pars0lex.l"
+{
+ return(PARS_WORK_TOKEN);
+}
+ YY_BREAK
+case 86:
+YY_RULE_SETUP
+#line 504 "pars0lex.l"
+{
+ return(PARS_UNSIGNED_TOKEN);
+}
+ YY_BREAK
+case 87:
+YY_RULE_SETUP
+#line 508 "pars0lex.l"
+{
+ return(PARS_EXIT_TOKEN);
+}
+ YY_BREAK
+case 88:
+YY_RULE_SETUP
+#line 512 "pars0lex.l"
+{
+ return(PARS_FUNCTION_TOKEN);
+}
+ YY_BREAK
+case 89:
+YY_RULE_SETUP
+#line 516 "pars0lex.l"
+{
+ return(PARS_LOCK_TOKEN);
+}
+ YY_BREAK
+case 90:
+YY_RULE_SETUP
+#line 520 "pars0lex.l"
+{
+ return(PARS_SHARE_TOKEN);
+}
+ YY_BREAK
+case 91:
+YY_RULE_SETUP
+#line 524 "pars0lex.l"
+{
+ return(PARS_MODE_TOKEN);
+}
+ YY_BREAK
+case 92:
+YY_RULE_SETUP
+#line 528 "pars0lex.l"
+{
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*)yytext,
+ ut_strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+ YY_BREAK
+case 93:
+YY_RULE_SETUP
+#line 535 "pars0lex.l"
+{
+ return(PARS_DDOT_TOKEN);
+}
+ YY_BREAK
+case 94:
+YY_RULE_SETUP
+#line 539 "pars0lex.l"
+{
+ return(PARS_ASSIGN_TOKEN);
+}
+ YY_BREAK
+case 95:
+YY_RULE_SETUP
+#line 543 "pars0lex.l"
+{
+ return(PARS_LE_TOKEN);
+}
+ YY_BREAK
+case 96:
+YY_RULE_SETUP
+#line 547 "pars0lex.l"
+{
+ return(PARS_GE_TOKEN);
+}
+ YY_BREAK
+case 97:
+YY_RULE_SETUP
+#line 551 "pars0lex.l"
+{
+ return(PARS_NE_TOKEN);
+}
+ YY_BREAK
+case 98:
+YY_RULE_SETUP
+#line 555 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 99:
+YY_RULE_SETUP
+#line 560 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 100:
+YY_RULE_SETUP
+#line 565 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 101:
+YY_RULE_SETUP
+#line 570 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 102:
+YY_RULE_SETUP
+#line 575 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 103:
+YY_RULE_SETUP
+#line 580 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 104:
+YY_RULE_SETUP
+#line 585 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 105:
+YY_RULE_SETUP
+#line 590 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 106:
+YY_RULE_SETUP
+#line 595 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 107:
+YY_RULE_SETUP
+#line 600 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 108:
+YY_RULE_SETUP
+#line 605 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 109:
+YY_RULE_SETUP
+#line 610 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 110:
+YY_RULE_SETUP
+#line 615 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 111:
+YY_RULE_SETUP
+#line 620 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 112:
+YY_RULE_SETUP
+#line 625 "pars0lex.l"
+{
+
+ return((int)(*yytext));
+}
+ YY_BREAK
+case 113:
+YY_RULE_SETUP
+#line 630 "pars0lex.l"
+BEGIN(comment); /* eat up comment */
+ YY_BREAK
+case 114:
+/* rule 114 can match eol */
+YY_RULE_SETUP
+#line 632 "pars0lex.l"
+
+ YY_BREAK
+case 115:
+/* rule 115 can match eol */
+YY_RULE_SETUP
+#line 633 "pars0lex.l"
+
+ YY_BREAK
+case 116:
+YY_RULE_SETUP
+#line 634 "pars0lex.l"
+BEGIN(INITIAL);
+ YY_BREAK
+case 117:
+/* rule 117 can match eol */
+YY_RULE_SETUP
+#line 636 "pars0lex.l"
+/* eat up whitespace */
+ YY_BREAK
+case 118:
+YY_RULE_SETUP
+#line 639 "pars0lex.l"
+{
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+ YY_BREAK
+case 119:
+YY_RULE_SETUP
+#line 648 "pars0lex.l"
+YY_FATAL_ERROR( "flex scanner jammed" );
+ YY_BREAK
+#line 1916 "lexyy.c"
+case YY_STATE_EOF(INITIAL):
+case YY_STATE_EOF(comment):
+case YY_STATE_EOF(quoted):
+case YY_STATE_EOF(id):
+ yyterminate();
+
+ case YY_END_OF_BUFFER:
+ {
+ /* Amount of text matched not including the EOB char. */
+ int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1;
+
+ /* Undo the effects of YY_DO_BEFORE_ACTION. */
+ *yy_cp = (yy_hold_char);
+ YY_RESTORE_YY_MORE_OFFSET
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+ {
+ /* We're scanning a new file or input source. It's
+ * possible that this happened because the user
+ * just pointed yyin at a new source and called
+ * yylex(). If so, then we have to assure
+ * consistency between YY_CURRENT_BUFFER and our
+ * globals. Here is the right place to do so, because
+ * this is the first action (other than possibly a
+ * back-up) that will match for the new input source.
+ */
+ (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+ }
+
+ /* Note that here we test for yy_c_buf_p "<=" to the position
+ * of the first EOB in the buffer, since yy_c_buf_p will
+ * already have been incremented past the NUL character
+ * (since all states make transitions on EOB to the
+ * end-of-buffer state). Contrast this with the test
+ * in input().
+ */
+ if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+ { /* This was really a NUL. */
+ yy_state_type yy_next_state;
+
+ (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( );
+
+ /* Okay, we're now positioned to make the NUL
+ * transition. We couldn't have
+ * yy_get_previous_state() go ahead and do it
+ * for us because it doesn't know how to deal
+ * with the possibility of jamming (and we don't
+ * want to build jamming into it because then it
+ * will run more slowly).
+ */
+
+ yy_next_state = yy_try_NUL_trans( yy_current_state );
+
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+
+ if ( yy_next_state )
+ {
+ /* Consume the NUL. */
+ yy_cp = ++(yy_c_buf_p);
+ yy_current_state = yy_next_state;
+ goto yy_match;
+ }
+
+ else
+ {
+ yy_cp = (yy_last_accepting_cpos);
+ yy_current_state = (yy_last_accepting_state);
+ goto yy_find_action;
+ }
+ }
+
+ else switch ( yy_get_next_buffer( ) )
+ {
+ case EOB_ACT_END_OF_FILE:
+ {
+ (yy_did_buffer_switch_on_eof) = 0;
+
+ if ( yywrap( ) )
+ {
+ /* Note: because we've taken care in
+ * yy_get_next_buffer() to have set up
+ * yytext, we can now set up
+ * yy_c_buf_p so that if some total
+ * hoser (like flex itself) wants to
+ * call the scanner after we return the
+ * YY_NULL, it'll still work - another
+ * YY_NULL will get returned.
+ */
+ (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ;
+
+ yy_act = YY_STATE_EOF(YY_START);
+ goto do_action;
+ }
+
+ else
+ {
+ if ( ! (yy_did_buffer_switch_on_eof) )
+ YY_NEW_FILE;
+ }
+ break;
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ (yy_c_buf_p) =
+ (yytext_ptr) + yy_amount_of_matched_text;
+
+ yy_current_state = yy_get_previous_state( );
+
+ yy_cp = (yy_c_buf_p);
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+ goto yy_match;
+
+ case EOB_ACT_LAST_MATCH:
+ (yy_c_buf_p) =
+ &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)];
+
+ yy_current_state = yy_get_previous_state( );
+
+ yy_cp = (yy_c_buf_p);
+ yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+ goto yy_find_action;
+ }
+ break;
+ }
+
+ default:
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--no action found" );
+ } /* end of action switch */
+ } /* end of scanning one token */
+} /* end of yylex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ * EOB_ACT_LAST_MATCH -
+ * EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ * EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (void)
+{
+ register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+ register char *source = (yytext_ptr);
+ register int number_to_move, i;
+ int ret_val;
+
+ if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] )
+ YY_FATAL_ERROR(
+ "fatal flex scanner internal error--end of buffer missed" );
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+ { /* Don't try to fill the buffer, so this is an EOF. */
+ if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 )
+ {
+ /* We matched a single character, the EOB, so
+ * treat this as a final EOF.
+ */
+ return EOB_ACT_END_OF_FILE;
+ }
+
+ else
+ {
+ /* We matched some text prior to the EOB, first
+ * process it.
+ */
+ return EOB_ACT_LAST_MATCH;
+ }
+ }
+
+ /* Try to read more data. */
+
+ /* First move last chars to start of buffer. */
+ number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1;
+
+ for ( i = 0; i < number_to_move; ++i )
+ *(dest++) = *(source++);
+
+ if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+ /* don't do the read, it's not guaranteed to return an EOF,
+ * just force an EOF
+ */
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0;
+
+ else
+ {
+ size_t num_to_read =
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+ while ( num_to_read <= 0 )
+ { /* Not enough room in the buffer - grow it. */
+
+ /* just a shorter name for the current buffer */
+ YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+ int yy_c_buf_p_offset =
+ (int) ((yy_c_buf_p) - b->yy_ch_buf);
+
+ if ( b->yy_is_our_buffer )
+ {
+ int new_size = b->yy_buf_size * 2;
+
+ if ( new_size <= 0 )
+ b->yy_buf_size += b->yy_buf_size / 8;
+ else
+ b->yy_buf_size *= 2;
+
+ b->yy_ch_buf = (char *)
+ /* Include room in for 2 EOB chars. */
+ yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 );
+ }
+ else
+ /* Can't grow it, we don't own it. */
+ b->yy_ch_buf = 0;
+
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR(
+ "fatal error - scanner input buffer overflow" );
+
+ (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+ num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+ number_to_move - 1;
+
+ }
+
+ if ( num_to_read > YY_READ_BUF_SIZE )
+ num_to_read = YY_READ_BUF_SIZE;
+
+ /* Read in more data. */
+ YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+ (yy_n_chars), num_to_read );
+
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ if ( (yy_n_chars) == 0 )
+ {
+ if ( number_to_move == YY_MORE_ADJ )
+ {
+ ret_val = EOB_ACT_END_OF_FILE;
+ yyrestart(yyin );
+ }
+
+ else
+ {
+ ret_val = EOB_ACT_LAST_MATCH;
+ YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+ YY_BUFFER_EOF_PENDING;
+ }
+ }
+
+ else
+ ret_val = EOB_ACT_CONTINUE_SCAN;
+
+ (yy_n_chars) += number_to_move;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
+ YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR;
+
+ (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+ return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached */
+
+ static yy_state_type yy_get_previous_state (void)
+{
+ register yy_state_type yy_current_state;
+ register char *yy_cp;
+
+ yy_current_state = (yy_start);
+
+ for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp )
+ {
+ register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1);
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 399 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ }
+
+ return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ *
+ * synopsis
+ * next_state = yy_try_NUL_trans( current_state );
+ */
+ static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state )
+{
+ register int yy_is_jam;
+ register char *yy_cp = (yy_c_buf_p);
+
+ register YY_CHAR yy_c = 1;
+ if ( yy_accept[yy_current_state] )
+ {
+ (yy_last_accepting_state) = yy_current_state;
+ (yy_last_accepting_cpos) = yy_cp;
+ }
+ while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+ {
+ yy_current_state = (int) yy_def[yy_current_state];
+ if ( yy_current_state >= 399 )
+ yy_c = yy_meta[(unsigned int) yy_c];
+ }
+ yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+ yy_is_jam = (yy_current_state == 398);
+
+ return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+ static int yyinput (void)
+#else
+ static int input (void)
+#endif
+
+{
+ int c;
+
+ *(yy_c_buf_p) = (yy_hold_char);
+
+ if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR )
+ {
+ /* yy_c_buf_p now points to the character we want to return.
+ * If this occurs *before* the EOB characters, then it's a
+ * valid NUL; if not, then we've hit the end of the buffer.
+ */
+ if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+ /* This was really a NUL. */
+ *(yy_c_buf_p) = '\0';
+
+ else
+ { /* need more input */
+ int offset = (int)((yy_c_buf_p) - (yytext_ptr));
+ ++(yy_c_buf_p);
+
+ switch ( yy_get_next_buffer( ) )
+ {
+ case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_get_next_buffer()
+ * sees that we've accumulated a
+ * token and flags that we need to
+ * try matching the token before
+ * proceeding. But for input(),
+ * there's no matching to consider.
+ * So convert the EOB_ACT_LAST_MATCH
+ * to EOB_ACT_END_OF_FILE.
+ */
+
+ /* Reset buffer status. */
+ yyrestart(yyin );
+
+ /*FALLTHROUGH*/
+
+ case EOB_ACT_END_OF_FILE:
+ {
+ if ( yywrap( ) )
+ return EOF;
+
+ if ( ! (yy_did_buffer_switch_on_eof) )
+ YY_NEW_FILE;
+#ifdef __cplusplus
+ return yyinput();
+#else
+ return input();
+#endif
+ }
+
+ case EOB_ACT_CONTINUE_SCAN:
+ (yy_c_buf_p) = (yytext_ptr) + offset;
+ break;
+ }
+ }
+ }
+
+ c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */
+ *(yy_c_buf_p) = '\0'; /* preserve yytext */
+ (yy_hold_char) = *++(yy_c_buf_p);
+
+ return c;
+}
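+
+/* Editorial sketch, not produced by flex: input()/yyinput() reads one more
+ * character past the current match, which is the usual way a rule action
+ * skips over text such as a bracketed comment.  The helper name below is
+ * hypothetical; it sits inside the YY_NO_INPUT guard on purpose so that it
+ * only exists when input() itself does.
+ */
+__attribute__((unused)) static void yy_example_skip_comment (void)
+{
+	int c;
+	int prev = 0;
+
+	for ( ; ; )
+	{
+#ifdef __cplusplus
+		c = yyinput();
+#else
+		c = input();
+#endif
+		if ( c == EOF )
+			break;		/* unterminated comment */
+
+		if ( prev == '*' && c == '/' )
+			break;		/* closing delimiter found */
+
+		prev = c;
+	}
+}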
+#endif /* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ *
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+ static void yyrestart (FILE * input_file )
+{
+
+ if ( ! YY_CURRENT_BUFFER ){
+ yyensure_buffer_stack ();
+ YY_CURRENT_BUFFER_LVALUE =
+ yy_create_buffer(yyin,YY_BUF_SIZE );
+ }
+
+ yy_init_buffer(YY_CURRENT_BUFFER,input_file );
+ yy_load_buffer_state( );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ *
+ */
+ __attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer )
+{
+
+ /* TODO. We should be able to replace this entire function body
+ * with
+ * yypop_buffer_state();
+ * yypush_buffer_state(new_buffer);
+ */
+ yyensure_buffer_stack ();
+ if ( YY_CURRENT_BUFFER == new_buffer )
+ return;
+
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *(yy_c_buf_p) = (yy_hold_char);
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+ yy_load_buffer_state( );
+
+ /* We don't actually know whether we did this switch during
+ * EOF (yywrap()) processing, but the only time this flag
+ * is looked at is after yywrap() is called, so it's safe
+ * to go ahead and always set it.
+ */
+ (yy_did_buffer_switch_on_eof) = 1;
+}
+
+static void yy_load_buffer_state (void)
+{
+ (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+ (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+ yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+ (yy_hold_char) = *(yy_c_buf_p);
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ *
+ * @return the allocated buffer state.
+ */
+ static YY_BUFFER_STATE yy_create_buffer (FILE * file, int size )
+{
+ YY_BUFFER_STATE b;
+
+ b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) );
+ if ( ! b )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_buf_size = size;
+
+ /* yy_ch_buf has to be 2 characters longer than the size given because
+ * we need to put in 2 end-of-buffer characters.
+ */
+ b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 );
+ if ( ! b->yy_ch_buf )
+ YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+ b->yy_is_our_buffer = 1;
+
+ yy_init_buffer(b,file );
+
+ return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with yy_create_buffer()
+ *
+ */
+ static void yy_delete_buffer (YY_BUFFER_STATE b )
+{
+
+ if ( ! b )
+ return;
+
+ if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+ YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+ if ( b->yy_is_our_buffer )
+ yyfree((void *) b->yy_ch_buf );
+
+ yyfree((void *) b );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a yyrestart() or at EOF.
+ */
+ static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file )
+
+{
+ int oerrno = errno;
+
+ yy_flush_buffer(b );
+
+ b->yy_input_file = file;
+ b->yy_fill_buffer = 1;
+
+ /* If b is the current buffer, then yy_init_buffer was _probably_
+ * called from yyrestart() or through yy_get_next_buffer.
+ * In that case, we don't want to reset the lineno or column.
+ */
+ if (b != YY_CURRENT_BUFFER){
+ b->yy_bs_lineno = 1;
+ b->yy_bs_column = 0;
+ }
+
+ b->yy_is_interactive = 0;
+
+ errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ *
+ */
+ static void yy_flush_buffer (YY_BUFFER_STATE b )
+{
+ if ( ! b )
+ return;
+
+ b->yy_n_chars = 0;
+
+ /* We always need two end-of-buffer characters. The first causes
+ * a transition to the end-of-buffer state. The second causes
+ * a jam in that state.
+ */
+ b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+ b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+ b->yy_buf_pos = &b->yy_ch_buf[0];
+
+ b->yy_at_bol = 1;
+ b->yy_buffer_status = YY_BUFFER_NEW;
+
+ if ( b == YY_CURRENT_BUFFER )
+ yy_load_buffer_state( );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ * the current state. This function will allocate the stack
+ * if necessary.
+ * @param new_buffer The new state.
+ *
+ */
+__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
+{
+ if (new_buffer == NULL)
+ return;
+
+ yyensure_buffer_stack();
+
+ /* This block is copied from yy_switch_to_buffer. */
+ if ( YY_CURRENT_BUFFER )
+ {
+ /* Flush out information for old buffer. */
+ *(yy_c_buf_p) = (yy_hold_char);
+ YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+ YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+ }
+
+ /* Only push if top exists. Otherwise, replace top. */
+ if (YY_CURRENT_BUFFER)
+ (yy_buffer_stack_top)++;
+ YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+ /* copied from yy_switch_to_buffer. */
+ yy_load_buffer_state( );
+ (yy_did_buffer_switch_on_eof) = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ * The next element becomes the new top.
+ *
+ */
+__attribute__((unused)) static void yypop_buffer_state (void)
+{
+ if (!YY_CURRENT_BUFFER)
+ return;
+
+ yy_delete_buffer(YY_CURRENT_BUFFER );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ if ((yy_buffer_stack_top) > 0)
+ --(yy_buffer_stack_top);
+
+ if (YY_CURRENT_BUFFER) {
+ yy_load_buffer_state( );
+ (yy_did_buffer_switch_on_eof) = 1;
+ }
+}
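+
+/* Editorial sketch, not produced by flex: the push/pop pair above is the
+ * usual way to suspend the current input, scan a nested stream (an include
+ * file, for example), and then resume exactly where scanning left off.
+ * The helper name and its FILE* argument are hypothetical.
+ */
+__attribute__((unused)) static void yy_example_scan_nested (FILE * nested_file )
+{
+	/* Save the current buffer on the stack and make the new one current. */
+	yypush_buffer_state(yy_create_buffer(nested_file,YY_BUF_SIZE ) );
+
+	/* ... the caller would run the scanner over the nested stream here ... */
+
+	/* Drop the nested buffer; the saved buffer becomes current again. */
+	yypop_buffer_state();
+}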
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ */
+static void yyensure_buffer_stack (void)
+{
+ int num_to_alloc;
+
+ if (!(yy_buffer_stack)) {
+
+		/* The first allocation is for a single element, since we don't
+		 * know yet whether this scanner will even need a stack.
+		 */
+ num_to_alloc = 1;
+ (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
+ (num_to_alloc * sizeof(struct yy_buffer_state*)
+ );
+
+ memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+ (yy_buffer_stack_max) = num_to_alloc;
+ (yy_buffer_stack_top) = 0;
+ return;
+ }
+
+ if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){
+
+ /* Increase the buffer to prepare for a possible push. */
+ int grow_size = 8 /* arbitrary grow size */;
+
+ num_to_alloc = (yy_buffer_stack_max) + grow_size;
+ (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc
+ ((yy_buffer_stack),
+ num_to_alloc * sizeof(struct yy_buffer_state*)
+ );
+
+		/* Zero only the new slots. */
+ memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
+ (yy_buffer_stack_max) = num_to_alloc;
+ }
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg )
+{
+ (void) fprintf( stderr, "%s\n", msg );
+ exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+ do \
+ { \
+ /* Undo effects of setting up yytext. */ \
+ int yyless_macro_arg = (n); \
+ YY_LESS_LINENO(yyless_macro_arg);\
+ yytext[yyleng] = (yy_hold_char); \
+ (yy_c_buf_p) = yytext + yyless_macro_arg; \
+ (yy_hold_char) = *(yy_c_buf_p); \
+ *(yy_c_buf_p) = '\0'; \
+ yyleng = yyless_macro_arg; \
+ } \
+ while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the current line number.
+ *
+ */
+__attribute__((unused)) static int yyget_lineno (void)
+{
+
+ return yylineno;
+}
+
+/** Get the input stream.
+ *
+ */
+__attribute__((unused)) static FILE *yyget_in (void)
+{
+ return yyin;
+}
+
+/** Get the output stream.
+ *
+ */
+__attribute__((unused)) static FILE *yyget_out (void)
+{
+ return yyout;
+}
+
+/** Get the length of the current token.
+ *
+ */
+__attribute__((unused)) static int yyget_leng (void)
+{
+ return yyleng;
+}
+
+/** Get the current token.
+ *
+ */
+
+__attribute__((unused)) static char *yyget_text (void)
+{
+ return yytext;
+}
+
+/** Set the current line number.
+ * @param line_number
+ *
+ */
+__attribute__((unused)) static void yyset_lineno (int line_number )
+{
+
+ yylineno = line_number;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ *
+ * @see yy_switch_to_buffer
+ */
+__attribute__((unused)) static void yyset_in (FILE * in_str )
+{
+ yyin = in_str ;
+}
+
+__attribute__((unused)) static void yyset_out (FILE * out_str )
+{
+ yyout = out_str ;
+}
+
+__attribute__((unused)) static int yyget_debug (void)
+{
+ return yy_flex_debug;
+}
+
+__attribute__((unused)) static void yyset_debug (int bdebug )
+{
+ yy_flex_debug = bdebug ;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners. */
+__attribute__((unused)) static int yylex_destroy (void)
+{
+
+ /* Pop the buffer stack, destroying each element. */
+ while(YY_CURRENT_BUFFER){
+ yy_delete_buffer(YY_CURRENT_BUFFER );
+ YY_CURRENT_BUFFER_LVALUE = NULL;
+ yypop_buffer_state();
+ }
+
+ /* Destroy the stack itself. */
+ yyfree((yy_buffer_stack) );
+ (yy_buffer_stack) = NULL;
+
+ return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
+{
+ register int i;
+ for ( i = 0; i < n; ++i )
+ s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s )
+{
+ register int n;
+ for ( n = 0; s[n]; ++n )
+ ;
+
+ return n;
+}
+#endif
+
+static void *yyalloc (yy_size_t size )
+{
+ return (void *) malloc( size );
+}
+
+static void *yyrealloc (void * ptr, yy_size_t size )
+{
+ /* The cast to (char *) in the following accommodates both
+ * implementations that use char* generic pointers, and those
+ * that use void* generic pointers. It works with the latter
+ * because both ANSI C and C++ allow castless assignment from
+ * any pointer type to void*, and deal with argument conversions
+ * as though doing an assignment.
+ */
+ return (void *) realloc( (char *) ptr, size );
+}
+
+static void yyfree (void * ptr )
+{
+ free( (char *) ptr ); /* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef yytext_ptr
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+#line 648 "pars0lex.l"
+
+
+
+
+/**********************************************************************
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void)
+/*==================*/
+{
+ if (yy_buffer_stack)
+ yylex_destroy();
+ if (stringbuf)
+ free(stringbuf);
+ stringbuf = NULL;
+ stringbuf_len_alloc = stringbuf_len = 0;
+}
diff --git a/storage/xtradb/pars/make_bison.sh b/storage/xtradb/pars/make_bison.sh
new file mode 100755
index 00000000000..09bb86e3106
--- /dev/null
+++ b/storage/xtradb/pars/make_bison.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# generate parser files from bison input files.
+
+set -eu
+TMPFILE=pars0grm.tab.c
+OUTFILE=pars0grm.c
+
+bison -d pars0grm.y
+mv pars0grm.tab.h ../include/pars0grm.h
+
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/;
+s/\(\(YYSTYPE\|int\) yy\(lval\|parse\)\)/UNIV_INTERN \1/;
+' < "$TMPFILE" > "$OUTFILE"
+
+rm "$TMPFILE"
diff --git a/storage/xtradb/pars/make_flex.sh b/storage/xtradb/pars/make_flex.sh
new file mode 100755
index 00000000000..89308a6636f
--- /dev/null
+++ b/storage/xtradb/pars/make_flex.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# generate lexer files from flex input files.
+
+set -eu
+
+TMPFILE=_flex_tmp.c
+OUTFILE=lexyy.c
+
+flex -o $TMPFILE pars0lex.l
+
+# AIX needs its includes done in a certain order, so include "univ.i" first
+# to be sure we get it right.
+echo '#include "univ.i"' > $OUTFILE
+
+# flex assigns a pointer to an int in one place without a cast, resulting in
+# a warning on Win64. Add the cast. Also define some symbols as static.
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/\(int offset = \)\((yy_c_buf_p) - (yytext_ptr)\);/\1(int)(\2);/;
+s/\(void yy\(restart\|_\(delete\|flush\)_buffer\)\)/static \1/;
+s/\(void yy_switch_to_buffer\)/__attribute__((unused)) static \1/;
+s/\(void yy\(push\|pop\)_buffer_state\)/__attribute__((unused)) static \1/;
+s/\(YY_BUFFER_STATE yy_create_buffer\)/static \1/;
+s/\(\(int\|void\) yy[gs]et_\)/__attribute__((unused)) static \1/;
+s/\(void \*\?yy\(\(re\)\?alloc\|free\)\)/static \1/;
+s/\(extern \)\?\(int yy\(leng\|lineno\|_flex_debug\)\)/static \2/;
+s/\(int yylex_destroy\)/__attribute__((unused)) static \1/;
+s/\(extern \)\?\(int yylex \)/UNIV_INTERN \2/;
+s/^\(\(FILE\|char\) *\* *yyget\)/__attribute__((unused)) static \1/;
+s/^\(extern \)\?\(\(FILE\|char\) *\* *yy\)/static \2/;
+' < $TMPFILE >> $OUTFILE
+
+rm $TMPFILE
diff --git a/storage/xtradb/pars/pars0grm.c b/storage/xtradb/pars/pars0grm.c
new file mode 100644
index 00000000000..d667970735e
--- /dev/null
+++ b/storage/xtradb/pars/pars0grm.c
@@ -0,0 +1,2601 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
+
+As a special exception, when this file is copied by Bison into a
+Bison output file, you may use that output file without restriction.
+This special exception was added by the Free Software Foundation
+in version 1.24 of Bison.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* A Bison parser, made by GNU Bison 2.0. */
+
+/* Written by Richard Stallman by simplifying the original so called
+ ``semantic'' parser. */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+ infringing on user name space. This should be done even for local
+ variables, as they might otherwise be expanded by user macros.
+ There are some unavoidable exceptions within include files to
+ define necessary library symbols; they are noted "INFRINGES ON
+ USER NAME SPACE" below. */
+
+/* Identify Bison output. */
+#define YYBISON 1
+
+/* Skeleton name. */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers. */
+#define YYPURE 0
+
+/* Using locations. */
+#define YYLSP_NEEDED 0
+
+
+
+/* Tokens. */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+ /* Put the tokens into the symbol table, so that GDB and other debuggers
+ know about them. */
+ enum yytokentype {
+ PARS_INT_LIT = 258,
+ PARS_FLOAT_LIT = 259,
+ PARS_STR_LIT = 260,
+ PARS_FIXBINARY_LIT = 261,
+ PARS_BLOB_LIT = 262,
+ PARS_NULL_LIT = 263,
+ PARS_ID_TOKEN = 264,
+ PARS_AND_TOKEN = 265,
+ PARS_OR_TOKEN = 266,
+ PARS_NOT_TOKEN = 267,
+ PARS_GE_TOKEN = 268,
+ PARS_LE_TOKEN = 269,
+ PARS_NE_TOKEN = 270,
+ PARS_PROCEDURE_TOKEN = 271,
+ PARS_IN_TOKEN = 272,
+ PARS_OUT_TOKEN = 273,
+ PARS_BINARY_TOKEN = 274,
+ PARS_BLOB_TOKEN = 275,
+ PARS_INT_TOKEN = 276,
+ PARS_INTEGER_TOKEN = 277,
+ PARS_FLOAT_TOKEN = 278,
+ PARS_CHAR_TOKEN = 279,
+ PARS_IS_TOKEN = 280,
+ PARS_BEGIN_TOKEN = 281,
+ PARS_END_TOKEN = 282,
+ PARS_IF_TOKEN = 283,
+ PARS_THEN_TOKEN = 284,
+ PARS_ELSE_TOKEN = 285,
+ PARS_ELSIF_TOKEN = 286,
+ PARS_LOOP_TOKEN = 287,
+ PARS_WHILE_TOKEN = 288,
+ PARS_RETURN_TOKEN = 289,
+ PARS_SELECT_TOKEN = 290,
+ PARS_SUM_TOKEN = 291,
+ PARS_COUNT_TOKEN = 292,
+ PARS_DISTINCT_TOKEN = 293,
+ PARS_FROM_TOKEN = 294,
+ PARS_WHERE_TOKEN = 295,
+ PARS_FOR_TOKEN = 296,
+ PARS_DDOT_TOKEN = 297,
+ PARS_READ_TOKEN = 298,
+ PARS_ORDER_TOKEN = 299,
+ PARS_BY_TOKEN = 300,
+ PARS_ASC_TOKEN = 301,
+ PARS_DESC_TOKEN = 302,
+ PARS_INSERT_TOKEN = 303,
+ PARS_INTO_TOKEN = 304,
+ PARS_VALUES_TOKEN = 305,
+ PARS_UPDATE_TOKEN = 306,
+ PARS_SET_TOKEN = 307,
+ PARS_DELETE_TOKEN = 308,
+ PARS_CURRENT_TOKEN = 309,
+ PARS_OF_TOKEN = 310,
+ PARS_CREATE_TOKEN = 311,
+ PARS_TABLE_TOKEN = 312,
+ PARS_INDEX_TOKEN = 313,
+ PARS_UNIQUE_TOKEN = 314,
+ PARS_CLUSTERED_TOKEN = 315,
+ PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+ PARS_ON_TOKEN = 317,
+ PARS_ASSIGN_TOKEN = 318,
+ PARS_DECLARE_TOKEN = 319,
+ PARS_CURSOR_TOKEN = 320,
+ PARS_SQL_TOKEN = 321,
+ PARS_OPEN_TOKEN = 322,
+ PARS_FETCH_TOKEN = 323,
+ PARS_CLOSE_TOKEN = 324,
+ PARS_NOTFOUND_TOKEN = 325,
+ PARS_TO_CHAR_TOKEN = 326,
+ PARS_TO_NUMBER_TOKEN = 327,
+ PARS_TO_BINARY_TOKEN = 328,
+ PARS_BINARY_TO_NUMBER_TOKEN = 329,
+ PARS_SUBSTR_TOKEN = 330,
+ PARS_REPLSTR_TOKEN = 331,
+ PARS_CONCAT_TOKEN = 332,
+ PARS_INSTR_TOKEN = 333,
+ PARS_LENGTH_TOKEN = 334,
+ PARS_SYSDATE_TOKEN = 335,
+ PARS_PRINTF_TOKEN = 336,
+ PARS_ASSERT_TOKEN = 337,
+ PARS_RND_TOKEN = 338,
+ PARS_RND_STR_TOKEN = 339,
+ PARS_ROW_PRINTF_TOKEN = 340,
+ PARS_COMMIT_TOKEN = 341,
+ PARS_ROLLBACK_TOKEN = 342,
+ PARS_WORK_TOKEN = 343,
+ PARS_UNSIGNED_TOKEN = 344,
+ PARS_EXIT_TOKEN = 345,
+ PARS_FUNCTION_TOKEN = 346,
+ PARS_LOCK_TOKEN = 347,
+ PARS_SHARE_TOKEN = 348,
+ PARS_MODE_TOKEN = 349,
+ NEG = 350
+ };
+#endif
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define NEG 350
+
+
+
+
+/* Copy the first part of user declarations. */
+#line 13 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h> /* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+
+
+/* Enabling traces. */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages. */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+/* Copy the second part of user declarations. */
+
+
+/* Line 213 of yacc.c. */
+#line 297 "pars0grm.c"
+
+#if ! defined (yyoverflow) || YYERROR_VERBOSE
+
+# ifndef YYFREE
+# define YYFREE free
+# endif
+# ifndef YYMALLOC
+# define YYMALLOC malloc
+# endif
+
+/* The parser invokes alloca or malloc; define the necessary symbols. */
+
+# ifdef YYSTACK_USE_ALLOCA
+# if YYSTACK_USE_ALLOCA
+# ifdef __GNUC__
+# define YYSTACK_ALLOC __builtin_alloca
+# else
+# define YYSTACK_ALLOC alloca
+# endif
+# endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+ /* Pacify GCC's `empty if-body' warning. */
+# define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
+# else
+# if defined (__STDC__) || defined (__cplusplus)
+# include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# endif
+# define YYSTACK_ALLOC YYMALLOC
+# define YYSTACK_FREE YYFREE
+# endif
+#endif /* ! defined (yyoverflow) || YYERROR_VERBOSE */
+
+
+#if (! defined (yyoverflow) \
+ && (! defined (__cplusplus) \
+ || (defined (YYSTYPE_IS_TRIVIAL) && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member. */
+union yyalloc
+{
+ short int yyss;
+ YYSTYPE yyvs;
+ };
+
+/* The size of the maximum gap between one aligned stack and the next. */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+ N elements. */
+# define YYSTACK_BYTES(N) \
+ ((N) * (sizeof (short int) + sizeof (YYSTYPE)) \
+ + YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO. The source and destination do
+ not overlap. */
+# ifndef YYCOPY
+# if defined (__GNUC__) && 1 < __GNUC__
+# define YYCOPY(To, From, Count) \
+ __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+# else
+# define YYCOPY(To, From, Count) \
+ do \
+ { \
+ register YYSIZE_T yyi; \
+ for (yyi = 0; yyi < (Count); yyi++) \
+ (To)[yyi] = (From)[yyi]; \
+ } \
+ while (0)
+# endif
+# endif
+
+/* Relocate STACK from its old location to the new one. The
+ local variables YYSIZE and YYSTACKSIZE give the old and new number of
+ elements in the stack, and YYPTR gives the new location of the
+ stack. Advance YYPTR to a properly aligned location for the next
+ stack. */
+# define YYSTACK_RELOCATE(Stack) \
+ do \
+ { \
+ YYSIZE_T yynewbytes; \
+ YYCOPY (&yyptr->Stack, Stack, yysize); \
+ Stack = &yyptr->Stack; \
+ yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+ yyptr += yynewbytes / sizeof (*yyptr); \
+ } \
+ while (0)
+
+#endif
+
+#if defined (__STDC__) || defined (__cplusplus)
+ typedef signed char yysigned_char;
+#else
+ typedef short int yysigned_char;
+#endif
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL 5
+/* YYLAST -- Last index in YYTABLE. */
+#define YYLAST 752
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS 111
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS 70
+/* YYNRULES -- Number of rules. */
+#define YYNRULES 175
+/* YYNSTATES -- Number of states.  */
+#define YYNSTATES 339
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */
+#define YYUNDEFTOK 2
+#define YYMAXUTOK 350
+
+#define YYTRANSLATE(YYX) \
+ ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */
+static const unsigned char yytranslate[] =
+{
+ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 103, 2, 2,
+ 105, 106, 100, 99, 108, 98, 2, 101, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 104,
+ 96, 95, 97, 107, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 109, 2, 110, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+ 102
+};
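+
+/* Editorial sketch, not produced by Bison: YYTRANSLATE() maps the external
+ * token numbers returned by yylex() into the parser's internal symbol
+ * numbers through the table above, and collapses anything larger than
+ * YYMAXUTOK to YYUNDEFTOK.  For example, PARS_INT_LIT (258) translates to
+ * internal symbol 3.  The function name is hypothetical.
+ */
+__attribute__((unused)) static int yy_example_translate (int lex_token )
+{
+	/* yy_example_translate(PARS_INT_LIT) == 3
+	   yy_example_translate(YYMAXUTOK + 1) == YYUNDEFTOK */
+	return YYTRANSLATE (lex_token);
+}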
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+ YYRHS. */
+static const unsigned short int yyprhs[] =
+{
+ 0, 0, 3, 6, 8, 11, 14, 17, 20, 23,
+ 26, 29, 32, 35, 38, 41, 44, 47, 50, 53,
+ 56, 59, 62, 65, 68, 71, 73, 76, 78, 83,
+ 85, 87, 89, 91, 93, 95, 97, 101, 105, 109,
+ 113, 116, 120, 124, 128, 132, 136, 140, 144, 148,
+ 152, 155, 159, 163, 165, 167, 169, 171, 173, 175,
+ 177, 179, 181, 183, 185, 186, 188, 192, 199, 204,
+ 206, 208, 210, 214, 216, 220, 221, 223, 227, 228,
+ 230, 234, 236, 241, 247, 252, 253, 255, 259, 261,
+ 265, 267, 268, 271, 272, 275, 276, 281, 282, 284,
+ 286, 287, 292, 301, 305, 311, 314, 318, 320, 324,
+ 329, 334, 337, 340, 344, 347, 350, 353, 357, 362,
+ 364, 367, 368, 371, 373, 381, 388, 399, 401, 403,
+ 406, 409, 414, 419, 425, 427, 431, 432, 436, 437,
+ 439, 440, 443, 444, 446, 454, 456, 460, 461, 463,
+ 464, 466, 477, 480, 483, 485, 487, 489, 491, 493,
+ 497, 501, 502, 504, 508, 512, 513, 515, 518, 525,
+ 530, 532, 534, 535, 537, 540
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS. */
+static const short int yyrhs[] =
+{
+ 112, 0, -1, 180, 104, -1, 118, -1, 119, 104,
+ -1, 151, 104, -1, 152, 104, -1, 153, 104, -1,
+ 150, 104, -1, 154, 104, -1, 146, 104, -1, 133,
+ 104, -1, 135, 104, -1, 145, 104, -1, 143, 104,
+ -1, 144, 104, -1, 140, 104, -1, 141, 104, -1,
+ 155, 104, -1, 157, 104, -1, 156, 104, -1, 169,
+ 104, -1, 170, 104, -1, 164, 104, -1, 168, 104,
+ -1, 113, -1, 114, 113, -1, 9, -1, 116, 105,
+ 124, 106, -1, 3, -1, 4, -1, 5, -1, 6,
+ -1, 7, -1, 8, -1, 66, -1, 115, 99, 115,
+ -1, 115, 98, 115, -1, 115, 100, 115, -1, 115,
+ 101, 115, -1, 98, 115, -1, 105, 115, 106, -1,
+ 115, 95, 115, -1, 115, 96, 115, -1, 115, 97,
+ 115, -1, 115, 13, 115, -1, 115, 14, 115, -1,
+ 115, 15, 115, -1, 115, 10, 115, -1, 115, 11,
+ 115, -1, 12, 115, -1, 9, 103, 70, -1, 66,
+ 103, 70, -1, 71, -1, 72, -1, 73, -1, 74,
+ -1, 75, -1, 77, -1, 78, -1, 79, -1, 80,
+ -1, 83, -1, 84, -1, -1, 107, -1, 117, 108,
+ 107, -1, 109, 9, 105, 117, 106, 110, -1, 120,
+ 105, 124, 106, -1, 76, -1, 81, -1, 82, -1,
+ 9, 105, 106, -1, 9, -1, 122, 108, 9, -1,
+ -1, 9, -1, 123, 108, 9, -1, -1, 115, -1,
+ 124, 108, 115, -1, 115, -1, 37, 105, 100, 106,
+ -1, 37, 105, 38, 9, 106, -1, 36, 105, 115,
+ 106, -1, -1, 125, -1, 126, 108, 125, -1, 100,
+ -1, 126, 49, 123, -1, 126, -1, -1, 40, 115,
+ -1, -1, 41, 51, -1, -1, 92, 17, 93, 94,
+ -1, -1, 46, -1, 47, -1, -1, 44, 45, 9,
+ 131, -1, 35, 127, 39, 122, 128, 129, 130, 132,
+ -1, 48, 49, 9, -1, 134, 50, 105, 124, 106,
+ -1, 134, 133, -1, 9, 95, 115, -1, 136, -1,
+ 137, 108, 136, -1, 40, 54, 55, 9, -1, 51,
+ 9, 52, 137, -1, 139, 128, -1, 139, 138, -1,
+ 53, 39, 9, -1, 142, 128, -1, 142, 138, -1,
+ 85, 133, -1, 9, 63, 115, -1, 31, 115, 29,
+ 114, -1, 147, -1, 148, 147, -1, -1, 30, 114,
+ -1, 148, -1, 28, 115, 29, 114, 149, 27, 28,
+ -1, 33, 115, 32, 114, 27, 32, -1, 41, 9,
+ 17, 115, 42, 115, 32, 114, 27, 32, -1, 90,
+ -1, 34, -1, 67, 9, -1, 69, 9, -1, 68,
+ 9, 49, 123, -1, 68, 9, 49, 121, -1, 9,
+ 171, 160, 161, 162, -1, 158, -1, 159, 108, 158,
+ -1, -1, 105, 3, 106, -1, -1, 89, -1, -1,
+ 12, 8, -1, -1, 61, -1, 56, 57, 9, 105,
+ 159, 106, 163, -1, 9, -1, 165, 108, 9, -1,
+ -1, 59, -1, -1, 60, -1, 56, 166, 167, 58,
+ 9, 62, 9, 105, 165, 106, -1, 86, 88, -1,
+ 87, 88, -1, 21, -1, 22, -1, 24, -1, 19,
+ -1, 20, -1, 9, 17, 171, -1, 9, 18, 171,
+ -1, -1, 172, -1, 173, 108, 172, -1, 9, 171,
+ 104, -1, -1, 174, -1, 175, 174, -1, 64, 65,
+ 9, 25, 133, 104, -1, 64, 91, 9, 104, -1,
+ 176, -1, 177, -1, -1, 178, -1, 179, 178, -1,
+ 16, 9, 105, 173, 106, 25, 175, 179, 26, 114,
+ 27, -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
+static const unsigned short int yyrline[] =
+{
+ 0, 138, 138, 141, 142, 143, 144, 145, 146, 147,
+ 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
+ 158, 159, 160, 161, 162, 166, 167, 172, 173, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
+ 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
+ 196, 197, 199, 204, 205, 206, 207, 209, 210, 211,
+ 212, 213, 214, 215, 218, 220, 221, 225, 230, 235,
+ 236, 237, 241, 245, 246, 251, 252, 253, 258, 259,
+ 260, 264, 265, 270, 276, 283, 284, 285, 290, 292,
+ 294, 298, 299, 303, 304, 309, 310, 315, 316, 317,
+ 321, 322, 327, 337, 342, 344, 349, 353, 354, 359,
+ 365, 372, 377, 382, 388, 393, 398, 403, 408, 414,
+ 415, 420, 421, 423, 427, 434, 440, 448, 452, 456,
+ 462, 468, 470, 475, 480, 481, 486, 487, 492, 493,
+ 499, 500, 506, 507, 513, 519, 520, 525, 526, 530,
+ 531, 535, 543, 548, 553, 554, 555, 556, 557, 561,
+ 564, 570, 571, 572, 577, 581, 583, 584, 588, 594,
+ 599, 600, 603, 605, 606, 610
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+ First, the terminals, then, starting at YYNTOKENS, nonterminals. */
+static const char *const yytname[] =
+{
+ "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
+ "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
+ "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
+ "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
+ "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
+ "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
+ "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
+ "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
+ "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
+ "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
+ "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
+ "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
+ "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
+ "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
+ "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
+ "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
+ "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
+ "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
+ "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
+ "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
+ "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
+ "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
+ "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
+ "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
+ "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
+ "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
+ "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
+ "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
+ "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
+ "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN", "'='", "'<'",
+ "'>'", "'-'", "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'",
+ "'?'", "','", "'{'", "'}'", "$accept", "top_statement", "statement",
+ "statement_list", "exp", "function_name", "question_mark_list",
+ "stored_procedure_call", "predefined_procedure_call",
+ "predefined_procedure_name", "user_function_call", "table_list",
+ "variable_list", "exp_list", "select_item", "select_item_list",
+ "select_list", "search_condition", "for_update_clause",
+ "lock_shared_clause", "order_direction", "order_by_clause",
+ "select_statement", "insert_statement_start", "insert_statement",
+ "column_assignment", "column_assignment_list", "cursor_positioned",
+ "update_statement_start", "update_statement_searched",
+ "update_statement_positioned", "delete_statement_start",
+ "delete_statement_searched", "delete_statement_positioned",
+ "row_printf_statement", "assignment_statement", "elsif_element",
+ "elsif_list", "else_part", "if_statement", "while_statement",
+ "for_statement", "exit_statement", "return_statement",
+ "open_cursor_statement", "close_cursor_statement", "fetch_statement",
+ "column_def", "column_def_list", "opt_column_len", "opt_unsigned",
+ "opt_not_null", "not_fit_in_memory", "create_table", "column_list",
+ "unique_def", "clustered_def", "create_index", "commit_statement",
+ "rollback_statement", "type_name", "parameter_declaration",
+ "parameter_declaration_list", "variable_declaration",
+ "variable_declaration_list", "cursor_declaration",
+ "function_declaration", "declaration", "declaration_list",
+ "procedure_definition", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+ token YYLEX-NUM. */
+static const unsigned short int yytoknum[] =
+{
+ 0, 256, 257, 258, 259, 260, 261, 262, 263, 264,
+ 265, 266, 267, 268, 269, 270, 271, 272, 273, 274,
+ 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
+ 295, 296, 297, 298, 299, 300, 301, 302, 303, 304,
+ 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334,
+ 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 61, 60, 62, 45, 43,
+ 42, 47, 350, 37, 59, 40, 41, 63, 44, 123,
+ 125
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */
+static const unsigned char yyr1[] =
+{
+ 0, 111, 112, 113, 113, 113, 113, 113, 113, 113,
+ 113, 113, 113, 113, 113, 113, 113, 113, 113, 113,
+ 113, 113, 113, 113, 113, 114, 114, 115, 115, 115,
+ 115, 115, 115, 115, 115, 115, 115, 115, 115, 115,
+ 115, 115, 115, 115, 115, 115, 115, 115, 115, 115,
+ 115, 115, 115, 116, 116, 116, 116, 116, 116, 116,
+ 116, 116, 116, 116, 117, 117, 117, 118, 119, 120,
+ 120, 120, 121, 122, 122, 123, 123, 123, 124, 124,
+ 124, 125, 125, 125, 125, 126, 126, 126, 127, 127,
+ 127, 128, 128, 129, 129, 130, 130, 131, 131, 131,
+ 132, 132, 133, 134, 135, 135, 136, 137, 137, 138,
+ 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
+ 148, 149, 149, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 157, 158, 159, 159, 160, 160, 161, 161,
+ 162, 162, 163, 163, 164, 165, 165, 166, 166, 167,
+ 167, 168, 169, 170, 171, 171, 171, 171, 171, 172,
+ 172, 173, 173, 173, 174, 175, 175, 175, 176, 177,
+ 178, 178, 179, 179, 179, 180
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */
+static const unsigned char yyr2[] =
+{
+ 0, 2, 2, 1, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 2, 1, 4, 1,
+ 1, 1, 1, 1, 1, 1, 3, 3, 3, 3,
+ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 3, 3, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0, 1, 3, 6, 4, 1,
+ 1, 1, 3, 1, 3, 0, 1, 3, 0, 1,
+ 3, 1, 4, 5, 4, 0, 1, 3, 1, 3,
+ 1, 0, 2, 0, 2, 0, 4, 0, 1, 1,
+ 0, 4, 8, 3, 5, 2, 3, 1, 3, 4,
+ 4, 2, 2, 3, 2, 2, 2, 3, 4, 1,
+ 2, 0, 2, 1, 7, 6, 10, 1, 1, 2,
+ 2, 4, 4, 5, 1, 3, 0, 3, 0, 1,
+ 0, 2, 0, 1, 7, 1, 3, 0, 1, 0,
+ 1, 10, 2, 2, 1, 1, 1, 1, 1, 3,
+ 3, 0, 1, 3, 3, 0, 1, 2, 6, 4,
+ 1, 1, 0, 1, 2, 11
+};
+
+/* YYDEFACT[STATE-NUM] -- Default rule to reduce with in state
+ STATE-NUM when YYTABLE doesn't specify something else to do. Zero
+ means the default is an error. */
+static const unsigned char yydefact[] =
+{
+ 0, 0, 0, 0, 0, 1, 2, 161, 0, 162,
+ 0, 0, 0, 0, 0, 157, 158, 154, 155, 156,
+ 159, 160, 165, 163, 0, 166, 172, 0, 0, 167,
+ 170, 171, 173, 0, 164, 0, 0, 0, 174, 0,
+ 0, 0, 0, 0, 128, 85, 0, 0, 0, 0,
+ 147, 0, 0, 0, 69, 70, 71, 0, 0, 0,
+ 127, 0, 25, 0, 3, 0, 0, 0, 0, 0,
+ 91, 0, 0, 91, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 169, 0, 29, 30, 31, 32, 33, 34, 27,
+ 0, 35, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 0, 0, 0, 0, 0, 0, 0,
+ 88, 81, 86, 90, 0, 0, 0, 0, 0, 0,
+ 148, 149, 129, 0, 130, 116, 152, 153, 0, 175,
+ 26, 4, 78, 11, 0, 105, 12, 0, 111, 112,
+ 16, 17, 114, 115, 14, 15, 13, 10, 8, 5,
+ 6, 7, 9, 18, 20, 19, 23, 24, 21, 22,
+ 0, 117, 0, 50, 0, 40, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 78, 0, 0, 0, 75, 0, 0, 0, 103, 0,
+ 113, 0, 150, 0, 75, 64, 79, 0, 78, 0,
+ 92, 168, 51, 52, 41, 48, 49, 45, 46, 47,
+ 121, 42, 43, 44, 37, 36, 38, 39, 0, 0,
+ 0, 0, 0, 76, 89, 87, 73, 91, 0, 0,
+ 107, 110, 0, 0, 76, 132, 131, 65, 0, 68,
+ 0, 0, 0, 0, 0, 119, 123, 0, 28, 0,
+ 84, 0, 82, 0, 0, 0, 93, 0, 0, 0,
+ 0, 134, 0, 0, 0, 0, 0, 80, 104, 109,
+ 122, 0, 120, 0, 125, 83, 77, 74, 0, 95,
+ 0, 106, 108, 136, 142, 0, 0, 72, 67, 66,
+ 0, 124, 94, 0, 100, 0, 0, 138, 143, 144,
+ 135, 0, 118, 0, 0, 102, 0, 0, 139, 140,
+ 0, 0, 0, 0, 137, 0, 133, 145, 0, 96,
+ 97, 126, 141, 151, 0, 98, 99, 101, 146
+};
+
+/* YYDEFGOTO[NTERM-NUM]. */
+static const short int yydefgoto[] =
+{
+ -1, 2, 62, 63, 206, 116, 248, 64, 65, 66,
+ 245, 237, 234, 207, 122, 123, 124, 148, 289, 304,
+ 337, 315, 67, 68, 69, 240, 241, 149, 70, 71,
+ 72, 73, 74, 75, 76, 77, 255, 256, 257, 78,
+ 79, 80, 81, 82, 83, 84, 85, 271, 272, 307,
+ 319, 326, 309, 86, 328, 131, 203, 87, 88, 89,
+ 20, 9, 10, 25, 26, 30, 31, 32, 33, 3
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+ STATE-NUM. */
+#define YYPACT_NINF -177
+static const short int yypact[] =
+{
+ 28, 38, 54, -46, -29, -177, -177, 56, 50, -177,
+ -75, 8, 8, 46, 56, -177, -177, -177, -177, -177,
+ -177, -177, 63, -177, 8, -177, 2, -26, -51, -177,
+ -177, -177, -177, -13, -177, 71, 72, 587, -177, 57,
+ -21, 26, 272, 272, -177, 13, 91, 55, 96, 67,
+ -22, 99, 100, 103, -177, -177, -177, 75, 29, 35,
+ -177, 116, -177, 396, -177, 22, 23, 27, -9, 30,
+ 87, 31, 32, 87, 47, 49, 52, 58, 59, 60,
+ 61, 62, 65, 66, 74, 77, 78, 86, 89, 102,
+ 75, -177, 272, -177, -177, -177, -177, -177, -177, 39,
+ 272, 51, -177, -177, -177, -177, -177, -177, -177, -177,
+ -177, -177, -177, 272, 272, 361, 25, 489, 45, 90,
+ -177, 651, -177, -39, 93, 142, 124, 108, 152, 170,
+ -177, 131, -177, 143, -177, -177, -177, -177, 98, -177,
+ -177, -177, 272, -177, 110, -177, -177, 256, -177, -177,
+ -177, -177, -177, -177, -177, -177, -177, -177, -177, -177,
+ -177, -177, -177, -177, -177, -177, -177, -177, -177, -177,
+ 112, 651, 137, 101, 147, 204, 88, 272, 272, 272,
+ 272, 272, 587, 272, 272, 272, 272, 272, 272, 272,
+ 272, 587, 272, -30, 211, 168, 212, 272, -177, 213,
+ -177, 118, -177, 167, 217, 122, 651, -63, 272, 175,
+ 651, -177, -177, -177, -177, 101, 101, 21, 21, 651,
+ 332, 21, 21, 21, -6, -6, 204, 204, -60, 460,
+ 198, 222, 126, -177, 125, -177, -177, -33, 584, 140,
+ -177, 128, 228, 229, 139, -177, 125, -177, -53, -177,
+ 272, -49, 240, 587, 272, -177, 224, 226, -177, 225,
+ -177, 150, -177, 258, 272, 260, 230, 272, 272, 213,
+ 8, -177, -45, 208, 166, 164, 176, 651, -177, -177,
+ 587, 631, -177, 254, -177, -177, -177, -177, 234, 194,
+ 638, 651, -177, 182, 227, 228, 280, -177, -177, -177,
+ 587, -177, -177, 273, 247, 587, 289, 214, -177, -177,
+ -177, 195, 587, 209, 261, -177, 524, 199, -177, 295,
+ 292, 215, 299, 279, -177, 304, -177, -177, -44, -177,
+ -8, -177, -177, -177, 305, -177, -177, -177, -177
+};
+
+/* YYPGOTO[NTERM-NUM]. */
+static const short int yypgoto[] =
+{
+ -177, -177, -62, -176, -40, -177, -177, -177, -177, -177,
+ -177, -177, 109, -166, 120, -177, -177, -69, -177, -177,
+ -177, -177, -34, -177, -177, 48, -177, 243, -177, -177,
+ -177, -177, -177, -177, -177, -177, 64, -177, -177, -177,
+ -177, -177, -177, -177, -177, -177, -177, 24, -177, -177,
+ -177, -177, -177, -177, -177, -177, -177, -177, -177, -177,
+ -12, 307, -177, 297, -177, -177, -177, 285, -177, -177
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If
+   positive, shift that token.  If negative, reduce the rule whose
+   number is the opposite.  If zero, do what YYDEFACT says.
+ If YYTABLE_NINF, syntax error. */
+#define YYTABLE_NINF -1
+static const unsigned short int yytable[] =
+{
+ 21, 140, 115, 117, 152, 121, 220, 264, 231, 181,
+ 194, 24, 27, 37, 35, 229, 93, 94, 95, 96,
+ 97, 98, 99, 135, 228, 100, 45, 15, 16, 17,
+ 18, 13, 19, 14, 145, 129, 181, 130, 335, 336,
+ 36, 144, 251, 249, 1, 250, 258, 4, 250, 118,
+ 119, 28, 171, 275, 5, 276, 170, 278, 6, 250,
+ 173, 294, 333, 295, 334, 8, 28, 11, 12, 195,
+ 232, 22, 24, 175, 176, 265, 7, 280, 34, 101,
+ 39, 40, 90, 91, 102, 103, 104, 105, 106, 92,
+ 107, 108, 109, 110, 188, 189, 111, 112, 177, 178,
+ 125, 179, 180, 181, 126, 127, 128, 210, 132, 133,
+ 45, 113, 134, 120, 179, 180, 181, 136, 114, 186,
+ 187, 188, 189, 137, 312, 138, 141, 147, 142, 316,
+ 190, 143, 196, 198, 146, 150, 151, 215, 216, 217,
+ 218, 219, 172, 221, 222, 223, 224, 225, 226, 227,
+ 192, 154, 230, 155, 174, 121, 156, 238, 140, 197,
+ 199, 200, 157, 158, 159, 160, 161, 140, 266, 162,
+ 163, 93, 94, 95, 96, 97, 98, 99, 164, 201,
+ 100, 165, 166, 183, 184, 185, 186, 187, 188, 189,
+ 167, 202, 204, 168, 214, 193, 183, 184, 185, 186,
+ 187, 188, 189, 205, 118, 119, 169, 212, 177, 178,
+ 277, 179, 180, 181, 281, 208, 211, 213, 140, 181,
+ 233, 236, 239, 242, 210, 243, 244, 290, 291, 247,
+ 252, 261, 262, 263, 101, 268, 269, 270, 273, 102,
+ 103, 104, 105, 106, 274, 107, 108, 109, 110, 279,
+ 140, 111, 112, 283, 140, 254, 285, 284, 293, 93,
+ 94, 95, 96, 97, 98, 99, 113, 286, 100, 287,
+ 296, 288, 297, 114, 298, 93, 94, 95, 96, 97,
+ 98, 99, 301, 299, 100, 302, 303, 306, 308, 311,
+ 313, 314, 317, 183, 184, 185, 186, 187, 188, 189,
+ 320, 327, 321, 318, 260, 324, 322, 325, 330, 329,
+ 209, 331, 332, 246, 338, 235, 153, 292, 38, 310,
+ 282, 23, 101, 29, 0, 0, 0, 102, 103, 104,
+ 105, 106, 0, 107, 108, 109, 110, 0, 101, 111,
+ 112, 41, 0, 102, 103, 104, 105, 106, 0, 107,
+ 108, 109, 110, 0, 113, 111, 112, 0, 0, 0,
+ 42, 114, 253, 254, 0, 43, 44, 45, 0, 0,
+ 113, 177, 178, 46, 179, 180, 181, 114, 0, 0,
+ 47, 0, 0, 48, 0, 49, 0, 0, 50, 0,
+ 182, 0, 0, 0, 0, 0, 0, 0, 0, 51,
+ 52, 53, 0, 0, 0, 41, 0, 0, 54, 0,
+ 0, 0, 0, 55, 56, 0, 0, 57, 58, 59,
+ 0, 0, 60, 139, 42, 0, 0, 0, 0, 43,
+ 44, 45, 0, 0, 0, 0, 0, 46, 0, 0,
+ 0, 61, 0, 0, 47, 0, 0, 48, 0, 49,
+ 0, 0, 50, 0, 0, 0, 183, 184, 185, 186,
+ 187, 188, 189, 51, 52, 53, 0, 0, 0, 41,
+ 0, 0, 54, 0, 0, 0, 0, 55, 56, 0,
+ 0, 57, 58, 59, 0, 0, 60, 259, 42, 0,
+ 0, 0, 0, 43, 44, 45, 0, 0, 0, 177,
+ 178, 46, 179, 180, 181, 61, 0, 0, 47, 0,
+ 0, 48, 0, 49, 0, 0, 50, 0, 0, 0,
+ 0, 191, 0, 0, 0, 0, 0, 51, 52, 53,
+ 0, 0, 0, 41, 0, 0, 54, 0, 0, 0,
+ 0, 55, 56, 0, 0, 57, 58, 59, 0, 0,
+ 60, 323, 42, 0, 0, 0, 0, 43, 44, 45,
+ 0, 0, 0, 0, 0, 46, 0, 0, 0, 61,
+ 0, 0, 47, 0, 0, 48, 0, 49, 0, 0,
+ 50, 0, 0, 0, 183, 184, 185, 186, 187, 188,
+ 189, 51, 52, 53, 177, 178, 41, 179, 180, 181,
+ 54, 0, 0, 0, 0, 55, 56, 0, 0, 57,
+ 58, 59, 0, 0, 60, 42, 0, 0, 0, 0,
+ 43, 44, 45, 0, 0, 0, 267, 0, 46, 0,
+ 0, 0, 0, 61, 0, 47, 0, 0, 48, 0,
+ 49, 177, 178, 50, 179, 180, 181, 0, 177, 178,
+ 0, 179, 180, 181, 51, 52, 53, 0, 0, 0,
+ 300, 177, 178, 54, 179, 180, 181, 0, 55, 56,
+ 305, 0, 57, 58, 59, 0, 0, 60, 0, 183,
+ 184, 185, 186, 187, 188, 189, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 61, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 183, 184, 185, 186,
+ 187, 188, 189, 183, 184, 185, 186, 187, 188, 189,
+ 0, 0, 0, 0, 0, 0, 183, 184, 185, 186,
+ 187, 188, 189
+};
+
+static const short int yycheck[] =
+{
+ 12, 63, 42, 43, 73, 45, 182, 40, 38, 15,
+ 49, 9, 24, 26, 65, 191, 3, 4, 5, 6,
+ 7, 8, 9, 57, 190, 12, 35, 19, 20, 21,
+ 22, 106, 24, 108, 68, 57, 15, 59, 46, 47,
+ 91, 50, 208, 106, 16, 108, 106, 9, 108, 36,
+ 37, 64, 92, 106, 0, 108, 90, 106, 104, 108,
+ 100, 106, 106, 108, 108, 9, 64, 17, 18, 108,
+ 100, 25, 9, 113, 114, 108, 105, 253, 104, 66,
+ 9, 9, 25, 104, 71, 72, 73, 74, 75, 63,
+ 77, 78, 79, 80, 100, 101, 83, 84, 10, 11,
+ 9, 13, 14, 15, 49, 9, 39, 147, 9, 9,
+ 35, 98, 9, 100, 13, 14, 15, 88, 105, 98,
+ 99, 100, 101, 88, 300, 9, 104, 40, 105, 305,
+ 105, 104, 39, 9, 104, 104, 104, 177, 178, 179,
+ 180, 181, 103, 183, 184, 185, 186, 187, 188, 189,
+ 105, 104, 192, 104, 103, 195, 104, 197, 220, 17,
+ 52, 9, 104, 104, 104, 104, 104, 229, 237, 104,
+ 104, 3, 4, 5, 6, 7, 8, 9, 104, 9,
+ 12, 104, 104, 95, 96, 97, 98, 99, 100, 101,
+ 104, 60, 49, 104, 106, 105, 95, 96, 97, 98,
+ 99, 100, 101, 105, 36, 37, 104, 70, 10, 11,
+ 250, 13, 14, 15, 254, 105, 104, 70, 280, 15,
+ 9, 9, 9, 105, 264, 58, 9, 267, 268, 107,
+ 55, 9, 106, 108, 66, 95, 108, 9, 9, 71,
+ 72, 73, 74, 75, 105, 77, 78, 79, 80, 9,
+ 312, 83, 84, 27, 316, 31, 106, 32, 270, 3,
+ 4, 5, 6, 7, 8, 9, 98, 9, 12, 9,
+ 62, 41, 106, 105, 110, 3, 4, 5, 6, 7,
+ 8, 9, 28, 107, 12, 51, 92, 105, 61, 9,
+ 17, 44, 3, 95, 96, 97, 98, 99, 100, 101,
+ 105, 9, 93, 89, 106, 106, 45, 12, 9, 94,
+ 54, 32, 8, 204, 9, 195, 73, 269, 33, 295,
+ 256, 14, 66, 26, -1, -1, -1, 71, 72, 73,
+ 74, 75, -1, 77, 78, 79, 80, -1, 66, 83,
+ 84, 9, -1, 71, 72, 73, 74, 75, -1, 77,
+ 78, 79, 80, -1, 98, 83, 84, -1, -1, -1,
+ 28, 105, 30, 31, -1, 33, 34, 35, -1, -1,
+ 98, 10, 11, 41, 13, 14, 15, 105, -1, -1,
+ 48, -1, -1, 51, -1, 53, -1, -1, 56, -1,
+ 29, -1, -1, -1, -1, -1, -1, -1, -1, 67,
+ 68, 69, -1, -1, -1, 9, -1, -1, 76, -1,
+ -1, -1, -1, 81, 82, -1, -1, 85, 86, 87,
+ -1, -1, 90, 27, 28, -1, -1, -1, -1, 33,
+ 34, 35, -1, -1, -1, -1, -1, 41, -1, -1,
+ -1, 109, -1, -1, 48, -1, -1, 51, -1, 53,
+ -1, -1, 56, -1, -1, -1, 95, 96, 97, 98,
+ 99, 100, 101, 67, 68, 69, -1, -1, -1, 9,
+ -1, -1, 76, -1, -1, -1, -1, 81, 82, -1,
+ -1, 85, 86, 87, -1, -1, 90, 27, 28, -1,
+ -1, -1, -1, 33, 34, 35, -1, -1, -1, 10,
+ 11, 41, 13, 14, 15, 109, -1, -1, 48, -1,
+ -1, 51, -1, 53, -1, -1, 56, -1, -1, -1,
+ -1, 32, -1, -1, -1, -1, -1, 67, 68, 69,
+ -1, -1, -1, 9, -1, -1, 76, -1, -1, -1,
+ -1, 81, 82, -1, -1, 85, 86, 87, -1, -1,
+ 90, 27, 28, -1, -1, -1, -1, 33, 34, 35,
+ -1, -1, -1, -1, -1, 41, -1, -1, -1, 109,
+ -1, -1, 48, -1, -1, 51, -1, 53, -1, -1,
+ 56, -1, -1, -1, 95, 96, 97, 98, 99, 100,
+ 101, 67, 68, 69, 10, 11, 9, 13, 14, 15,
+ 76, -1, -1, -1, -1, 81, 82, -1, -1, 85,
+ 86, 87, -1, -1, 90, 28, -1, -1, -1, -1,
+ 33, 34, 35, -1, -1, -1, 42, -1, 41, -1,
+ -1, -1, -1, 109, -1, 48, -1, -1, 51, -1,
+ 53, 10, 11, 56, 13, 14, 15, -1, 10, 11,
+ -1, 13, 14, 15, 67, 68, 69, -1, -1, -1,
+ 29, 10, 11, 76, 13, 14, 15, -1, 81, 82,
+ 32, -1, 85, 86, 87, -1, -1, 90, -1, 95,
+ 96, 97, 98, 99, 100, 101, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, 109, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, 95, 96, 97, 98,
+ 99, 100, 101, 95, 96, 97, 98, 99, 100, 101,
+ -1, -1, -1, -1, -1, -1, 95, 96, 97, 98,
+ 99, 100, 101
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+ symbol of state STATE-NUM. */
+static const unsigned char yystos[] =
+{
+ 0, 16, 112, 180, 9, 0, 104, 105, 9, 172,
+ 173, 17, 18, 106, 108, 19, 20, 21, 22, 24,
+ 171, 171, 25, 172, 9, 174, 175, 171, 64, 174,
+ 176, 177, 178, 179, 104, 65, 91, 26, 178, 9,
+ 9, 9, 28, 33, 34, 35, 41, 48, 51, 53,
+ 56, 67, 68, 69, 76, 81, 82, 85, 86, 87,
+ 90, 109, 113, 114, 118, 119, 120, 133, 134, 135,
+ 139, 140, 141, 142, 143, 144, 145, 146, 150, 151,
+ 152, 153, 154, 155, 156, 157, 164, 168, 169, 170,
+ 25, 104, 63, 3, 4, 5, 6, 7, 8, 9,
+ 12, 66, 71, 72, 73, 74, 75, 77, 78, 79,
+ 80, 83, 84, 98, 105, 115, 116, 115, 36, 37,
+ 100, 115, 125, 126, 127, 9, 49, 9, 39, 57,
+ 59, 166, 9, 9, 9, 133, 88, 88, 9, 27,
+ 113, 104, 105, 104, 50, 133, 104, 40, 128, 138,
+ 104, 104, 128, 138, 104, 104, 104, 104, 104, 104,
+ 104, 104, 104, 104, 104, 104, 104, 104, 104, 104,
+ 133, 115, 103, 115, 103, 115, 115, 10, 11, 13,
+ 14, 15, 29, 95, 96, 97, 98, 99, 100, 101,
+ 105, 32, 105, 105, 49, 108, 39, 17, 9, 52,
+ 9, 9, 60, 167, 49, 105, 115, 124, 105, 54,
+ 115, 104, 70, 70, 106, 115, 115, 115, 115, 115,
+ 114, 115, 115, 115, 115, 115, 115, 115, 124, 114,
+ 115, 38, 100, 9, 123, 125, 9, 122, 115, 9,
+ 136, 137, 105, 58, 9, 121, 123, 107, 117, 106,
+ 108, 124, 55, 30, 31, 147, 148, 149, 106, 27,
+ 106, 9, 106, 108, 40, 108, 128, 42, 95, 108,
+ 9, 158, 159, 9, 105, 106, 108, 115, 106, 9,
+ 114, 115, 147, 27, 32, 106, 9, 9, 41, 129,
+ 115, 115, 136, 171, 106, 108, 62, 106, 110, 107,
+ 29, 28, 51, 92, 130, 32, 105, 160, 61, 163,
+ 158, 9, 114, 17, 44, 132, 114, 3, 89, 161,
+ 105, 93, 45, 27, 106, 12, 162, 9, 165, 94,
+ 9, 32, 8, 106, 108, 46, 47, 131, 9
+};
+
+#if ! defined (YYSIZE_T) && defined (__SIZE_TYPE__)
+# define YYSIZE_T __SIZE_TYPE__
+#endif
+#if ! defined (YYSIZE_T) && defined (size_t)
+# define YYSIZE_T size_t
+#endif
+#if ! defined (YYSIZE_T)
+# if defined (__STDC__) || defined (__cplusplus)
+# include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+# define YYSIZE_T size_t
+# endif
+#endif
+#if ! defined (YYSIZE_T)
+# define YYSIZE_T unsigned int
+#endif
+
+#define yyerrok (yyerrstatus = 0)
+#define yyclearin (yychar = YYEMPTY)
+#define YYEMPTY (-2)
+#define YYEOF 0
+
+#define YYACCEPT goto yyacceptlab
+#define YYABORT goto yyabortlab
+#define YYERROR goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror. This remains here temporarily
+ to ease the transition to the new meaning of YYERROR, for GCC.
+ Once GCC version 2 has supplanted version 1, this can go. */
+
+#define YYFAIL goto yyerrlab
+
+#define YYRECOVERING() (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value) \
+do \
+ if (yychar == YYEMPTY && yylen == 1) \
+ { \
+ yychar = (Token); \
+ yylval = (Value); \
+ yytoken = YYTRANSLATE (yychar); \
+ YYPOPSTACK; \
+ goto yybackup; \
+ } \
+ else \
+ { \
+ yyerror ("syntax error: cannot back up");\
+ YYERROR; \
+ } \
+while (0)
+
+
+#define YYTERROR 1
+#define YYERRCODE 256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+ If N is 0, then set CURRENT to the empty location which ends
+ the previous symbol: RHS[0] (always defined). */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N) \
+ do \
+ if (N) \
+ { \
+ (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \
+ (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \
+ (Current).last_line = YYRHSLOC (Rhs, N).last_line; \
+ (Current).last_column = YYRHSLOC (Rhs, N).last_column; \
+ } \
+ else \
+ { \
+ (Current).first_line = (Current).last_line = \
+ YYRHSLOC (Rhs, 0).last_line; \
+ (Current).first_column = (Current).last_column = \
+ YYRHSLOC (Rhs, 0).last_column; \
+ } \
+ while (0)
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define it only when we know
+   it will not break user code, i.e. when these are the locations we
+   know about.  */
+
+#ifndef YY_LOCATION_PRINT
+# if YYLTYPE_IS_TRIVIAL
+# define YY_LOCATION_PRINT(File, Loc) \
+ fprintf (File, "%d.%d-%d.%d", \
+ (Loc).first_line, (Loc).first_column, \
+ (Loc).last_line, (Loc).last_column)
+# else
+# define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments. */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (YYLEX_PARAM)
+#else
+# define YYLEX yylex ()
+#endif
+
+/* Enable debugging if requested. */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+# include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+# define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args) \
+do { \
+ if (yydebug) \
+ YYFPRINTF Args; \
+} while (0)
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \
+do { \
+ if (yydebug) \
+ { \
+ YYFPRINTF (stderr, "%s ", Title); \
+ yysymprint (stderr, \
+ Type, Value); \
+ YYFPRINTF (stderr, "\n"); \
+ } \
+} while (0)
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included). |
+`------------------------------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yy_stack_print (short int *bottom, short int *top)
+#else
+static void
+yy_stack_print (bottom, top)
+ short int *bottom;
+ short int *top;
+#endif
+{
+ YYFPRINTF (stderr, "Stack now");
+ for (/* Nothing. */; bottom <= top; ++bottom)
+ YYFPRINTF (stderr, " %d", *bottom);
+ YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top) \
+do { \
+ if (yydebug) \
+ yy_stack_print ((Bottom), (Top)); \
+} while (0)
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced. |
+`------------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yy_reduce_print (int yyrule)
+#else
+static void
+yy_reduce_print (yyrule)
+ int yyrule;
+#endif
+{
+ int yyi;
+ unsigned int yylno = yyrline[yyrule];
+ YYFPRINTF (stderr, "Reducing stack by rule %d (line %u), ",
+ yyrule - 1, yylno);
+ /* Print the symbols being reduced, and their result. */
+ for (yyi = yyprhs[yyrule]; 0 <= yyrhs[yyi]; yyi++)
+ YYFPRINTF (stderr, "%s ", yytname [yyrhs[yyi]]);
+ YYFPRINTF (stderr, "-> %s\n", yytname [yyr1[yyrule]]);
+}
+
+# define YY_REDUCE_PRINT(Rule) \
+do { \
+ if (yydebug) \
+ yy_reduce_print (Rule); \
+} while (0)
+
+/* Nonzero means print parse trace. It is left uninitialized so that
+ multiple parsers can coexist. */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks. */
+#ifndef YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+ if the built-in stack extension method is used).
+
+ Do not make this value too large; the results are undefined if
+ SIZE_MAX < YYSTACK_BYTES (YYMAXDEPTH)
+ evaluated with infinite-precision integer arithmetic. */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+# if defined (__GLIBC__) && defined (_STRING_H)
+# define yystrlen strlen
+# else
+/* Return the length of YYSTR. */
+static YYSIZE_T
+# if defined (__STDC__) || defined (__cplusplus)
+yystrlen (const char *yystr)
+# else
+yystrlen (yystr)
+ const char *yystr;
+# endif
+{
+ register const char *yys = yystr;
+
+ while (*yys++ != '\0')
+ continue;
+
+ return yys - yystr - 1;
+}
+# endif
+# endif
+
+# ifndef yystpcpy
+# if defined (__GLIBC__) && defined (_STRING_H) && defined (_GNU_SOURCE)
+# define yystpcpy stpcpy
+# else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+ YYDEST. */
+static char *
+# if defined (__STDC__) || defined (__cplusplus)
+yystpcpy (char *yydest, const char *yysrc)
+# else
+yystpcpy (yydest, yysrc)
+ char *yydest;
+ const char *yysrc;
+# endif
+{
+ register char *yyd = yydest;
+ register const char *yys = yysrc;
+
+ while ((*yyd++ = *yys++) != '\0')
+ continue;
+
+ return yyd - 1;
+}
+# endif
+# endif
+
+#endif /* !YYERROR_VERBOSE */
+
+
+
+#if YYDEBUG
+/*--------------------------------.
+| Print this symbol on YYOUTPUT. |
+`--------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yysymprint (FILE *yyoutput, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yysymprint (yyoutput, yytype, yyvaluep)
+ FILE *yyoutput;
+ int yytype;
+ YYSTYPE *yyvaluep;
+#endif
+{
+ /* Pacify ``unused variable'' warnings. */
+ (void) yyvaluep;
+
+ if (yytype < YYNTOKENS)
+ YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+ else
+ YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+
+# ifdef YYPRINT
+ if (yytype < YYNTOKENS)
+ YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# endif
+ switch (yytype)
+ {
+ default:
+ break;
+ }
+ YYFPRINTF (yyoutput, ")");
+}
+
+#endif /* ! YYDEBUG */
+/*-----------------------------------------------.
+| Release the memory associated to this symbol. |
+`-----------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+ const char *yymsg;
+ int yytype;
+ YYSTYPE *yyvaluep;
+#endif
+{
+ /* Pacify ``unused variable'' warnings. */
+ (void) yyvaluep;
+
+ if (!yymsg)
+ yymsg = "Deleting";
+ YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+ switch (yytype)
+ {
+
+ default:
+ break;
+ }
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes. */
+
+#ifdef YYPARSE_PARAM
+# if defined (__STDC__) || defined (__cplusplus)
+UNIV_INTERN int yyparse (void *YYPARSE_PARAM);
+# else
+UNIV_INTERN int yyparse ();
+# endif
+#else /* ! YYPARSE_PARAM */
+#if defined (__STDC__) || defined (__cplusplus)
+UNIV_INTERN int yyparse (void);
+#else
+UNIV_INTERN int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+/* The look-ahead symbol. */
+static int yychar;
+
+/* The semantic value of the look-ahead symbol. */
+UNIV_INTERN YYSTYPE yylval;
+
+/* Number of syntax errors so far. */
+static int yynerrs;
+
+
+
+/*----------.
+| yyparse. |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+# if defined (__STDC__) || defined (__cplusplus)
+UNIV_INTERN int yyparse (void *YYPARSE_PARAM)
+# else
+UNIV_INTERN int yyparse (YYPARSE_PARAM)
+ void *YYPARSE_PARAM;
+# endif
+#else /* ! YYPARSE_PARAM */
+#if defined (__STDC__) || defined (__cplusplus)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+
+ register int yystate;
+ register int yyn;
+ int yyresult;
+ /* Number of tokens to shift before error messages enabled. */
+ int yyerrstatus;
+ /* Look-ahead token as an internal (translated) token number. */
+ int yytoken = 0;
+
+ /* Three stacks and their tools:
+ `yyss': related to states,
+ `yyvs': related to semantic values,
+ `yyls': related to locations.
+
+ Refer to the stacks thru separate pointers, to allow yyoverflow
+ to reallocate them elsewhere. */
+
+ /* The state stack. */
+ short int yyssa[YYINITDEPTH];
+ short int *yyss = yyssa;
+ register short int *yyssp;
+
+ /* The semantic value stack. */
+ YYSTYPE yyvsa[YYINITDEPTH];
+ YYSTYPE *yyvs = yyvsa;
+ register YYSTYPE *yyvsp;
+
+
+
+#define YYPOPSTACK (yyvsp--, yyssp--)
+
+ YYSIZE_T yystacksize = YYINITDEPTH;
+
+ /* The variables used to return semantic value and location from the
+ action routines. */
+ YYSTYPE yyval;
+
+
+ /* When reducing, the number of symbols on the RHS of the reduced
+ rule. */
+ int yylen;
+
+ YYDPRINTF ((stderr, "Starting parse\n"));
+
+ yystate = 0;
+ yyerrstatus = 0;
+ yynerrs = 0;
+ yychar = YYEMPTY; /* Cause a token to be read. */
+
+ /* Initialize stack pointers.
+ Waste one element of value and location stack
+ so that they stay on the same level as the state stack.
+ The wasted elements are never initialized. */
+
+ yyssp = yyss;
+ yyvsp = yyvs;
+
+
+ yyvsp[0] = yylval;
+
+ goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate. |
+`------------------------------------------------------------*/
+ yynewstate:
+ /* In all cases, when you get here, the value and location stacks
+ have just been pushed. so pushing a state here evens the stacks.
+ */
+ yyssp++;
+
+ yysetstate:
+ *yyssp = yystate;
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ {
+ /* Get the current used size of the three stacks, in elements. */
+ YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+ {
+ /* Give user a chance to reallocate the stack. Use copies of
+ these so that the &'s don't force the real ones into
+ memory. */
+ YYSTYPE *yyvs1 = yyvs;
+ short int *yyss1 = yyss;
+
+
+ /* Each stack pointer address is followed by the size of the
+ data in use in that stack, in bytes. This used to be a
+ conditional around just the two extra args, but that might
+ be undefined if yyoverflow is a macro. */
+ yyoverflow ("parser stack overflow",
+ &yyss1, yysize * sizeof (*yyssp),
+ &yyvs1, yysize * sizeof (*yyvsp),
+
+ &yystacksize);
+
+ yyss = yyss1;
+ yyvs = yyvs1;
+ }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+ goto yyoverflowlab;
+# else
+ /* Extend the stack our own way. */
+ if (YYMAXDEPTH <= yystacksize)
+ goto yyoverflowlab;
+ yystacksize *= 2;
+ if (YYMAXDEPTH < yystacksize)
+ yystacksize = YYMAXDEPTH;
+
+ {
+ short int *yyss1 = yyss;
+ union yyalloc *yyptr =
+ (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+ if (! yyptr)
+ goto yyoverflowlab;
+ YYSTACK_RELOCATE (yyss);
+ YYSTACK_RELOCATE (yyvs);
+
+# undef YYSTACK_RELOCATE
+ if (yyss1 != yyssa)
+ YYSTACK_FREE (yyss1);
+ }
+# endif
+#endif /* no yyoverflow */
+
+ yyssp = yyss + yysize - 1;
+ yyvsp = yyvs + yysize - 1;
+
+
+ YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+ (unsigned long int) yystacksize));
+
+ if (yyss + yystacksize - 1 <= yyssp)
+ YYABORT;
+ }
+
+ YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+ goto yybackup;
+
+/*-----------.
+| yybackup. |
+`-----------*/
+yybackup:
+
+/* Do appropriate processing given the current state. */
+/* Read a look-ahead token if we need one and don't already have one. */
+/* yyresume: */
+
+ /* First try to decide what to do without reference to look-ahead token. */
+
+ yyn = yypact[yystate];
+ if (yyn == YYPACT_NINF)
+ goto yydefault;
+
+ /* Not known => get a look-ahead token if don't already have one. */
+
+ /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol. */
+ if (yychar == YYEMPTY)
+ {
+ YYDPRINTF ((stderr, "Reading a token: "));
+ yychar = YYLEX;
+ }
+
+ if (yychar <= YYEOF)
+ {
+ yychar = yytoken = YYEOF;
+ YYDPRINTF ((stderr, "Now at end of input.\n"));
+ }
+ else
+ {
+ yytoken = YYTRANSLATE (yychar);
+ YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+ }
+
+ /* If the proper action on seeing token YYTOKEN is to reduce or to
+ detect an error, take that action. */
+ yyn += yytoken;
+ if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+ goto yydefault;
+ yyn = yytable[yyn];
+ if (yyn <= 0)
+ {
+ if (yyn == 0 || yyn == YYTABLE_NINF)
+ goto yyerrlab;
+ yyn = -yyn;
+ goto yyreduce;
+ }
+
+ if (yyn == YYFINAL)
+ YYACCEPT;
+
+ /* Shift the look-ahead token. */
+ YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+ /* Discard the token being shifted unless it is eof. */
+ if (yychar != YYEOF)
+ yychar = YYEMPTY;
+
+ *++yyvsp = yylval;
+
+
+ /* Count tokens shifted since error; after three, turn off error
+ status. */
+ if (yyerrstatus)
+ yyerrstatus--;
+
+ yystate = yyn;
+ goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state. |
+`-----------------------------------------------------------*/
+yydefault:
+ yyn = yydefact[yystate];
+ if (yyn == 0)
+ goto yyerrlab;
+ goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction. |
+`-----------------------------*/
+yyreduce:
+ /* yyn is the number of a rule to reduce with. */
+ yylen = yyr2[yyn];
+
+ /* If YYLEN is nonzero, implement the default value of the action:
+ `$$ = $1'.
+
+ Otherwise, the following line sets YYVAL to garbage.
+ This behavior is undocumented and Bison
+ users should not rely upon it. Assigning to YYVAL
+ unconditionally makes the parser a bit smaller, and it avoids a
+ GCC warning that YYVAL may be used uninitialized. */
+ yyval = yyvsp[1-yylen];
+
+
+ YY_REDUCE_PRINT (yyn);
+ switch (yyn)
+ {
+ case 25:
+#line 166 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 26:
+#line 168 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
+ break;
+
+ case 27:
+#line 172 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 28:
+#line 174 "pars0grm.y"
+ { (yyval) = pars_func((yyvsp[-3]), (yyvsp[-1])); ;}
+ break;
+
+ case 29:
+#line 175 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 30:
+#line 176 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 31:
+#line 177 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 32:
+#line 178 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 33:
+#line 179 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 34:
+#line 180 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 35:
+#line 181 "pars0grm.y"
+ { (yyval) = (yyvsp[0]);;}
+ break;
+
+ case 36:
+#line 182 "pars0grm.y"
+ { (yyval) = pars_op('+', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 37:
+#line 183 "pars0grm.y"
+ { (yyval) = pars_op('-', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 38:
+#line 184 "pars0grm.y"
+ { (yyval) = pars_op('*', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 39:
+#line 185 "pars0grm.y"
+ { (yyval) = pars_op('/', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 40:
+#line 186 "pars0grm.y"
+ { (yyval) = pars_op('-', (yyvsp[0]), NULL); ;}
+ break;
+
+ case 41:
+#line 187 "pars0grm.y"
+ { (yyval) = (yyvsp[-1]); ;}
+ break;
+
+ case 42:
+#line 188 "pars0grm.y"
+ { (yyval) = pars_op('=', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 43:
+#line 189 "pars0grm.y"
+ { (yyval) = pars_op('<', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 44:
+#line 190 "pars0grm.y"
+ { (yyval) = pars_op('>', (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 45:
+#line 191 "pars0grm.y"
+ { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 46:
+#line 192 "pars0grm.y"
+ { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 47:
+#line 193 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 48:
+#line 194 "pars0grm.y"
+ { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 49:
+#line 195 "pars0grm.y"
+ { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 50:
+#line 196 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[0]), NULL); ;}
+ break;
+
+ case 51:
+#line 198 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
+ break;
+
+ case 52:
+#line 200 "pars0grm.y"
+ { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
+ break;
+
+ case 53:
+#line 204 "pars0grm.y"
+ { (yyval) = &pars_to_char_token; ;}
+ break;
+
+ case 54:
+#line 205 "pars0grm.y"
+ { (yyval) = &pars_to_number_token; ;}
+ break;
+
+ case 55:
+#line 206 "pars0grm.y"
+ { (yyval) = &pars_to_binary_token; ;}
+ break;
+
+ case 56:
+#line 208 "pars0grm.y"
+ { (yyval) = &pars_binary_to_number_token; ;}
+ break;
+
+ case 57:
+#line 209 "pars0grm.y"
+ { (yyval) = &pars_substr_token; ;}
+ break;
+
+ case 58:
+#line 210 "pars0grm.y"
+ { (yyval) = &pars_concat_token; ;}
+ break;
+
+ case 59:
+#line 211 "pars0grm.y"
+ { (yyval) = &pars_instr_token; ;}
+ break;
+
+ case 60:
+#line 212 "pars0grm.y"
+ { (yyval) = &pars_length_token; ;}
+ break;
+
+ case 61:
+#line 213 "pars0grm.y"
+ { (yyval) = &pars_sysdate_token; ;}
+ break;
+
+ case 62:
+#line 214 "pars0grm.y"
+ { (yyval) = &pars_rnd_token; ;}
+ break;
+
+ case 63:
+#line 215 "pars0grm.y"
+ { (yyval) = &pars_rnd_str_token; ;}
+ break;
+
+ case 67:
+#line 226 "pars0grm.y"
+ { (yyval) = pars_stored_procedure_call((yyvsp[-4])); ;}
+ break;
+
+ case 68:
+#line 231 "pars0grm.y"
+ { (yyval) = pars_procedure_call((yyvsp[-3]), (yyvsp[-1])); ;}
+ break;
+
+ case 69:
+#line 235 "pars0grm.y"
+ { (yyval) = &pars_replstr_token; ;}
+ break;
+
+ case 70:
+#line 236 "pars0grm.y"
+ { (yyval) = &pars_printf_token; ;}
+ break;
+
+ case 71:
+#line 237 "pars0grm.y"
+ { (yyval) = &pars_assert_token; ;}
+ break;
+
+ case 72:
+#line 241 "pars0grm.y"
+ { (yyval) = (yyvsp[-2]); ;}
+ break;
+
+ case 73:
+#line 245 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 74:
+#line 247 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 75:
+#line 251 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 76:
+#line 252 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 77:
+#line 254 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 78:
+#line 258 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 79:
+#line 259 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0]));;}
+ break;
+
+ case 80:
+#line 260 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 81:
+#line 264 "pars0grm.y"
+ { (yyval) = (yyvsp[0]); ;}
+ break;
+
+ case 82:
+#line 266 "pars0grm.y"
+ { (yyval) = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ sym_tab_add_int_lit(
+ pars_sym_tab_global, 1))); ;}
+ break;
+
+ case 83:
+#line 271 "pars0grm.y"
+ { (yyval) = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ pars_func(&pars_distinct_token,
+ que_node_list_add_last(
+ NULL, (yyvsp[-1]))))); ;}
+ break;
+
+ case 84:
+#line 277 "pars0grm.y"
+ { (yyval) = pars_func(&pars_sum_token,
+ que_node_list_add_last(NULL,
+ (yyvsp[-1]))); ;}
+ break;
+
+ case 85:
+#line 283 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 86:
+#line 284 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 87:
+#line 286 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 88:
+#line 290 "pars0grm.y"
+ { (yyval) = pars_select_list(&pars_star_denoter,
+ NULL); ;}
+ break;
+
+ case 89:
+#line 293 "pars0grm.y"
+ { (yyval) = pars_select_list((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 90:
+#line 294 "pars0grm.y"
+ { (yyval) = pars_select_list((yyvsp[0]), NULL); ;}
+ break;
+
+ case 91:
+#line 298 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 92:
+#line 299 "pars0grm.y"
+ { (yyval) = (yyvsp[0]); ;}
+ break;
+
+ case 93:
+#line 303 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 94:
+#line 305 "pars0grm.y"
+ { (yyval) = &pars_update_token; ;}
+ break;
+
+ case 95:
+#line 309 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 96:
+#line 311 "pars0grm.y"
+ { (yyval) = &pars_share_token; ;}
+ break;
+
+ case 97:
+#line 315 "pars0grm.y"
+ { (yyval) = &pars_asc_token; ;}
+ break;
+
+ case 98:
+#line 316 "pars0grm.y"
+ { (yyval) = &pars_asc_token; ;}
+ break;
+
+ case 99:
+#line 317 "pars0grm.y"
+ { (yyval) = &pars_desc_token; ;}
+ break;
+
+ case 100:
+#line 321 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 101:
+#line 323 "pars0grm.y"
+ { (yyval) = pars_order_by((yyvsp[-1]), (yyvsp[0])); ;}
+ break;
+
+ case 102:
+#line 332 "pars0grm.y"
+ { (yyval) = pars_select_statement((yyvsp[-6]), (yyvsp[-4]), (yyvsp[-3]),
+ (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
+ break;
+
+ case 103:
+#line 338 "pars0grm.y"
+ { (yyval) = (yyvsp[0]); ;}
+ break;
+
+ case 104:
+#line 343 "pars0grm.y"
+ { (yyval) = pars_insert_statement((yyvsp[-4]), (yyvsp[-1]), NULL); ;}
+ break;
+
+ case 105:
+#line 345 "pars0grm.y"
+ { (yyval) = pars_insert_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
+ break;
+
+ case 106:
+#line 349 "pars0grm.y"
+ { (yyval) = pars_column_assignment((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 107:
+#line 353 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 108:
+#line 355 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 109:
+#line 361 "pars0grm.y"
+ { (yyval) = (yyvsp[0]); ;}
+ break;
+
+ case 110:
+#line 367 "pars0grm.y"
+ { (yyval) = pars_update_statement_start(FALSE,
+ (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 111:
+#line 373 "pars0grm.y"
+ { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
+ break;
+
+ case 112:
+#line 378 "pars0grm.y"
+ { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
+ break;
+
+ case 113:
+#line 383 "pars0grm.y"
+ { (yyval) = pars_update_statement_start(TRUE,
+ (yyvsp[0]), NULL); ;}
+ break;
+
+ case 114:
+#line 389 "pars0grm.y"
+ { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
+ break;
+
+ case 115:
+#line 394 "pars0grm.y"
+ { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
+ break;
+
+ case 116:
+#line 399 "pars0grm.y"
+ { (yyval) = pars_row_printf_statement((yyvsp[0])); ;}
+ break;
+
+ case 117:
+#line 404 "pars0grm.y"
+ { (yyval) = pars_assignment_statement((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 118:
+#line 410 "pars0grm.y"
+ { (yyval) = pars_elsif_element((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 119:
+#line 414 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 120:
+#line 416 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
+ break;
+
+ case 121:
+#line 420 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 122:
+#line 422 "pars0grm.y"
+ { (yyval) = (yyvsp[0]); ;}
+ break;
+
+ case 123:
+#line 423 "pars0grm.y"
+ { (yyval) = (yyvsp[0]); ;}
+ break;
+
+ case 124:
+#line 430 "pars0grm.y"
+ { (yyval) = pars_if_statement((yyvsp[-5]), (yyvsp[-3]), (yyvsp[-2])); ;}
+ break;
+
+ case 125:
+#line 436 "pars0grm.y"
+ { (yyval) = pars_while_statement((yyvsp[-4]), (yyvsp[-2])); ;}
+ break;
+
+ case 126:
+#line 444 "pars0grm.y"
+ { (yyval) = pars_for_statement((yyvsp[-8]), (yyvsp[-6]), (yyvsp[-4]), (yyvsp[-2])); ;}
+ break;
+
+ case 127:
+#line 448 "pars0grm.y"
+ { (yyval) = pars_exit_statement(); ;}
+ break;
+
+ case 128:
+#line 452 "pars0grm.y"
+ { (yyval) = pars_return_statement(); ;}
+ break;
+
+ case 129:
+#line 457 "pars0grm.y"
+ { (yyval) = pars_open_statement(
+ ROW_SEL_OPEN_CURSOR, (yyvsp[0])); ;}
+ break;
+
+ case 130:
+#line 463 "pars0grm.y"
+ { (yyval) = pars_open_statement(
+ ROW_SEL_CLOSE_CURSOR, (yyvsp[0])); ;}
+ break;
+
+ case 131:
+#line 469 "pars0grm.y"
+ { (yyval) = pars_fetch_statement((yyvsp[-2]), (yyvsp[0]), NULL); ;}
+ break;
+
+ case 132:
+#line 471 "pars0grm.y"
+ { (yyval) = pars_fetch_statement((yyvsp[-2]), NULL, (yyvsp[0])); ;}
+ break;
+
+ case 133:
+#line 476 "pars0grm.y"
+ { (yyval) = pars_column_def((yyvsp[-4]), (yyvsp[-3]), (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
+ break;
+
+ case 134:
+#line 480 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 135:
+#line 482 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 136:
+#line 486 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 137:
+#line 488 "pars0grm.y"
+ { (yyval) = (yyvsp[-1]); ;}
+ break;
+
+ case 138:
+#line 492 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 139:
+#line 494 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 140:
+#line 499 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 141:
+#line 501 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 142:
+#line 506 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 143:
+#line 508 "pars0grm.y"
+ { (yyval) = &pars_int_token;
+ /* pass any non-NULL pointer */ ;}
+ break;
+
+ case 144:
+#line 515 "pars0grm.y"
+ { (yyval) = pars_create_table((yyvsp[-4]), (yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 145:
+#line 519 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 146:
+#line 521 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 147:
+#line 525 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 148:
+#line 526 "pars0grm.y"
+ { (yyval) = &pars_unique_token; ;}
+ break;
+
+ case 149:
+#line 530 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 150:
+#line 531 "pars0grm.y"
+ { (yyval) = &pars_clustered_token; ;}
+ break;
+
+ case 151:
+#line 539 "pars0grm.y"
+ { (yyval) = pars_create_index((yyvsp[-8]), (yyvsp[-7]), (yyvsp[-5]), (yyvsp[-3]), (yyvsp[-1])); ;}
+ break;
+
+ case 152:
+#line 544 "pars0grm.y"
+ { (yyval) = pars_commit_statement(); ;}
+ break;
+
+ case 153:
+#line 549 "pars0grm.y"
+ { (yyval) = pars_rollback_statement(); ;}
+ break;
+
+ case 154:
+#line 553 "pars0grm.y"
+ { (yyval) = &pars_int_token; ;}
+ break;
+
+ case 155:
+#line 554 "pars0grm.y"
+ { (yyval) = &pars_int_token; ;}
+ break;
+
+ case 156:
+#line 555 "pars0grm.y"
+ { (yyval) = &pars_char_token; ;}
+ break;
+
+ case 157:
+#line 556 "pars0grm.y"
+ { (yyval) = &pars_binary_token; ;}
+ break;
+
+ case 158:
+#line 557 "pars0grm.y"
+ { (yyval) = &pars_blob_token; ;}
+ break;
+
+ case 159:
+#line 562 "pars0grm.y"
+ { (yyval) = pars_parameter_declaration((yyvsp[-2]),
+ PARS_INPUT, (yyvsp[0])); ;}
+ break;
+
+ case 160:
+#line 565 "pars0grm.y"
+ { (yyval) = pars_parameter_declaration((yyvsp[-2]),
+ PARS_OUTPUT, (yyvsp[0])); ;}
+ break;
+
+ case 161:
+#line 570 "pars0grm.y"
+ { (yyval) = NULL; ;}
+ break;
+
+ case 162:
+#line 571 "pars0grm.y"
+ { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+ break;
+
+ case 163:
+#line 573 "pars0grm.y"
+ { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+ break;
+
+ case 164:
+#line 578 "pars0grm.y"
+ { (yyval) = pars_variable_declaration((yyvsp[-2]), (yyvsp[-1])); ;}
+ break;
+
+ case 168:
+#line 590 "pars0grm.y"
+ { (yyval) = pars_cursor_declaration((yyvsp[-3]), (yyvsp[-1])); ;}
+ break;
+
+ case 169:
+#line 595 "pars0grm.y"
+ { (yyval) = pars_function_declaration((yyvsp[-1])); ;}
+ break;
+
+ case 175:
+#line 616 "pars0grm.y"
+ { (yyval) = pars_procedure_definition((yyvsp[-9]), (yyvsp[-7]),
+ (yyvsp[-1])); ;}
+ break;
+
+
+ }
+
+/* Line 1010 of yacc.c. */
+#line 2345 "pars0grm.c"
+
+ yyvsp -= yylen;
+ yyssp -= yylen;
+
+
+ YY_STACK_PRINT (yyss, yyssp);
+
+ *++yyvsp = yyval;
+
+
+ /* Now `shift' the result of the reduction. Determine what state
+ that goes to, based on the state we popped back to and the rule
+ number reduced by. */
+
+ yyn = yyr1[yyn];
+
+ yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+ if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+ yystate = yytable[yystate];
+ else
+ yystate = yydefgoto[yyn - YYNTOKENS];
+
+ goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+ /* If not already recovering from an error, report this error. */
+ if (!yyerrstatus)
+ {
+ ++yynerrs;
+#if YYERROR_VERBOSE
+ yyn = yypact[yystate];
+
+ if (YYPACT_NINF < yyn && yyn < YYLAST)
+ {
+ YYSIZE_T yysize = 0;
+ int yytype = YYTRANSLATE (yychar);
+ const char* yyprefix;
+ char *yymsg;
+ int yyx;
+
+ /* Start YYX at -YYN if negative to avoid negative indexes in
+ YYCHECK. */
+ int yyxbegin = yyn < 0 ? -yyn : 0;
+
+ /* Stay within bounds of both yycheck and yytname. */
+ int yychecklim = YYLAST - yyn;
+ int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+ int yycount = 0;
+
+ yyprefix = ", expecting ";
+ for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+ if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+ {
+ yysize += yystrlen (yyprefix) + yystrlen (yytname [yyx]);
+ yycount += 1;
+ if (yycount == 5)
+ {
+ yysize = 0;
+ break;
+ }
+ }
+ yysize += (sizeof ("syntax error, unexpected ")
+ + yystrlen (yytname[yytype]));
+ yymsg = (char *) YYSTACK_ALLOC (yysize);
+ if (yymsg != 0)
+ {
+ char *yyp = yystpcpy (yymsg, "syntax error, unexpected ");
+ yyp = yystpcpy (yyp, yytname[yytype]);
+
+ if (yycount < 5)
+ {
+ yyprefix = ", expecting ";
+ for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+ if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+ {
+ yyp = yystpcpy (yyp, yyprefix);
+ yyp = yystpcpy (yyp, yytname[yyx]);
+ yyprefix = " or ";
+ }
+ }
+ yyerror (yymsg);
+ YYSTACK_FREE (yymsg);
+ }
+ else
+ yyerror ("syntax error; also virtual memory exhausted");
+ }
+ else
+#endif /* YYERROR_VERBOSE */
+ yyerror ("syntax error");
+ }
+
+
+
+ if (yyerrstatus == 3)
+ {
+ /* If just tried and failed to reuse look-ahead token after an
+ error, discard it. */
+
+ if (yychar <= YYEOF)
+ {
+ /* If at end of input, pop the error token,
+ then the rest of the stack, then return failure. */
+ if (yychar == YYEOF)
+ for (;;)
+ {
+
+ YYPOPSTACK;
+ if (yyssp == yyss)
+ YYABORT;
+ yydestruct ("Error: popping",
+ yystos[*yyssp], yyvsp);
+ }
+ }
+ else
+ {
+ yydestruct ("Error: discarding", yytoken, &yylval);
+ yychar = YYEMPTY;
+ }
+ }
+
+ /* Else will try to reuse look-ahead token after shifting the error
+ token. */
+ goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR. |
+`---------------------------------------------------*/
+yyerrorlab:
+
+#ifdef __GNUC__
+ /* Pacify GCC when the user code never invokes YYERROR and the label
+ yyerrorlab therefore never appears in user code. */
+ if (0)
+ goto yyerrorlab;
+#endif
+
+ yyvsp -= yylen;
+ yyssp -= yylen;
+ yystate = *yyssp;
+ goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR. |
+`-------------------------------------------------------------*/
+yyerrlab1:
+ yyerrstatus = 3; /* Each real token shifted decrements this. */
+
+ for (;;)
+ {
+ yyn = yypact[yystate];
+ if (yyn != YYPACT_NINF)
+ {
+ yyn += YYTERROR;
+ if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+ {
+ yyn = yytable[yyn];
+ if (0 < yyn)
+ break;
+ }
+ }
+
+ /* Pop the current state because it cannot handle the error token. */
+ if (yyssp == yyss)
+ YYABORT;
+
+
+ yydestruct ("Error: popping", yystos[yystate], yyvsp);
+ YYPOPSTACK;
+ yystate = *yyssp;
+ YY_STACK_PRINT (yyss, yyssp);
+ }
+
+ if (yyn == YYFINAL)
+ YYACCEPT;
+
+ *++yyvsp = yylval;
+
+
+ /* Shift the error token. */
+ YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+ yystate = yyn;
+ goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here. |
+`-------------------------------------*/
+yyacceptlab:
+ yyresult = 0;
+ goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here. |
+`-----------------------------------*/
+yyabortlab:
+ yydestruct ("Error: discarding lookahead",
+ yytoken, &yylval);
+ yychar = YYEMPTY;
+ yyresult = 1;
+ goto yyreturn;
+
+#ifndef yyoverflow
+/*----------------------------------------------.
+| yyoverflowlab -- parser overflow comes here. |
+`----------------------------------------------*/
+yyoverflowlab:
+ yyerror ("parser stack overflow");
+ yyresult = 2;
+ /* Fall through. */
+#endif
+
+yyreturn:
+#ifndef yyoverflow
+ if (yyss != yyssa)
+ YYSTACK_FREE (yyss);
+#endif
+ return yyresult;
+}
+
+
+#line 620 "pars0grm.y"
+
+
diff --git a/storage/xtradb/pars/pars0grm.y b/storage/xtradb/pars/pars0grm.y
new file mode 100644
index 00000000000..14d64f1826f
--- /dev/null
+++ b/storage/xtradb/pars/pars0grm.y
@@ -0,0 +1,635 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser: input file for the GNU Bison parser generator
+
+See pars0lex.l for instructions on how to generate the C files for
+the InnoDB parser.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%{
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h> /* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+%}
+
+%token PARS_INT_LIT
+%token PARS_FLOAT_LIT
+%token PARS_STR_LIT
+%token PARS_FIXBINARY_LIT
+%token PARS_BLOB_LIT
+%token PARS_NULL_LIT
+%token PARS_ID_TOKEN
+%token PARS_AND_TOKEN
+%token PARS_OR_TOKEN
+%token PARS_NOT_TOKEN
+%token PARS_GE_TOKEN
+%token PARS_LE_TOKEN
+%token PARS_NE_TOKEN
+%token PARS_PROCEDURE_TOKEN
+%token PARS_IN_TOKEN
+%token PARS_OUT_TOKEN
+%token PARS_BINARY_TOKEN
+%token PARS_BLOB_TOKEN
+%token PARS_INT_TOKEN
+%token PARS_INTEGER_TOKEN
+%token PARS_FLOAT_TOKEN
+%token PARS_CHAR_TOKEN
+%token PARS_IS_TOKEN
+%token PARS_BEGIN_TOKEN
+%token PARS_END_TOKEN
+%token PARS_IF_TOKEN
+%token PARS_THEN_TOKEN
+%token PARS_ELSE_TOKEN
+%token PARS_ELSIF_TOKEN
+%token PARS_LOOP_TOKEN
+%token PARS_WHILE_TOKEN
+%token PARS_RETURN_TOKEN
+%token PARS_SELECT_TOKEN
+%token PARS_SUM_TOKEN
+%token PARS_COUNT_TOKEN
+%token PARS_DISTINCT_TOKEN
+%token PARS_FROM_TOKEN
+%token PARS_WHERE_TOKEN
+%token PARS_FOR_TOKEN
+%token PARS_DDOT_TOKEN
+%token PARS_READ_TOKEN
+%token PARS_ORDER_TOKEN
+%token PARS_BY_TOKEN
+%token PARS_ASC_TOKEN
+%token PARS_DESC_TOKEN
+%token PARS_INSERT_TOKEN
+%token PARS_INTO_TOKEN
+%token PARS_VALUES_TOKEN
+%token PARS_UPDATE_TOKEN
+%token PARS_SET_TOKEN
+%token PARS_DELETE_TOKEN
+%token PARS_CURRENT_TOKEN
+%token PARS_OF_TOKEN
+%token PARS_CREATE_TOKEN
+%token PARS_TABLE_TOKEN
+%token PARS_INDEX_TOKEN
+%token PARS_UNIQUE_TOKEN
+%token PARS_CLUSTERED_TOKEN
+%token PARS_DOES_NOT_FIT_IN_MEM_TOKEN
+%token PARS_ON_TOKEN
+%token PARS_ASSIGN_TOKEN
+%token PARS_DECLARE_TOKEN
+%token PARS_CURSOR_TOKEN
+%token PARS_SQL_TOKEN
+%token PARS_OPEN_TOKEN
+%token PARS_FETCH_TOKEN
+%token PARS_CLOSE_TOKEN
+%token PARS_NOTFOUND_TOKEN
+%token PARS_TO_CHAR_TOKEN
+%token PARS_TO_NUMBER_TOKEN
+%token PARS_TO_BINARY_TOKEN
+%token PARS_BINARY_TO_NUMBER_TOKEN
+%token PARS_SUBSTR_TOKEN
+%token PARS_REPLSTR_TOKEN
+%token PARS_CONCAT_TOKEN
+%token PARS_INSTR_TOKEN
+%token PARS_LENGTH_TOKEN
+%token PARS_SYSDATE_TOKEN
+%token PARS_PRINTF_TOKEN
+%token PARS_ASSERT_TOKEN
+%token PARS_RND_TOKEN
+%token PARS_RND_STR_TOKEN
+%token PARS_ROW_PRINTF_TOKEN
+%token PARS_COMMIT_TOKEN
+%token PARS_ROLLBACK_TOKEN
+%token PARS_WORK_TOKEN
+%token PARS_UNSIGNED_TOKEN
+%token PARS_EXIT_TOKEN
+%token PARS_FUNCTION_TOKEN
+%token PARS_LOCK_TOKEN
+%token PARS_SHARE_TOKEN
+%token PARS_MODE_TOKEN
+
+%left PARS_AND_TOKEN PARS_OR_TOKEN
+%left PARS_NOT_TOKEN
+%left '=' '<' '>' PARS_GE_TOKEN PARS_LE_TOKEN
+%left '-' '+'
+%left '*' '/'
+%left NEG /* negation--unary minus */
+%left '%'
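+
+/* Note on the precedence declarations above: bison gives each later
+%left line a higher precedence, so e.g. a + b * 2 groups as a + (b * 2)
+and a = b OR c = d groups as (a = b) OR (c = d). */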
+
+/* Grammar follows */
+%%
+
+top_statement:
+ procedure_definition ';'
+
+statement:
+ stored_procedure_call
+ | predefined_procedure_call ';'
+ | while_statement ';'
+ | for_statement ';'
+ | exit_statement ';'
+ | if_statement ';'
+ | return_statement ';'
+ | assignment_statement ';'
+ | select_statement ';'
+ | insert_statement ';'
+ | row_printf_statement ';'
+ | delete_statement_searched ';'
+ | delete_statement_positioned ';'
+ | update_statement_searched ';'
+ | update_statement_positioned ';'
+ | open_cursor_statement ';'
+ | fetch_statement ';'
+ | close_cursor_statement ';'
+ | commit_statement ';'
+ | rollback_statement ';'
+ | create_table ';'
+ | create_index ';'
+;
+
+statement_list:
+ statement { $$ = que_node_list_add_last(NULL, $1); }
+ | statement_list statement
+ { $$ = que_node_list_add_last($1, $2); }
+;
+
+exp:
+ PARS_ID_TOKEN { $$ = $1;}
+ | function_name '(' exp_list ')'
+ { $$ = pars_func($1, $3); }
+ | PARS_INT_LIT { $$ = $1;}
+ | PARS_FLOAT_LIT { $$ = $1;}
+ | PARS_STR_LIT { $$ = $1;}
+ | PARS_FIXBINARY_LIT { $$ = $1;}
+ | PARS_BLOB_LIT { $$ = $1;}
+ | PARS_NULL_LIT { $$ = $1;}
+ | PARS_SQL_TOKEN { $$ = $1;}
+ | exp '+' exp { $$ = pars_op('+', $1, $3); }
+ | exp '-' exp { $$ = pars_op('-', $1, $3); }
+ | exp '*' exp { $$ = pars_op('*', $1, $3); }
+ | exp '/' exp { $$ = pars_op('/', $1, $3); }
+ | '-' exp %prec NEG { $$ = pars_op('-', $2, NULL); }
+ | '(' exp ')' { $$ = $2; }
+ | exp '=' exp { $$ = pars_op('=', $1, $3); }
+ | exp '<' exp { $$ = pars_op('<', $1, $3); }
+ | exp '>' exp { $$ = pars_op('>', $1, $3); }
+ | exp PARS_GE_TOKEN exp { $$ = pars_op(PARS_GE_TOKEN, $1, $3); }
+ | exp PARS_LE_TOKEN exp { $$ = pars_op(PARS_LE_TOKEN, $1, $3); }
+ | exp PARS_NE_TOKEN exp { $$ = pars_op(PARS_NE_TOKEN, $1, $3); }
+ | exp PARS_AND_TOKEN exp{ $$ = pars_op(PARS_AND_TOKEN, $1, $3); }
+ | exp PARS_OR_TOKEN exp { $$ = pars_op(PARS_OR_TOKEN, $1, $3); }
+ | PARS_NOT_TOKEN exp { $$ = pars_op(PARS_NOT_TOKEN, $2, NULL); }
+ | PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN
+ { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+ | PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN
+ { $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+;
+
+function_name:
+ PARS_TO_CHAR_TOKEN { $$ = &pars_to_char_token; }
+ | PARS_TO_NUMBER_TOKEN { $$ = &pars_to_number_token; }
+ | PARS_TO_BINARY_TOKEN { $$ = &pars_to_binary_token; }
+ | PARS_BINARY_TO_NUMBER_TOKEN
+ { $$ = &pars_binary_to_number_token; }
+ | PARS_SUBSTR_TOKEN { $$ = &pars_substr_token; }
+ | PARS_CONCAT_TOKEN { $$ = &pars_concat_token; }
+ | PARS_INSTR_TOKEN { $$ = &pars_instr_token; }
+ | PARS_LENGTH_TOKEN { $$ = &pars_length_token; }
+ | PARS_SYSDATE_TOKEN { $$ = &pars_sysdate_token; }
+ | PARS_RND_TOKEN { $$ = &pars_rnd_token; }
+ | PARS_RND_STR_TOKEN { $$ = &pars_rnd_str_token; }
+;
+
+question_mark_list:
+ /* Nothing */
+ | '?'
+ | question_mark_list ',' '?'
+;
+
+stored_procedure_call:
+ '{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
+ { $$ = pars_stored_procedure_call($2); }
+;
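+
+/* The rule above matches a call written as, e.g., { PROC_NAME(?, ?) }
+(PROC_NAME is a made-up example name). The question marks are accepted
+by question_mark_list but carry no value here: the action only passes
+the procedure name on. */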
+
+predefined_procedure_call:
+ predefined_procedure_name '(' exp_list ')'
+ { $$ = pars_procedure_call($1, $3); }
+;
+
+predefined_procedure_name:
+ PARS_REPLSTR_TOKEN { $$ = &pars_replstr_token; }
+ | PARS_PRINTF_TOKEN { $$ = &pars_printf_token; }
+ | PARS_ASSERT_TOKEN { $$ = &pars_assert_token; }
+;
+
+user_function_call:
+ PARS_ID_TOKEN '(' ')' { $$ = $1; }
+;
+
+table_list:
+ PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | table_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_list:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | variable_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+exp_list:
+ /* Nothing */ { $$ = NULL; }
+ | exp { $$ = que_node_list_add_last(NULL, $1);}
+ | exp_list ',' exp { $$ = que_node_list_add_last($1, $3); }
+;
+
+select_item:
+ exp { $$ = $1; }
+ | PARS_COUNT_TOKEN '(' '*' ')'
+ { $$ = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ sym_tab_add_int_lit(
+ pars_sym_tab_global, 1))); }
+ | PARS_COUNT_TOKEN '(' PARS_DISTINCT_TOKEN PARS_ID_TOKEN ')'
+ { $$ = pars_func(&pars_count_token,
+ que_node_list_add_last(NULL,
+ pars_func(&pars_distinct_token,
+ que_node_list_add_last(
+ NULL, $4)))); }
+ | PARS_SUM_TOKEN '(' exp ')'
+ { $$ = pars_func(&pars_sum_token,
+ que_node_list_add_last(NULL,
+ $3)); }
+;
+
+select_item_list:
+ /* Nothing */ { $$ = NULL; }
+ | select_item { $$ = que_node_list_add_last(NULL, $1); }
+ | select_item_list ',' select_item
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+select_list:
+ '*' { $$ = pars_select_list(&pars_star_denoter,
+ NULL); }
+ | select_item_list PARS_INTO_TOKEN variable_list
+ { $$ = pars_select_list($1, $3); }
+ | select_item_list { $$ = pars_select_list($1, NULL); }
+;
+
+search_condition:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_WHERE_TOKEN exp { $$ = $2; }
+;
+
+for_update_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_FOR_TOKEN PARS_UPDATE_TOKEN
+ { $$ = &pars_update_token; }
+;
+
+lock_shared_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN
+ { $$ = &pars_share_token; }
+;
+
+order_direction:
+ /* Nothing */ { $$ = &pars_asc_token; }
+ | PARS_ASC_TOKEN { $$ = &pars_asc_token; }
+ | PARS_DESC_TOKEN { $$ = &pars_desc_token; }
+;
+
+order_by_clause:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction
+ { $$ = pars_order_by($3, $4); }
+;
+
+select_statement:
+ PARS_SELECT_TOKEN select_list
+ PARS_FROM_TOKEN table_list
+ search_condition
+ for_update_clause
+ lock_shared_clause
+ order_by_clause { $$ = pars_select_statement($2, $4, $5,
+ $6, $7, $8); }
+;
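+
+/* Example of a statement matched by this rule (table, column and
+variable names are made up for illustration):
+
+	SELECT c INTO v FROM t WHERE c > 5 FOR UPDATE;
+
+The trailing ';' comes from the 'statement' rule. */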
+
+insert_statement_start:
+ PARS_INSERT_TOKEN PARS_INTO_TOKEN
+ PARS_ID_TOKEN { $$ = $3; }
+;
+
+insert_statement:
+ insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')'
+ { $$ = pars_insert_statement($1, $4, NULL); }
+ | insert_statement_start select_statement
+ { $$ = pars_insert_statement($1, NULL, $2); }
+;
+
+column_assignment:
+ PARS_ID_TOKEN '=' exp { $$ = pars_column_assignment($1, $3); }
+;
+
+column_assignment_list:
+ column_assignment { $$ = que_node_list_add_last(NULL, $1); }
+ | column_assignment_list ',' column_assignment
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+cursor_positioned:
+ PARS_WHERE_TOKEN
+ PARS_CURRENT_TOKEN PARS_OF_TOKEN
+ PARS_ID_TOKEN { $$ = $4; }
+;
+
+update_statement_start:
+ PARS_UPDATE_TOKEN PARS_ID_TOKEN
+ PARS_SET_TOKEN
+ column_assignment_list { $$ = pars_update_statement_start(FALSE,
+ $2, $4); }
+;
+
+update_statement_searched:
+ update_statement_start
+ search_condition { $$ = pars_update_statement($1, NULL, $2); }
+;
+
+update_statement_positioned:
+ update_statement_start
+ cursor_positioned { $$ = pars_update_statement($1, $2, NULL); }
+;
+
+delete_statement_start:
+ PARS_DELETE_TOKEN PARS_FROM_TOKEN
+ PARS_ID_TOKEN { $$ = pars_update_statement_start(TRUE,
+ $3, NULL); }
+;
+
+delete_statement_searched:
+ delete_statement_start
+ search_condition { $$ = pars_update_statement($1, NULL, $2); }
+;
+
+delete_statement_positioned:
+ delete_statement_start
+ cursor_positioned { $$ = pars_update_statement($1, $2, NULL); }
+;
+
+row_printf_statement:
+ PARS_ROW_PRINTF_TOKEN select_statement
+ { $$ = pars_row_printf_statement($2); }
+;
+
+assignment_statement:
+ PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp
+ { $$ = pars_assignment_statement($1, $3); }
+;
+
+elsif_element:
+ PARS_ELSIF_TOKEN
+ exp PARS_THEN_TOKEN statement_list
+ { $$ = pars_elsif_element($2, $4); }
+;
+
+elsif_list:
+ elsif_element { $$ = que_node_list_add_last(NULL, $1); }
+ | elsif_list elsif_element
+ { $$ = que_node_list_add_last($1, $2); }
+;
+
+else_part:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_ELSE_TOKEN statement_list
+ { $$ = $2; }
+ | elsif_list { $$ = $1; }
+;
+
+if_statement:
+ PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list
+ else_part
+ PARS_END_TOKEN PARS_IF_TOKEN
+ { $$ = pars_if_statement($2, $4, $5); }
+;
+
+while_statement:
+ PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list
+ PARS_END_TOKEN PARS_LOOP_TOKEN
+ { $$ = pars_while_statement($2, $4); }
+;
+
+for_statement:
+ PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN
+ exp PARS_DDOT_TOKEN exp
+ PARS_LOOP_TOKEN statement_list
+ PARS_END_TOKEN PARS_LOOP_TOKEN
+ { $$ = pars_for_statement($2, $4, $6, $8); }
+;
+
+exit_statement:
+ PARS_EXIT_TOKEN { $$ = pars_exit_statement(); }
+;
+
+return_statement:
+ PARS_RETURN_TOKEN { $$ = pars_return_statement(); }
+;
+
+open_cursor_statement:
+ PARS_OPEN_TOKEN PARS_ID_TOKEN
+ { $$ = pars_open_statement(
+ ROW_SEL_OPEN_CURSOR, $2); }
+;
+
+close_cursor_statement:
+ PARS_CLOSE_TOKEN PARS_ID_TOKEN
+ { $$ = pars_open_statement(
+ ROW_SEL_CLOSE_CURSOR, $2); }
+;
+
+fetch_statement:
+ PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list
+ { $$ = pars_fetch_statement($2, $4, NULL); }
+ | PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call
+ { $$ = pars_fetch_statement($2, NULL, $4); }
+;
+
+column_def:
+ PARS_ID_TOKEN type_name opt_column_len opt_unsigned opt_not_null
+ { $$ = pars_column_def($1, $2, $3, $4, $5); }
+;
+
+column_def_list:
+ column_def { $$ = que_node_list_add_last(NULL, $1); }
+ | column_def_list ',' column_def
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+opt_column_len:
+ /* Nothing */ { $$ = NULL; }
+ | '(' PARS_INT_LIT ')'
+ { $$ = $2; }
+;
+
+opt_unsigned:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_UNSIGNED_TOKEN
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+opt_not_null:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_NOT_TOKEN PARS_NULL_LIT
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+not_fit_in_memory:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_DOES_NOT_FIT_IN_MEM_TOKEN
+ { $$ = &pars_int_token;
+ /* pass any non-NULL pointer */ }
+;
+
+create_table:
+ PARS_CREATE_TOKEN PARS_TABLE_TOKEN
+ PARS_ID_TOKEN '(' column_def_list ')'
+ not_fit_in_memory { $$ = pars_create_table($3, $5, $7); }
+;
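+
+/* Example (made-up names): CREATE TABLE T (C INT, D CHAR);
+The table may additionally be marked DOES_NOT_FIT_IN_MEMORY between the
+closing ')' and the ';'. */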
+
+column_list:
+ PARS_ID_TOKEN { $$ = que_node_list_add_last(NULL, $1); }
+ | column_list ',' PARS_ID_TOKEN
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+unique_def:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_UNIQUE_TOKEN { $$ = &pars_unique_token; }
+;
+
+clustered_def:
+ /* Nothing */ { $$ = NULL; }
+ | PARS_CLUSTERED_TOKEN { $$ = &pars_clustered_token; }
+;
+
+create_index:
+ PARS_CREATE_TOKEN unique_def
+ clustered_def
+ PARS_INDEX_TOKEN
+ PARS_ID_TOKEN PARS_ON_TOKEN PARS_ID_TOKEN
+ '(' column_list ')' { $$ = pars_create_index($2, $3, $5, $7, $9); }
+;
+
+commit_statement:
+ PARS_COMMIT_TOKEN PARS_WORK_TOKEN
+ { $$ = pars_commit_statement(); }
+;
+
+rollback_statement:
+ PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN
+ { $$ = pars_rollback_statement(); }
+;
+
+type_name:
+ PARS_INT_TOKEN { $$ = &pars_int_token; }
+ | PARS_INTEGER_TOKEN { $$ = &pars_int_token; }
+ | PARS_CHAR_TOKEN { $$ = &pars_char_token; }
+ | PARS_BINARY_TOKEN { $$ = &pars_binary_token; }
+ | PARS_BLOB_TOKEN { $$ = &pars_blob_token; }
+;
+
+parameter_declaration:
+ PARS_ID_TOKEN PARS_IN_TOKEN type_name
+ { $$ = pars_parameter_declaration($1,
+ PARS_INPUT, $3); }
+ | PARS_ID_TOKEN PARS_OUT_TOKEN type_name
+ { $$ = pars_parameter_declaration($1,
+ PARS_OUTPUT, $3); }
+;
+
+parameter_declaration_list:
+ /* Nothing */ { $$ = NULL; }
+ | parameter_declaration { $$ = que_node_list_add_last(NULL, $1); }
+ | parameter_declaration_list ',' parameter_declaration
+ { $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_declaration:
+ PARS_ID_TOKEN type_name ';'
+ { $$ = pars_variable_declaration($1, $2); }
+;
+
+variable_declaration_list:
+ /* Nothing */
+ | variable_declaration
+ | variable_declaration_list variable_declaration
+;
+
+cursor_declaration:
+ PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN
+ PARS_IS_TOKEN select_statement ';'
+ { $$ = pars_cursor_declaration($3, $5); }
+;
+
+function_declaration:
+ PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';'
+ { $$ = pars_function_declaration($3); }
+;
+
+declaration:
+ cursor_declaration
+ | function_declaration
+;
+
+declaration_list:
+ /* Nothing */
+ | declaration
+ | declaration_list declaration
+;
+
+procedure_definition:
+ PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' parameter_declaration_list ')'
+ PARS_IS_TOKEN
+ variable_declaration_list
+ declaration_list
+ PARS_BEGIN_TOKEN
+ statement_list
+ PARS_END_TOKEN { $$ = pars_procedure_definition($2, $4,
+ $10); }
+;
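+
+/* For reference, a complete procedure accepted by this grammar looks
+like the following sketch (the procedure and variable names are made
+up for illustration, not taken from InnoDB code):
+
+	PROCEDURE P () IS
+		i INT;
+	BEGIN
+		i := 0;
+		WHILE i < 10 LOOP
+			i := i + 1;
+		END LOOP;
+		COMMIT WORK;
+	END;
+*/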
+
+%%
diff --git a/storage/xtradb/pars/pars0lex.l b/storage/xtradb/pars/pars0lex.l
new file mode 100644
index 00000000000..7bd39f7514b
--- /dev/null
+++ b/storage/xtradb/pars/pars0lex.l
@@ -0,0 +1,678 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%option nostdinit
+%option 8bit
+%option warn
+%option pointer
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyy_scan_buffer
+%option noyy_scan_bytes
+%option noyy_scan_string
+%option nounistd
+
+%{
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+#define malloc(A) ut_malloc(A)
+#define free(A) ut_free(A)
+#define realloc(P, A) ut_realloc(P, A)
+#define exit(A) ut_error
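+/* The macros above route the scanner's memory management and its fatal
+exit to InnoDB's ut_ primitives instead of the C library versions. */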
+
+#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
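+/* The YY_INPUT redefinition above makes the scanner pull its input
+characters from pars_get_lex_chars() rather than flex's default read
+from a stdio stream. */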
+
+/* String buffer for removing quotes */
+static ulint stringbuf_len_alloc = 0; /* Allocated length */
+static ulint stringbuf_len = 0; /* Current length */
+static char* stringbuf; /* Start of buffer */
+/** Appends a string to the buffer. */
+static
+void
+string_append(
+/*==========*/
+ const char* str, /*!< in: string to be appended */
+ ulint len) /*!< in: length of the string */
+{
+ if (stringbuf == NULL) {
+ stringbuf = malloc(1);
+ stringbuf_len_alloc = 1;
+ }
+
+ if (stringbuf_len + len > stringbuf_len_alloc) {
+ while (stringbuf_len + len > stringbuf_len_alloc) {
+ stringbuf_len_alloc <<= 1;
+ }
+ stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+ }
+
+ memcpy(stringbuf + stringbuf_len, str, len);
+ stringbuf_len += len;
+}
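+
+/* Example of the doubling policy above: starting from an empty buffer,
+appending "it" and then "'s" grows stringbuf_len_alloc 1 -> 2 -> 4 and
+leaves stringbuf_len == 4. */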
+
+%}
+
+DIGIT [0-9]
+ID [a-z_A-Z][a-z_A-Z0-9]*
+BOUND_LIT \:[a-z_A-Z0-9]+
+BOUND_ID \$[a-z_A-Z0-9]+
+
+%x comment
+%x quoted
+%x id
+%%
+
+{DIGIT}+ {
+ yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+ atoi(yytext));
+ return(PARS_INT_LIT);
+}
+
+{DIGIT}+"."{DIGIT}* {
+ ut_error; /* not implemented */
+
+ return(PARS_FLOAT_LIT);
+}
+
+{BOUND_LIT} {
+ ulint type;
+
+ yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+ yytext + 1, &type);
+
+ return((int) type);
+}
+
+{BOUND_ID} {
+ yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+ yytext + 1);
+
+ return(PARS_ID_TOKEN);
+}
+
+"'" {
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'. This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+ BEGIN(quoted);
+ stringbuf_len = 0;
+}
+<quoted>[^\']+ {
+ /* Got a sequence of characters other than "'":
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<quoted>"'"+ {
+ /* Got a sequence of "'" characters:
+ append half of them to string buffer,
+ as "''" represents a single "'".
+ We apply truncating division,
+ so that "'''" will result in "'". */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ string literal. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_str_lit(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+ return(PARS_STR_LIT);
+ }
+}
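+
+	/* Example: the input 'it''s' is scanned as "it", then "''" (one
+	quote appended, state kept), then "s", then the closing "'", and
+	is returned as the single string literal it's. */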
+
+\" {
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+ BEGIN(id);
+ stringbuf_len = 0;
+}
+<id>[^\"]+ {
+ /* Got a sequence of characters other than '"':
+ append to string buffer */
+ string_append(yytext, yyleng);
+}
+<id>\"+ {
+ /* Got a sequence of '"' characters:
+ append half of them to string buffer,
+ as '""' represents a single '"'.
+ We apply truncating division,
+ so that '"""' will result in '"'. */
+
+ string_append(yytext, yyleng / 2);
+
+ /* If we got an odd number of quotes, then the
+ last quote we got is the terminating quote.
+ At the end of the string, we return to the
+ initial start state and report the scanned
+ identifier. */
+
+ if (yyleng % 2) {
+ BEGIN(INITIAL);
+ yylval = sym_tab_add_id(
+ pars_sym_tab_global,
+ (byte*) stringbuf, stringbuf_len);
+
+ return(PARS_ID_TOKEN);
+ }
+}
+
+"NULL" {
+ yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+ return(PARS_NULL_LIT);
+}
+
+"SQL" {
+ /* Implicit cursor name */
+ yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+ (byte*) yytext, yyleng);
+ return(PARS_SQL_TOKEN);
+}
+
+"AND" {
+ return(PARS_AND_TOKEN);
+}
+
+"OR" {
+ return(PARS_OR_TOKEN);
+}
+
+"NOT" {
+ return(PARS_NOT_TOKEN);
+}
+
+"PROCEDURE" {
+ return(PARS_PROCEDURE_TOKEN);
+}
+
+"IN" {
+ return(PARS_IN_TOKEN);
+}
+
+"OUT" {
+ return(PARS_OUT_TOKEN);
+}
+
+"BINARY" {
+ return(PARS_BINARY_TOKEN);
+}
+
+"BLOB" {
+ return(PARS_BLOB_TOKEN);
+}
+
+"INT" {
+ return(PARS_INT_TOKEN);
+}
+
+"INTEGER" {
+ return(PARS_INT_TOKEN);
+}
+
+"FLOAT" {
+ return(PARS_FLOAT_TOKEN);
+}
+
+"CHAR" {
+ return(PARS_CHAR_TOKEN);
+}
+
+"IS" {
+ return(PARS_IS_TOKEN);
+}
+
+"BEGIN" {
+ return(PARS_BEGIN_TOKEN);
+}
+
+"END" {
+ return(PARS_END_TOKEN);
+}
+
+"IF" {
+ return(PARS_IF_TOKEN);
+}
+
+"THEN" {
+ return(PARS_THEN_TOKEN);
+}
+
+"ELSE" {
+ return(PARS_ELSE_TOKEN);
+}
+
+"ELSIF" {
+ return(PARS_ELSIF_TOKEN);
+}
+
+"LOOP" {
+ return(PARS_LOOP_TOKEN);
+}
+
+"WHILE" {
+ return(PARS_WHILE_TOKEN);
+}
+
+"RETURN" {
+ return(PARS_RETURN_TOKEN);
+}
+
+"SELECT" {
+ return(PARS_SELECT_TOKEN);
+}
+
+"SUM" {
+ return(PARS_SUM_TOKEN);
+}
+
+"COUNT" {
+ return(PARS_COUNT_TOKEN);
+}
+
+"DISTINCT" {
+ return(PARS_DISTINCT_TOKEN);
+}
+
+"FROM" {
+ return(PARS_FROM_TOKEN);
+}
+
+"WHERE" {
+ return(PARS_WHERE_TOKEN);
+}
+
+"FOR" {
+ return(PARS_FOR_TOKEN);
+}
+
+"READ" {
+ return(PARS_READ_TOKEN);
+}
+
+"ORDER" {
+ return(PARS_ORDER_TOKEN);
+}
+
+"BY" {
+ return(PARS_BY_TOKEN);
+}
+
+"ASC" {
+ return(PARS_ASC_TOKEN);
+}
+
+"DESC" {
+ return(PARS_DESC_TOKEN);
+}
+
+"INSERT" {
+ return(PARS_INSERT_TOKEN);
+}
+
+"INTO" {
+ return(PARS_INTO_TOKEN);
+}
+
+"VALUES" {
+ return(PARS_VALUES_TOKEN);
+}
+
+"UPDATE" {
+ return(PARS_UPDATE_TOKEN);
+}
+
+"SET" {
+ return(PARS_SET_TOKEN);
+}
+
+"DELETE" {
+ return(PARS_DELETE_TOKEN);
+}
+
+"CURRENT" {
+ return(PARS_CURRENT_TOKEN);
+}
+
+"OF" {
+ return(PARS_OF_TOKEN);
+}
+
+"CREATE" {
+ return(PARS_CREATE_TOKEN);
+}
+
+"TABLE" {
+ return(PARS_TABLE_TOKEN);
+}
+
+"INDEX" {
+ return(PARS_INDEX_TOKEN);
+}
+
+"UNIQUE" {
+ return(PARS_UNIQUE_TOKEN);
+}
+
+"CLUSTERED" {
+ return(PARS_CLUSTERED_TOKEN);
+}
+
+"DOES_NOT_FIT_IN_MEMORY" {
+ return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+
+"ON" {
+ return(PARS_ON_TOKEN);
+}
+
+"DECLARE" {
+ return(PARS_DECLARE_TOKEN);
+}
+
+"CURSOR" {
+ return(PARS_CURSOR_TOKEN);
+}
+
+"OPEN" {
+ return(PARS_OPEN_TOKEN);
+}
+
+"FETCH" {
+ return(PARS_FETCH_TOKEN);
+}
+
+"CLOSE" {
+ return(PARS_CLOSE_TOKEN);
+}
+
+"NOTFOUND" {
+ return(PARS_NOTFOUND_TOKEN);
+}
+
+"TO_CHAR" {
+ return(PARS_TO_CHAR_TOKEN);
+}
+
+"TO_NUMBER" {
+ return(PARS_TO_NUMBER_TOKEN);
+}
+
+"TO_BINARY" {
+ return(PARS_TO_BINARY_TOKEN);
+}
+
+"BINARY_TO_NUMBER" {
+ return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+
+"SUBSTR" {
+ return(PARS_SUBSTR_TOKEN);
+}
+
+"REPLSTR" {
+ return(PARS_REPLSTR_TOKEN);
+}
+
+"CONCAT" {
+ return(PARS_CONCAT_TOKEN);
+}
+
+"INSTR" {
+ return(PARS_INSTR_TOKEN);
+}
+
+"LENGTH" {
+ return(PARS_LENGTH_TOKEN);
+}
+
+"SYSDATE" {
+ return(PARS_SYSDATE_TOKEN);
+}
+
+"PRINTF" {
+ return(PARS_PRINTF_TOKEN);
+}
+
+"ASSERT" {
+ return(PARS_ASSERT_TOKEN);
+}
+
+"RND" {
+ return(PARS_RND_TOKEN);
+}
+
+"RND_STR" {
+ return(PARS_RND_STR_TOKEN);
+}
+
+"ROW_PRINTF" {
+ return(PARS_ROW_PRINTF_TOKEN);
+}
+
+"COMMIT" {
+ return(PARS_COMMIT_TOKEN);
+}
+
+"ROLLBACK" {
+ return(PARS_ROLLBACK_TOKEN);
+}
+
+"WORK" {
+ return(PARS_WORK_TOKEN);
+}
+
+"UNSIGNED" {
+ return(PARS_UNSIGNED_TOKEN);
+}
+
+"EXIT" {
+ return(PARS_EXIT_TOKEN);
+}
+
+"FUNCTION" {
+ return(PARS_FUNCTION_TOKEN);
+}
+
+"LOCK" {
+ return(PARS_LOCK_TOKEN);
+}
+
+"SHARE" {
+ return(PARS_SHARE_TOKEN);
+}
+
+"MODE" {
+ return(PARS_MODE_TOKEN);
+}
+
+{ID} {
+ yylval = sym_tab_add_id(pars_sym_tab_global,
+ (byte*)yytext,
+ ut_strlen(yytext));
+ return(PARS_ID_TOKEN);
+}
+
+".." {
+ return(PARS_DDOT_TOKEN);
+}
+
+":=" {
+ return(PARS_ASSIGN_TOKEN);
+}
+
+"<=" {
+ return(PARS_LE_TOKEN);
+}
+
+">=" {
+ return(PARS_GE_TOKEN);
+}
+
+"<>" {
+ return(PARS_NE_TOKEN);
+}
+
+"(" {
+
+ return((int)(*yytext));
+}
+
+"=" {
+
+ return((int)(*yytext));
+}
+
+">" {
+
+ return((int)(*yytext));
+}
+
+"<" {
+
+ return((int)(*yytext));
+}
+
+"," {
+
+ return((int)(*yytext));
+}
+
+";" {
+
+ return((int)(*yytext));
+}
+
+")" {
+
+ return((int)(*yytext));
+}
+
+"+" {
+
+ return((int)(*yytext));
+}
+
+"-" {
+
+ return((int)(*yytext));
+}
+
+"*" {
+
+ return((int)(*yytext));
+}
+
+"/" {
+
+ return((int)(*yytext));
+}
+
+"%" {
+
+ return((int)(*yytext));
+}
+
+"{" {
+
+ return((int)(*yytext));
+}
+
+"}" {
+
+ return((int)(*yytext));
+}
+
+"?" {
+
+ return((int)(*yytext));
+}
+
+"/*" BEGIN(comment); /* eat up comment */
+
+<comment>[^*]*
+<comment>"*"+[^*/]*
+<comment>"*"+"/" BEGIN(INITIAL);
+
+[ \t\n]+ /* eat up whitespace */
+
+
+. {
+ fprintf(stderr,"Unrecognized character: %02x\n",
+ *yytext);
+
+ ut_error;
+
+ return(0);
+}
+
+%%
+
+/**********************************************************************
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void)
+/*==================*/
+{
+ if (yy_buffer_stack)
+ yylex_destroy();
+ if (stringbuf)
+ free(stringbuf);
+ stringbuf = NULL;
+ stringbuf_len_alloc = stringbuf_len = 0;
+}
diff --git a/storage/xtradb/pars/pars0opt.c b/storage/xtradb/pars/pars0opt.c
new file mode 100644
index 00000000000..2e392ba4836
--- /dev/null
+++ b/storage/xtradb/pars/pars0opt.c
@@ -0,0 +1,1216 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0opt.c
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0opt.h"
+
+#ifdef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "lock0lock.h"
+
+#define OPT_EQUAL 1 /* comparison by = */
+#define OPT_COMPARISON 2 /* comparison by <, >, <=, or >= */
+
+#define OPT_NOT_COND 1
+#define OPT_END_COND 2
+#define OPT_TEST_COND 3
+#define OPT_SCROLL_COND 4
+
+
+/*******************************************************************//**
+Inverts a comparison operator.
+@return the equivalent operator when the order of the arguments is switched */
+static
+int
+opt_invert_cmp_op(
+/*==============*/
+ int op) /*!< in: operator */
+{
+ if (op == '<') {
+ return('>');
+ } else if (op == '>') {
+ return('<');
+ } else if (op == '=') {
+ return('=');
+ } else if (op == PARS_LE_TOKEN) {
+ return(PARS_GE_TOKEN);
+ } else if (op == PARS_GE_TOKEN) {
+ return(PARS_LE_TOKEN);
+ } else {
+ ut_error;
+ }
+
+ return(0);
+}
+
+/*******************************************************************//**
+Checks if the value of an expression can be calculated BEFORE the nth table
+in a join is accessed. If this is the case, it can possibly be used in an
+index search for the nth table.
+@return TRUE if already determined */
+static
+ibool
+opt_check_exp_determined_before(
+/*============================*/
+ que_node_t* exp, /*!< in: expression */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table) /*!< in: nth table will be accessed */
+{
+ func_node_t* func_node;
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ que_node_t* arg;
+ ulint i;
+
+ ut_ad(exp && sel_node);
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = exp;
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (!opt_check_exp_determined_before(arg, sel_node,
+ nth_table)) {
+ return(FALSE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(TRUE);
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = exp;
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return(TRUE);
+ }
+
+ for (i = 0; i < nth_table; i++) {
+
+ table = sel_node_get_nth_plan(sel_node, i)->table;
+
+ if (sym_node->table == table) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
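+
+/* Illustrative example (hypothetical tables, not taken from the code
+above): in SELECT ... FROM T1, T2 WHERE T2.C = T1.A + 5 the expression
+T1.A + 5 only references a column of T1, the first table of the join, so
+opt_check_exp_determined_before() returns TRUE for nth_table == 1 and the
+expression may be used in an index search for T2; an expression that
+references T2.C itself returns FALSE at that point. */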
+
+/*******************************************************************//**
+Looks in a comparison condition if a column value is already restricted by
+it BEFORE the nth table is accessed.
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_comparison_before(
+/*==================================*/
+ ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */
+ ulint col_no, /*!< in: column number */
+ func_node_t* search_cond, /*!< in: comparison condition */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table, /*!< in: nth table in a join (a query
+ from a single table is considered a
+ join of 1 table) */
+ ulint* op) /*!< out: comparison operator ('=',
+ PARS_GE_TOKEN, ... ); this is inverted
+ if the column appears on the right
+ side */
+{
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ que_node_t* exp;
+ que_node_t* arg;
+
+ ut_ad(search_cond);
+
+ ut_a((search_cond->func == '<')
+ || (search_cond->func == '>')
+ || (search_cond->func == '=')
+ || (search_cond->func == PARS_GE_TOKEN)
+ || (search_cond->func == PARS_LE_TOKEN));
+
+ table = sel_node_get_nth_plan(sel_node, nth_table)->table;
+
+ if ((cmp_type == OPT_EQUAL) && (search_cond->func != '=')) {
+
+ return(NULL);
+
+ } else if ((cmp_type == OPT_COMPARISON)
+ && (search_cond->func != '<')
+ && (search_cond->func != '>')
+ && (search_cond->func != PARS_GE_TOKEN)
+ && (search_cond->func != PARS_LE_TOKEN)) {
+
+ return(NULL);
+ }
+
+ arg = search_cond->args;
+
+ if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+ sym_node = arg;
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)
+ && (sym_node->col_no == col_no)) {
+
+ /* sym_node contains the desired column id */
+
+ /* Check if the expression on the right side of the
+ operator is already determined */
+
+ exp = que_node_get_next(arg);
+
+ if (opt_check_exp_determined_before(exp, sel_node,
+ nth_table)) {
+ *op = search_cond->func;
+
+ return(exp);
+ }
+ }
+ }
+
+ exp = search_cond->args;
+ arg = que_node_get_next(arg);
+
+ if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+ sym_node = arg;
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)
+ && (sym_node->col_no == col_no)) {
+
+ if (opt_check_exp_determined_before(exp, sel_node,
+ nth_table)) {
+ *op = opt_invert_cmp_op(search_cond->func);
+
+ return(exp);
+ }
+ }
+ }
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Looks in a search condition if a column value is already restricted by the
+search condition BEFORE the nth table is accessed. Takes into account that
+if we will fetch in an ascending order, we cannot utilize an upper limit for
+a column value; in a descending order, respectively, a lower limit.
+@return expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_cond_before(
+/*============================*/
+ ulint cmp_type, /*!< in: OPT_EQUAL, OPT_COMPARISON */
+ ulint col_no, /*!< in: column number */
+ func_node_t* search_cond, /*!< in: search condition or NULL */
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint nth_table, /*!< in: nth table in a join (a query
+ from a single table is considered a
+ join of 1 table) */
+ ulint* op) /*!< out: comparison operator ('=',
+ PARS_GE_TOKEN, ... ) */
+{
+ func_node_t* new_cond;
+ que_node_t* exp;
+
+ if (search_cond == NULL) {
+
+ return(NULL);
+ }
+
+ ut_a(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+ ut_a(search_cond->func != PARS_OR_TOKEN);
+ ut_a(search_cond->func != PARS_NOT_TOKEN);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = search_cond->args;
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ if (exp) {
+
+ return(exp);
+ }
+
+ new_cond = que_node_get_next(new_cond);
+
+ exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+ new_cond, sel_node,
+ nth_table, op);
+ return(exp);
+ }
+
+ exp = opt_look_for_col_in_comparison_before(cmp_type, col_no,
+ search_cond, sel_node,
+ nth_table, op);
+ if (exp == NULL) {
+
+ return(NULL);
+ }
+
+ /* If we will fetch in an ascending order, we cannot utilize an upper
+ limit for a column value; in a descending order, respectively, a lower
+ limit */
+
+ if (sel_node->asc && ((*op == '<') || (*op == PARS_LE_TOKEN))) {
+
+ return(NULL);
+
+ } else if (!sel_node->asc
+ && ((*op == '>') || (*op == PARS_GE_TOKEN))) {
+
+ return(NULL);
+ }
+
+ return(exp);
+}
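+
+/* Worked example, assuming an ascending fetch: for the condition
+COL >= 5 AND COL <= 10 this function returns the expression 5 with
+*op == PARS_GE_TOKEN, because the lower bound can start the index range
+scan; COL <= 10 is rejected here, since it only limits the range from the
+far end and is handled later as an end condition. For a descending fetch
+the roles of the two bounds are reversed. */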
+
+/*******************************************************************//**
+Calculates the goodness for an index according to a select node. The
+goodness is 4 times the number of leading index fields whose values we
+already know exactly in the query. If we have a comparison condition for
+an additional field, 2 points are added. If the index is unique and we know
+all the unique fields for the index, we add 1024 points. For a clustered index
+we add 1 point.
+@return goodness */
+static
+ulint
+opt_calc_index_goodness(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint nth_table, /*!< in: nth table in a join */
+ que_node_t** index_plan, /*!< in/out: comparison expressions for
+ this index */
+ ulint* last_op) /*!< out: last comparison operator, if
+ goodness > 1 */
+{
+ que_node_t* exp;
+ ulint goodness;
+ ulint n_fields;
+ ulint col_no;
+ ulint op;
+ ulint j;
+
+ goodness = 0;
+
+ /* Note that as higher level node pointers in the B-tree contain
+ page addresses as the last field, we must not put more fields in
+ the search tuple than dict_index_get_n_unique_in_tree(index); see
+ the note in btr_cur_search_to_nth_level. */
+
+ n_fields = dict_index_get_n_unique_in_tree(index);
+
+ for (j = 0; j < n_fields; j++) {
+
+ col_no = dict_index_get_nth_col_no(index, j);
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_EQUAL, col_no, sel_node->search_cond,
+ sel_node, nth_table, &op);
+ if (exp) {
+ /* The value for this column is exactly known already
+ at this stage of the join */
+
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 4;
+ } else {
+ /* Look for non-equality comparisons */
+
+ exp = opt_look_for_col_in_cond_before(
+ OPT_COMPARISON, col_no, sel_node->search_cond,
+ sel_node, nth_table, &op);
+ if (exp) {
+ index_plan[j] = exp;
+ *last_op = op;
+ goodness += 2;
+ }
+
+ break;
+ }
+ }
+
+ if (goodness >= 4 * dict_index_get_n_unique(index)) {
+ goodness += 1024;
+
+ if (dict_index_is_clust(index)) {
+
+ goodness += 1024;
+ }
+ }
+
+ /* We have to test for goodness here, as last_op may not be set */
+ if (goodness && dict_index_is_clust(index)) {
+
+ goodness++;
+ }
+
+ return(goodness);
+}
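+
+/* Worked example with a hypothetical non-unique secondary index on
+columns (A, B): for the condition A = 5 AND B > 7 the first field is
+matched exactly (+4) and the second has a comparison condition (+2),
+giving goodness 6; neither the uniqueness bonus nor the clustered-index
+bonus applies in this case. */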
+
+/*******************************************************************//**
+Calculates the number of matched fields based on an index goodness.
+@return number of exactly or partially matched fields */
+UNIV_INLINE
+ulint
+opt_calc_n_fields_from_goodness(
+/*============================*/
+ ulint goodness) /*!< in: goodness */
+{
+ return(((goodness % 1024) + 2) / 4);
+}
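+
+/* Worked arithmetic for the formula above: goodness 6 (one exact match
+plus one comparison) gives (6 % 1024 + 2) / 4 = 2 matched fields, and
+goodness 1032 (two exact matches plus the uniqueness bonus) gives
+(8 + 2) / 4 = 2 as well; the % 1024 strips the bonus points before the
+fields are counted. */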
+
+/*******************************************************************//**
+Converts a comparison operator to the corresponding search mode PAGE_CUR_GE,
+...
+@return search mode */
+UNIV_INLINE
+ulint
+opt_op_to_search_mode(
+/*==================*/
+ ibool asc, /*!< in: TRUE if the rows should be fetched in an
+ ascending order */
+ ulint op) /*!< in: operator '=', PARS_GE_TOKEN, ... */
+{
+ if (op == '=') {
+ if (asc) {
+ return(PAGE_CUR_GE);
+ } else {
+ return(PAGE_CUR_LE);
+ }
+ } else if (op == '<') {
+ ut_a(!asc);
+ return(PAGE_CUR_L);
+ } else if (op == '>') {
+ ut_a(asc);
+ return(PAGE_CUR_G);
+ } else if (op == PARS_GE_TOKEN) {
+ ut_a(asc);
+ return(PAGE_CUR_GE);
+ } else if (op == PARS_LE_TOKEN) {
+ ut_a(!asc);
+ return(PAGE_CUR_LE);
+ } else {
+ ut_error;
+ }
+
+ return(0);
+}
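+
+/* For instance, an ascending fetch whose last restriction is COL >= 5
+maps to PAGE_CUR_GE, positioning the cursor on the first record with
+COL >= 5, while a descending fetch bounded by COL <= 5 maps to
+PAGE_CUR_LE. */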
+
+/*******************************************************************//**
+Determines if a node is an argument node of a function node.
+@return TRUE if is an argument */
+static
+ibool
+opt_is_arg(
+/*=======*/
+ que_node_t* arg_node, /*!< in: possible argument node */
+ func_node_t* func_node) /*!< in: function node */
+{
+ que_node_t* arg;
+
+ arg = func_node->args;
+
+ while (arg) {
+ if (arg == arg_node) {
+
+ return(TRUE);
+ }
+
+ arg = que_node_get_next(arg);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Decides if the fetching of rows should be made in a descending order, and
+also checks that the chosen query plan produces a result which satisfies
+the order-by. */
+static
+void
+opt_check_order_by(
+/*===============*/
+ sel_node_t* sel_node) /*!< in: select node; asserts an error
+ if the plan does not agree with the
+ order-by */
+{
+ order_node_t* order_node;
+ dict_table_t* order_table;
+ ulint order_col_no;
+ plan_t* plan;
+ ulint i;
+
+ if (!sel_node->order_by) {
+
+ return;
+ }
+
+ order_node = sel_node->order_by;
+ order_col_no = order_node->column->col_no;
+ order_table = order_node->column->table;
+
+ /* If there is an order-by clause, the first non-exactly matched field
+ in the index used for the last table in the table list should be the
+ column defined in the order-by clause, and for all the other tables
+ we should get only at most a single row, otherwise we cannot presently
+ calculate the order-by, as we have no sort utility */
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (i < sel_node->n_tables - 1) {
+ ut_a(dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match);
+ } else {
+ ut_a(plan->table == order_table);
+
+ ut_a((dict_index_get_n_unique(plan->index)
+ <= plan->n_exact_match)
+ || (dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match)
+ == order_col_no));
+ }
+ }
+}
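+
+/* Illustrative example of the restriction enforced above (hypothetical
+tables): SELECT ... FROM T1, T2 WHERE T1.A = 5 AND T2.B = T1.C
+ORDER BY T2.D is accepted only if the plan for T1 matches all unique
+fields of its index exactly (at most one row), and the index chosen for
+T2 either does the same or has D as its first field that is not matched
+exactly; otherwise the assertions above fail, as there is no sort
+utility. */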
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes of the tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+static
+void
+opt_search_plan_for_table(
+/*======================*/
+ sel_node_t* sel_node, /*!< in: parsed select node */
+ ulint i, /*!< in: this is the ith table */
+ dict_table_t* table) /*!< in: table */
+{
+ plan_t* plan;
+ dict_index_t* index;
+ dict_index_t* best_index;
+ ulint n_fields;
+ ulint goodness;
+ ulint last_op = 75946965; /* Eliminate a Purify
+ warning */
+ ulint best_goodness;
+ ulint best_last_op = 0; /* remove warning */
+ que_node_t* index_plan[256];
+ que_node_t* best_index_plan[256];
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ plan->table = table;
+ plan->asc = sel_node->asc;
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+
+ /* Calculate goodness for each index of the table */
+
+ index = dict_table_get_first_index(table);
+ best_index = index; /* Eliminate compiler warning */
+ best_goodness = 0;
+
+ /* should be do ... until ? comment by Jani */
+ while (index) {
+ goodness = opt_calc_index_goodness(index, sel_node, i,
+ index_plan, &last_op);
+ if (goodness > best_goodness) {
+
+ best_index = index;
+ best_goodness = goodness;
+ n_fields = opt_calc_n_fields_from_goodness(goodness);
+
+ ut_memcpy(best_index_plan, index_plan,
+ n_fields * sizeof(void*));
+ best_last_op = last_op;
+ }
+
+ index = dict_table_get_next_index(index);
+ }
+
+ plan->index = best_index;
+
+ n_fields = opt_calc_n_fields_from_goodness(best_goodness);
+
+ if (n_fields == 0) {
+ plan->tuple = NULL;
+ plan->n_exact_match = 0;
+ } else {
+ plan->tuple = dtuple_create(pars_sym_tab_global->heap,
+ n_fields);
+ dict_index_copy_types(plan->tuple, plan->index, n_fields);
+
+ plan->tuple_exps = mem_heap_alloc(pars_sym_tab_global->heap,
+ n_fields * sizeof(void*));
+
+ ut_memcpy(plan->tuple_exps, best_index_plan,
+ n_fields * sizeof(void*));
+ if (best_last_op == '=') {
+ plan->n_exact_match = n_fields;
+ } else {
+ plan->n_exact_match = n_fields - 1;
+ }
+
+ plan->mode = opt_op_to_search_mode(sel_node->asc,
+ best_last_op);
+ }
+
+ if (dict_index_is_clust(best_index)
+ && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) {
+
+ plan->unique_search = TRUE;
+ } else {
+ plan->unique_search = FALSE;
+ }
+
+ plan->old_vers_heap = NULL;
+
+ btr_pcur_init(&(plan->pcur));
+ btr_pcur_init(&(plan->clust_pcur));
+}
+
+/*******************************************************************//**
+Looks at a comparison condition and decides if it can, and needs to, be tested for
+a table AFTER the table has been accessed.
+@return OPT_NOT_COND if not for this table, else OPT_END_COND,
+OPT_TEST_COND, or OPT_SCROLL_COND, where the last means that the
+condition need not be tested, except when scroll cursors are used */
+static
+ulint
+opt_classify_comparison(
+/*====================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: comparison condition */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint op;
+ ulint j;
+
+ ut_ad(cond && sel_node);
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* Check if the condition is determined after the ith table has been
+ accessed, but not after the (i - 1)th */
+
+ if (!opt_check_exp_determined_before(cond, sel_node, i + 1)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ if ((i > 0) && opt_check_exp_determined_before(cond, sel_node, i)) {
+
+ return(OPT_NOT_COND);
+ }
+
+ /* If the condition is an exact match condition used in constructing
+ the search tuple, it is classified as OPT_END_COND */
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ for (j = 0; j < plan->n_exact_match; j++) {
+
+ if (opt_is_arg(plan->tuple_exps[j], cond)) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* If the condition is a non-exact match condition used in
+ constructing the search tuple, it is classified as OPT_SCROLL_COND.
+ When the cursor is positioned, and if a non-scroll cursor is used,
+ there is no need to test this condition; if a scroll cursor is used
+ the testing is necessary when the cursor is reversed. */
+
+ if ((n_fields > plan->n_exact_match)
+ && opt_is_arg(plan->tuple_exps[n_fields - 1], cond)) {
+
+ return(OPT_SCROLL_COND);
+ }
+
+ /* If the condition is a non-exact match condition on the first field
+ in index for which there is no exact match, and it limits the search
+ range from the opposite side of the search tuple already BEFORE we
+ access the table, it is classified as OPT_END_COND */
+
+ if ((dict_index_get_n_fields(plan->index) > plan->n_exact_match)
+ && opt_look_for_col_in_comparison_before(
+ OPT_COMPARISON,
+ dict_index_get_nth_col_no(plan->index,
+ plan->n_exact_match),
+ cond, sel_node, i, &op)) {
+
+ if (sel_node->asc && ((op == '<') || (op == PARS_LE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+
+ if (!sel_node->asc && ((op == '>') || (op == PARS_GE_TOKEN))) {
+
+ return(OPT_END_COND);
+ }
+ }
+
+ /* Otherwise, cond is classified as OPT_TEST_COND */
+
+ return(OPT_TEST_COND);
+}
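+
+/* Classification example, assuming an ascending fetch on an index (A, B)
+whose search tuple was built from A = 5 AND B >= 7: the conjunct A = 5 is
+OPT_END_COND (an exact-match tuple expression), B >= 7 is OPT_SCROLL_COND
+(it positioned the cursor and only needs testing if a scroll cursor is
+reversed), an opposite-side bound such as B <= 9 is OPT_END_COND (it ends
+the range scan), and any other condition on this table, e.g. C > 0, is
+OPT_TEST_COND. */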
+
+/*******************************************************************//**
+Recursively looks for test conditions for a table in a join. */
+static
+void
+opt_find_test_conds(
+/*================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* cond) /*!< in: conjunction of search
+ conditions or NULL */
+{
+ func_node_t* new_cond;
+ ulint class;
+ plan_t* plan;
+
+ if (cond == NULL) {
+
+ return;
+ }
+
+ if (cond->func == PARS_AND_TOKEN) {
+ new_cond = cond->args;
+
+ opt_find_test_conds(sel_node, i, new_cond);
+
+ new_cond = que_node_get_next(new_cond);
+
+ opt_find_test_conds(sel_node, i, new_cond);
+
+ return;
+ }
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ class = opt_classify_comparison(sel_node, i, cond);
+
+ if (class == OPT_END_COND) {
+ UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond);
+
+ } else if (class == OPT_TEST_COND) {
+ UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond);
+
+ }
+}
+
+/*******************************************************************//**
+Normalizes a list of comparison conditions so that a column of the table
+appears on the left side of the comparison if possible. This is accomplished
+by switching the arguments of the operator. */
+static
+void
+opt_normalize_cmp_conds(
+/*====================*/
+ func_node_t* cond, /*!< in: first in a list of comparison
+ conditions, or NULL */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* arg1;
+ que_node_t* arg2;
+ sym_node_t* sym_node;
+
+ while (cond) {
+ arg1 = cond->args;
+ arg2 = que_node_get_next(arg1);
+
+ if (que_node_get_type(arg2) == QUE_NODE_SYMBOL) {
+
+ sym_node = arg2;
+
+ if ((sym_node->token_type == SYM_COLUMN)
+ && (sym_node->table == table)) {
+
+ /* Switch the order of the arguments */
+
+ cond->args = arg2;
+ que_node_list_add_last(NULL, arg2);
+ que_node_list_add_last(arg2, arg1);
+
+ /* Invert the operator */
+ cond->func = opt_invert_cmp_op(cond->func);
+ }
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+}
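+
+/* For instance, an end condition written as 5 < T.A is rewritten here as
+T.A > 5: the arguments are swapped and the operator is inverted with
+opt_invert_cmp_op(), so that later code can assume the column is on the
+left-hand side. */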
+
+/*******************************************************************//**
+Finds out the search condition conjuncts we can, and need to, test as the ith
+table in a join is accessed. The search tuple can eliminate the need to test
+some conjuncts. */
+static
+void
+opt_determine_and_normalize_test_conds(
+/*===================================*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ UT_LIST_INIT(plan->end_conds);
+ UT_LIST_INIT(plan->other_conds);
+
+ /* Recursively go through the conjuncts and classify them */
+
+ opt_find_test_conds(sel_node, i, sel_node->search_cond);
+
+ opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
+ plan->table);
+
+ ut_a(UT_LIST_GET_LEN(plan->end_conds) >= plan->n_exact_match);
+}
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, sets a value
+indirection that points to the occurrence in the column list, unless the
+column occurrence we are looking at is itself the one in the column list, in
+which case nothing is done.
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+ ibool copy_val, /*!< in: if TRUE, new found columns are
+ added as columns to copy */
+ dict_index_t* index, /*!< in: index of the table to use */
+ sym_node_list_t* col_list, /*!< in: base node of a list where
+ to add new found columns */
+ plan_t* plan, /*!< in: plan or NULL */
+ que_node_t* exp) /*!< in: expression or condition or
+ NULL */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* col_node;
+ ulint col_pos;
+
+ if (exp == NULL) {
+
+ return;
+ }
+
+ if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+ func_node = exp;
+
+ arg = func_node->args;
+
+ while (arg) {
+ opt_find_all_cols(copy_val, index, col_list, plan,
+ arg);
+ arg = que_node_get_next(arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+ sym_node = exp;
+
+ if (sym_node->token_type != SYM_COLUMN) {
+
+ return;
+ }
+
+ if (sym_node->table != index->table) {
+
+ return;
+ }
+
+ /* Look for an occurrence of the same column in the plan column
+ list */
+
+ col_node = UT_LIST_GET_FIRST(*col_list);
+
+ while (col_node) {
+ if (col_node->col_no == sym_node->col_no) {
+
+ if (col_node == sym_node) {
+ /* sym_node was already in a list: do
+ nothing */
+
+ return;
+ }
+
+ /* Put an indirection */
+ sym_node->indirection = col_node;
+ sym_node->alias = col_node;
+
+ return;
+ }
+
+ col_node = UT_LIST_GET_NEXT(col_var_list, col_node);
+ }
+
+ /* The same column did not occur in the list: add it */
+
+ UT_LIST_ADD_LAST(col_var_list, *col_list, sym_node);
+
+ sym_node->copy_val = copy_val;
+
+ /* Fill in the field_no fields in sym_node */
+
+ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
+ dict_table_get_first_index(index->table), sym_node->col_no);
+ if (!dict_index_is_clust(index)) {
+
+ ut_a(plan);
+
+ col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no);
+
+ if (col_pos == ULINT_UNDEFINED) {
+
+ plan->must_get_clust = TRUE;
+ }
+
+ sym_node->field_nos[SYM_SEC_FIELD_NO] = col_pos;
+ }
+}
+
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in conditions which are
+not yet determined AFTER the join operation has fetched a row in the ith
+table. The values of these columns must be copied to dynamic memory for
+later use. */
+static
+void
+opt_find_copy_cols(
+/*===============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i, /*!< in: ith table in the join */
+ func_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ func_node_t* new_cond;
+ plan_t* plan;
+
+ if (search_cond == NULL) {
+
+ return;
+ }
+
+ ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+
+ if (search_cond->func == PARS_AND_TOKEN) {
+ new_cond = search_cond->args;
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ new_cond = que_node_get_next(new_cond);
+
+ opt_find_copy_cols(sel_node, i, new_cond);
+
+ return;
+ }
+
+ if (!opt_check_exp_determined_before(search_cond, sel_node, i + 1)) {
+
+ /* Any ith table columns occurring in search_cond should be
+ copied, as this condition cannot be tested already on the
+ fetch from the ith table */
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+ search_cond);
+ }
+}
+
+/*******************************************************************//**
+Classifies the table columns according to whether we use the column only while
+holding the latch on the page, or whether we have to copy the column value to
+dynamic memory. Puts the first occurrence of a column into the column list of
+the plan node, and puts indirections to later occurrences of the column. */
+static
+void
+opt_classify_cols(
+/*==============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint i) /*!< in: ith table in the join */
+{
+ plan_t* plan;
+ que_node_t* exp;
+
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ /* The final value of the following field will depend on the
+ environment of the select statement: */
+
+ plan->must_get_clust = FALSE;
+
+ UT_LIST_INIT(plan->columns);
+
+ /* All select list columns should be copied: therefore TRUE as the
+ first argument */
+
+ exp = sel_node->select_list;
+
+ while (exp) {
+ opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+ exp);
+ exp = que_node_get_next(exp);
+ }
+
+ opt_find_copy_cols(sel_node, i, sel_node->search_cond);
+
+ /* All remaining columns in the search condition are temporary
+ columns: therefore FALSE */
+
+ opt_find_all_cols(FALSE, plan->index, &(plan->columns), plan,
+ sel_node->search_cond);
+}
+
+/*******************************************************************//**
+Fills in the info in plan which is used in accessing a clustered index
+record. The columns must already be classified for the plan node. */
+static
+void
+opt_clust_access(
+/*=============*/
+ sel_node_t* sel_node, /*!< in: select node */
+ ulint n) /*!< in: nth table in select */
+{
+ plan_t* plan;
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dict_index_t* index;
+ mem_heap_t* heap;
+ ulint n_fields;
+ ulint pos;
+ ulint i;
+
+ plan = sel_node_get_nth_plan(sel_node, n);
+
+ index = plan->index;
+
+ /* The final value of the following field depends on the environment
+ of the select statement: */
+
+ plan->no_prefetch = FALSE;
+
+ if (dict_index_is_clust(index)) {
+ plan->clust_map = NULL;
+ plan->clust_ref = NULL;
+
+ return;
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ n_fields = dict_index_get_n_unique(clust_index);
+
+ heap = pars_sym_tab_global->heap;
+
+ plan->clust_ref = dtuple_create(heap, n_fields);
+
+ dict_index_copy_types(plan->clust_ref, clust_index, n_fields);
+
+ plan->clust_map = mem_heap_alloc(heap, n_fields * sizeof(ulint));
+
+ for (i = 0; i < n_fields; i++) {
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ /* We optimize here only queries to InnoDB's internal system
+ tables, and they should not contain column prefix indexes. */
+
+ if (dict_index_get_nth_field(index, pos)->prefix_len != 0
+ || dict_index_get_nth_field(clust_index, i)
+ ->prefix_len != 0) {
+ fprintf(stderr,
+ "InnoDB: Error in pars0opt.c:"
+ " table %s has prefix_len != 0\n",
+ index->table_name);
+ }
+
+ *(plan->clust_map + i) = pos;
+
+ ut_ad(pos != ULINT_UNDEFINED);
+ }
+}
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes of the tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+ sel_node_t* sel_node) /*!< in: parsed select node */
+{
+ sym_node_t* table_node;
+ dict_table_t* table;
+ order_node_t* order_by;
+ ulint i;
+
+ sel_node->plans = mem_heap_alloc(pars_sym_tab_global->heap,
+ sel_node->n_tables * sizeof(plan_t));
+
+ /* Analyze the search condition to find out what we know at each
+ join stage about the conditions that the columns of a table should
+ satisfy */
+
+ table_node = sel_node->table_list;
+
+ if (sel_node->order_by == NULL) {
+ sel_node->asc = TRUE;
+ } else {
+ order_by = sel_node->order_by;
+
+ sel_node->asc = order_by->asc;
+ }
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ table = table_node->table;
+
+ /* Choose index through which to access the table */
+
+ opt_search_plan_for_table(sel_node, i, table);
+
+ /* Determine the search condition conjuncts we can test at
+ this table; normalize the end conditions */
+
+ opt_determine_and_normalize_test_conds(sel_node, i);
+
+ table_node = que_node_get_next(table_node);
+ }
+
+ table_node = sel_node->table_list;
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+
+ /* Classify the table columns into those we only need to access
+ but not copy, and to those we must copy to dynamic memory */
+
+ opt_classify_cols(sel_node, i);
+
+ /* Calculate possible info for accessing the clustered index
+ record */
+
+ opt_clust_access(sel_node, i);
+
+ table_node = que_node_get_next(table_node);
+ }
+
+ /* Check that the plan obeys a possible order-by clause: if not,
+ an assertion error occurs */
+
+ opt_check_order_by(sel_node);
+
+#ifdef UNIV_SQL_DEBUG
+ opt_print_query_plan(sel_node);
+#endif
+}
+
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+ sel_node_t* sel_node) /*!< in: select node */
+{
+ plan_t* plan;
+ ulint n_fields;
+ ulint i;
+
+ fputs("QUERY PLAN FOR A SELECT NODE\n", stderr);
+
+ fputs(sel_node->asc ? "Asc. search; " : "Desc. search; ", stderr);
+
+ if (sel_node->set_x_locks) {
+ fputs("sets row x-locks; ", stderr);
+ ut_a(sel_node->row_lock_mode == LOCK_X);
+ ut_a(!sel_node->consistent_read);
+ } else if (sel_node->consistent_read) {
+ fputs("consistent read; ", stderr);
+ } else {
+ ut_a(sel_node->row_lock_mode == LOCK_S);
+ fputs("sets row s-locks; ", stderr);
+ }
+
+ putc('\n', stderr);
+
+ for (i = 0; i < sel_node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(sel_node, i);
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+ } else {
+ n_fields = 0;
+ }
+
+ fputs("Table ", stderr);
+ dict_index_name_print(stderr, NULL, plan->index);
+ fprintf(stderr,"; exact m. %lu, match %lu, end conds %lu\n",
+ (unsigned long) plan->n_exact_match,
+ (unsigned long) n_fields,
+ (unsigned long) UT_LIST_GET_LEN(plan->end_conds));
+ }
+}
diff --git a/storage/xtradb/pars/pars0pars.c b/storage/xtradb/pars/pars0pars.c
new file mode 100644
index 00000000000..9faf36d00a8
--- /dev/null
+++ b/storage/xtradb/pars/pars0pars.c
@@ -0,0 +1,2196 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0pars.c
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+/* Historical note: Innobase executed its first SQL string (CREATE TABLE)
+on 1/27/1998 */
+
+#include "pars0pars.h"
+
+#ifdef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0opt.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+UNIV_INTERN ibool pars_print_lexed = FALSE;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query: the code is
+NOT re-entrant */
+UNIV_INTERN sym_tab_t* pars_sym_tab_global;
+
+/* Global variables used to denote certain reserved words, used in
+constructing the parsing tree */
+
+UNIV_INTERN pars_res_word_t pars_to_char_token = {PARS_TO_CHAR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_to_number_token = {PARS_TO_NUMBER_TOKEN};
+UNIV_INTERN pars_res_word_t pars_to_binary_token = {PARS_TO_BINARY_TOKEN};
+UNIV_INTERN pars_res_word_t pars_binary_to_number_token = {PARS_BINARY_TO_NUMBER_TOKEN};
+UNIV_INTERN pars_res_word_t pars_substr_token = {PARS_SUBSTR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_replstr_token = {PARS_REPLSTR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_concat_token = {PARS_CONCAT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_instr_token = {PARS_INSTR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_length_token = {PARS_LENGTH_TOKEN};
+UNIV_INTERN pars_res_word_t pars_sysdate_token = {PARS_SYSDATE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_printf_token = {PARS_PRINTF_TOKEN};
+UNIV_INTERN pars_res_word_t pars_assert_token = {PARS_ASSERT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_rnd_token = {PARS_RND_TOKEN};
+UNIV_INTERN pars_res_word_t pars_rnd_str_token = {PARS_RND_STR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_count_token = {PARS_COUNT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_sum_token = {PARS_SUM_TOKEN};
+UNIV_INTERN pars_res_word_t pars_distinct_token = {PARS_DISTINCT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_binary_token = {PARS_BINARY_TOKEN};
+UNIV_INTERN pars_res_word_t pars_blob_token = {PARS_BLOB_TOKEN};
+UNIV_INTERN pars_res_word_t pars_int_token = {PARS_INT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_char_token = {PARS_CHAR_TOKEN};
+UNIV_INTERN pars_res_word_t pars_float_token = {PARS_FLOAT_TOKEN};
+UNIV_INTERN pars_res_word_t pars_update_token = {PARS_UPDATE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_asc_token = {PARS_ASC_TOKEN};
+UNIV_INTERN pars_res_word_t pars_desc_token = {PARS_DESC_TOKEN};
+UNIV_INTERN pars_res_word_t pars_open_token = {PARS_OPEN_TOKEN};
+UNIV_INTERN pars_res_word_t pars_close_token = {PARS_CLOSE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_share_token = {PARS_SHARE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_unique_token = {PARS_UNIQUE_TOKEN};
+UNIV_INTERN pars_res_word_t pars_clustered_token = {PARS_CLUSTERED_TOKEN};
+
+/** Global variable used to denote the '*' in SELECT * FROM.. */
+UNIV_INTERN ulint pars_star_denoter = 12345678;
+
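+/* A sketch of the internal SQL dialect that this parser accepts; the
+procedure below is purely illustrative and does not appear elsewhere in
+the code:
+
+  PROCEDURE P () IS
+  i INT;
+  DECLARE CURSOR c IS SELECT COL1 FROM T WHERE COL2 = 5 FOR UPDATE;
+  BEGIN
+    OPEN c;
+    FETCH c INTO i;
+    CLOSE c;
+    COMMIT WORK;
+  END;
+*/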
+
+/*********************************************************************//**
+Determines the class of a function code.
+@return function class: PARS_FUNC_ARITH, ... */
+static
+ulint
+pars_func_get_class(
+/*================*/
+ int func) /*!< in: function code: '=', PARS_GE_TOKEN, ... */
+{
+ switch (func) {
+ case '+': case '-': case '*': case '/':
+ return(PARS_FUNC_ARITH);
+
+ case '=': case '<': case '>':
+ case PARS_GE_TOKEN: case PARS_LE_TOKEN: case PARS_NE_TOKEN:
+ return(PARS_FUNC_CMP);
+
+ case PARS_AND_TOKEN: case PARS_OR_TOKEN: case PARS_NOT_TOKEN:
+ return(PARS_FUNC_LOGICAL);
+
+ case PARS_COUNT_TOKEN: case PARS_SUM_TOKEN:
+ return(PARS_FUNC_AGGREGATE);
+
+ case PARS_TO_CHAR_TOKEN:
+ case PARS_TO_NUMBER_TOKEN:
+ case PARS_TO_BINARY_TOKEN:
+ case PARS_BINARY_TO_NUMBER_TOKEN:
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ case PARS_SYSDATE_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+ case PARS_PRINTF_TOKEN:
+ case PARS_ASSERT_TOKEN:
+ case PARS_RND_TOKEN:
+ case PARS_RND_STR_TOKEN:
+ case PARS_REPLSTR_TOKEN:
+ return(PARS_FUNC_PREDEFINED);
+
+ default:
+ return(PARS_FUNC_OTHER);
+ }
+}
+
+/*********************************************************************//**
+Parses an operator or predefined function expression.
+@return own: function node in a query tree */
+static
+func_node_t*
+pars_func_low(
+/*==========*/
+ int func, /*!< in: function token code */
+ que_node_t* arg) /*!< in: first argument in the argument list */
+{
+ func_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(func_node_t));
+
+ node->common.type = QUE_NODE_FUNC;
+ dfield_set_data(&(node->common.val), NULL, 0);
+ node->common.val_buf_size = 0;
+
+ node->func = func;
+
+ node->class = pars_func_get_class(func);
+
+ node->args = arg;
+
+ UT_LIST_ADD_LAST(func_node_list, pars_sym_tab_global->func_node_list,
+ node);
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a function expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+ que_node_t* res_word,/*!< in: function name reserved word */
+ que_node_t* arg) /*!< in: first argument in the argument list */
+{
+ return(pars_func_low(((pars_res_word_t*)res_word)->code, arg));
+}
+
+/*********************************************************************//**
+Parses an operator expression.
+@return own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+ int func, /*!< in: operator token code */
+ que_node_t* arg1, /*!< in: first argument */
+ que_node_t* arg2) /*!< in: second argument or NULL for a unary
+ operator */
+{
+ que_node_list_add_last(NULL, arg1);
+
+ if (arg2) {
+ que_node_list_add_last(arg1, arg2);
+ }
+
+ return(pars_func_low(func, arg1));
+}
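+
+/* Usage sketch (illustrative names): the grammar actions build expression
+trees with these helpers, so a source text such as A + 5 > B results in
+pars_op('>', pars_op('+', a_node, five_node), b_node), i.e. a comparison
+func node whose first argument is an arithmetic func node. */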
+
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+ sym_node_t* column, /*!< in: column name */
+ pars_res_word_t* asc) /*!< in: &pars_asc_token or &pars_desc_token */
+{
+ order_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(order_node_t));
+
+ node->common.type = QUE_NODE_ORDER;
+
+ node->column = column;
+
+ if (asc == &pars_asc_token) {
+ node->asc = TRUE;
+ } else {
+ ut_a(asc == &pars_desc_token);
+ node->asc = FALSE;
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Determine if a data type is a built-in string data type of the InnoDB
+SQL parser.
+@return TRUE if string data type */
+static
+ibool
+pars_is_string_type(
+/*================*/
+ ulint mtype) /*!< in: main data type */
+{
+ switch (mtype) {
+ case DATA_VARCHAR: case DATA_CHAR:
+ case DATA_FIXBINARY: case DATA_BINARY:
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Resolves the data type of a function in an expression. The argument data
+types must already be resolved. */
+static
+void
+pars_resolve_func_data_type(
+/*========================*/
+ func_node_t* node) /*!< in: function node */
+{
+ que_node_t* arg;
+
+ ut_a(que_node_get_type(node) == QUE_NODE_FUNC);
+
+ arg = node->args;
+
+ switch (node->func) {
+ case PARS_SUM_TOKEN:
+ case '+': case '-': case '*': case '/':
+ /* Inherit the data type from the first argument (which must
+ not be the SQL null literal whose type is DATA_ERROR) */
+
+ dtype_copy(que_node_get_data_type(node),
+ que_node_get_data_type(arg));
+
+ ut_a(dtype_get_mtype(que_node_get_data_type(node))
+ == DATA_INT);
+ break;
+
+ case PARS_COUNT_TOKEN:
+ ut_a(arg);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_TO_CHAR_TOKEN:
+ case PARS_RND_STR_TOKEN:
+ ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT);
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ case PARS_TO_BINARY_TOKEN:
+ if (dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT) {
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ } else {
+ dtype_set(que_node_get_data_type(node), DATA_BINARY,
+ 0, 0);
+ }
+ break;
+
+ case PARS_TO_NUMBER_TOKEN:
+ case PARS_BINARY_TO_NUMBER_TOKEN:
+ case PARS_LENGTH_TOKEN:
+ case PARS_INSTR_TOKEN:
+ ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype));
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_SYSDATE_TOKEN:
+ ut_a(arg == NULL);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_SUBSTR_TOKEN:
+ case PARS_CONCAT_TOKEN:
+ ut_a(pars_is_string_type(que_node_get_data_type(arg)->mtype));
+ dtype_set(que_node_get_data_type(node), DATA_VARCHAR,
+ DATA_ENGLISH, 0);
+ break;
+
+ case '>': case '<': case '=':
+ case PARS_GE_TOKEN:
+ case PARS_LE_TOKEN:
+ case PARS_NE_TOKEN:
+ case PARS_AND_TOKEN:
+ case PARS_OR_TOKEN:
+ case PARS_NOT_TOKEN:
+ case PARS_NOTFOUND_TOKEN:
+
+ /* We currently have no boolean type: use integer type */
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ case PARS_RND_TOKEN:
+ ut_a(dtype_get_mtype(que_node_get_data_type(arg)) == DATA_INT);
+ dtype_set(que_node_get_data_type(node), DATA_INT, 0, 4);
+ break;
+
+ default:
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Resolves the meaning of variables in an expression and the data types of
+functions. It is an error if some identifier cannot be resolved here. */
+static
+void
+pars_resolve_exp_variables_and_types(
+/*=================================*/
+ sel_node_t* select_node, /*!< in: select node or NULL; if
+ this is not NULL then the variable
+ sym nodes are added to the
+ copy_variables list of select_node */
+ que_node_t* exp_node) /*!< in: expression */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ sym_node_t* node;
+
+ ut_a(exp_node);
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+ func_node = exp_node;
+
+ arg = func_node->args;
+
+ while (arg) {
+ pars_resolve_exp_variables_and_types(select_node, arg);
+
+ arg = que_node_get_next(arg);
+ }
+
+ pars_resolve_func_data_type(func_node);
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+ sym_node = exp_node;
+
+ if (sym_node->resolved) {
+
+ return;
+ }
+
+ /* Not resolved yet: look in the symbol table for a variable
+ or a cursor or a function with the same name */
+
+ node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+ while (node) {
+ if (node->resolved
+ && ((node->token_type == SYM_VAR)
+ || (node->token_type == SYM_CURSOR)
+ || (node->token_type == SYM_FUNCTION))
+ && node->name
+ && (sym_node->name_len == node->name_len)
+ && (ut_memcmp(sym_node->name, node->name,
+ node->name_len) == 0)) {
+
+ /* Found a variable or a cursor declared with
+ the same name */
+
+ break;
+ }
+
+ node = UT_LIST_GET_NEXT(sym_list, node);
+ }
+
+ if (!node) {
+ fprintf(stderr, "PARSER ERROR: Unresolved identifier %s\n",
+ sym_node->name);
+ }
+
+ ut_a(node);
+
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_IMPLICIT_VAR;
+ sym_node->alias = node;
+ sym_node->indirection = node;
+
+ if (select_node) {
+ UT_LIST_ADD_LAST(col_var_list, select_node->copy_variables,
+ sym_node);
+ }
+
+ dfield_set_type(que_node_get_val(sym_node),
+ que_node_get_data_type(node));
+}
+
+/*********************************************************************//**
+Resolves the meaning of variables in an expression list. It is an error if
+some identifier cannot be resolved here. Resolves also the data types of
+functions. */
+static
+void
+pars_resolve_exp_list_variables_and_types(
+/*======================================*/
+ sel_node_t* select_node, /*!< in: select node or NULL */
+ que_node_t* exp_node) /*!< in: expression list first node, or
+ NULL */
+{
+ while (exp_node) {
+ pars_resolve_exp_variables_and_types(select_node, exp_node);
+
+ exp_node = que_node_get_next(exp_node);
+ }
+}
+
+/*********************************************************************//**
+Resolves the columns in an expression. */
+static
+void
+pars_resolve_exp_columns(
+/*=====================*/
+ sym_node_t* table_node, /*!< in: first node in a table list */
+ que_node_t* exp_node) /*!< in: expression */
+{
+ func_node_t* func_node;
+ que_node_t* arg;
+ sym_node_t* sym_node;
+ dict_table_t* table;
+ sym_node_t* t_node;
+ ulint n_cols;
+ ulint i;
+
+ ut_a(exp_node);
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+ func_node = exp_node;
+
+ arg = func_node->args;
+
+ while (arg) {
+ pars_resolve_exp_columns(table_node, arg);
+
+ arg = que_node_get_next(arg);
+ }
+
+ return;
+ }
+
+ ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+ sym_node = exp_node;
+
+ if (sym_node->resolved) {
+
+ return;
+ }
+
+ /* Not resolved yet: look in the table list for a column with the
+ same name */
+
+ t_node = table_node;
+
+ while (t_node) {
+ table = t_node->table;
+
+ n_cols = dict_table_get_n_cols(table);
+
+ for (i = 0; i < n_cols; i++) {
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, i);
+ const char* col_name
+ = dict_table_get_col_name(table, i);
+
+ if ((sym_node->name_len == ut_strlen(col_name))
+ && (0 == ut_memcmp(sym_node->name, col_name,
+ sym_node->name_len))) {
+ /* Found */
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_COLUMN;
+ sym_node->table = table;
+ sym_node->col_no = i;
+ sym_node->prefetch_buf = NULL;
+
+ dict_col_copy_type(
+ col,
+ dfield_get_type(&sym_node
+ ->common.val));
+
+ return;
+ }
+ }
+
+ t_node = que_node_get_next(t_node);
+ }
+}
+
+/*********************************************************************//**
+Resolves the meaning of columns in an expression list. */
+static
+void
+pars_resolve_exp_list_columns(
+/*==========================*/
+ sym_node_t* table_node, /*!< in: first node in a table list */
+ que_node_t* exp_node) /*!< in: expression list first node, or
+ NULL */
+{
+ while (exp_node) {
+ pars_resolve_exp_columns(table_node, exp_node);
+
+ exp_node = que_node_get_next(exp_node);
+ }
+}
+
+/*********************************************************************//**
+Retrieves the table definition for a table name id. */
+static
+void
+pars_retrieve_table_def(
+/*====================*/
+ sym_node_t* sym_node) /*!< in: table node */
+{
+ const char* table_name;
+
+ ut_a(sym_node);
+ ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_TABLE;
+
+ table_name = (const char*) sym_node->name;
+
+ sym_node->table = dict_table_get_low(table_name);
+
+ ut_a(sym_node->table);
+}
+
+/*********************************************************************//**
+Retrieves the table definitions for a list of table name ids.
+@return number of tables */
+static
+ulint
+pars_retrieve_table_list_defs(
+/*==========================*/
+ sym_node_t* sym_node) /*!< in: first table node in list */
+{
+ ulint count = 0;
+
+ if (sym_node == NULL) {
+
+ return(count);
+ }
+
+ while (sym_node) {
+ pars_retrieve_table_def(sym_node);
+
+ count++;
+
+ sym_node = que_node_get_next(sym_node);
+ }
+
+ return(count);
+}
+
+/*********************************************************************//**
+Adds all columns to the select list if the query is SELECT * FROM ... */
+static
+void
+pars_select_all_columns(
+/*====================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the table list */
+{
+ sym_node_t* col_node;
+ sym_node_t* table_node;
+ dict_table_t* table;
+ ulint i;
+
+ select_node->select_list = NULL;
+
+ table_node = select_node->table_list;
+
+ while (table_node) {
+ table = table_node->table;
+
+ for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+ const char* col_name = dict_table_get_col_name(
+ table, i);
+
+ col_node = sym_tab_add_id(pars_sym_tab_global,
+ (byte*)col_name,
+ ut_strlen(col_name));
+
+ select_node->select_list = que_node_list_add_last(
+ select_node->select_list, col_node);
+ }
+
+ table_node = que_node_get_next(table_node);
+ }
+}
+
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+ que_node_t* select_list, /*!< in: select list */
+ sym_node_t* into_list) /*!< in: variables list or NULL */
+{
+ sel_node_t* node;
+
+ node = sel_node_create(pars_sym_tab_global->heap);
+
+ node->select_list = select_list;
+ node->into_list = into_list;
+
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Checks if the query is an aggregate query, in which case the select list must
+contain only aggregate function items. */
+static
+void
+pars_check_aggregate(
+/*=================*/
+ sel_node_t* select_node) /*!< in: select node already containing
+ the select list */
+{
+ que_node_t* exp_node;
+ func_node_t* func_node;
+ ulint n_nodes = 0;
+ ulint n_aggregate_nodes = 0;
+
+ exp_node = select_node->select_list;
+
+ while (exp_node) {
+
+ n_nodes++;
+
+ if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+
+ func_node = exp_node;
+
+ if (func_node->class == PARS_FUNC_AGGREGATE) {
+
+ n_aggregate_nodes++;
+ }
+ }
+
+ exp_node = que_node_get_next(exp_node);
+ }
+
+ if (n_aggregate_nodes > 0) {
+ ut_a(n_nodes == n_aggregate_nodes);
+
+ select_node->is_aggregate = TRUE;
+ } else {
+ select_node->is_aggregate = FALSE;
+ }
+}
+
+/*********************************************************************//**
+Parses a select statement.
+@return own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+ sel_node_t* select_node, /*!< in: select node already containing
+ the select list */
+ sym_node_t* table_list, /*!< in: table list */
+ que_node_t* search_cond, /*!< in: search condition or NULL */
+ pars_res_word_t* for_update, /*!< in: NULL or &pars_update_token */
+ pars_res_word_t* lock_shared, /*!< in: NULL or &pars_share_token */
+ order_node_t* order_by) /*!< in: NULL or an order-by node */
+{
+ select_node->state = SEL_NODE_OPEN;
+
+ select_node->table_list = table_list;
+ select_node->n_tables = pars_retrieve_table_list_defs(table_list);
+
+ if (select_node->select_list == &pars_star_denoter) {
+
+ /* SELECT * FROM ... */
+ pars_select_all_columns(select_node);
+ }
+
+ if (select_node->into_list) {
+ ut_a(que_node_list_get_len(select_node->into_list)
+ == que_node_list_get_len(select_node->select_list));
+ }
+
+ UT_LIST_INIT(select_node->copy_variables);
+
+ pars_resolve_exp_list_columns(table_list, select_node->select_list);
+ pars_resolve_exp_list_variables_and_types(select_node,
+ select_node->select_list);
+ pars_check_aggregate(select_node);
+
+ select_node->search_cond = search_cond;
+
+ if (search_cond) {
+ pars_resolve_exp_columns(table_list, search_cond);
+ pars_resolve_exp_variables_and_types(select_node, search_cond);
+ }
+
+ if (for_update) {
+ ut_a(!lock_shared);
+
+ select_node->set_x_locks = TRUE;
+ select_node->row_lock_mode = LOCK_X;
+
+ select_node->consistent_read = FALSE;
+ select_node->read_view = NULL;
+ } else if (lock_shared){
+ select_node->set_x_locks = FALSE;
+ select_node->row_lock_mode = LOCK_S;
+
+ select_node->consistent_read = FALSE;
+ select_node->read_view = NULL;
+ } else {
+ select_node->set_x_locks = FALSE;
+ select_node->row_lock_mode = LOCK_S;
+
+ select_node->consistent_read = TRUE;
+ }
+
+ select_node->order_by = order_by;
+
+ if (order_by) {
+ pars_resolve_exp_columns(table_list, order_by->column);
+ }
+
+ /* The final values of the following fields depend on the environment
+ where the select statement appears: */
+
+ select_node->can_get_updated = FALSE;
+ select_node->explicit_cursor = NULL;
+
+ opt_search_plan(select_node);
+
+ return(select_node);
+}
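+
+/* Summary of the locking decision made above, with illustrative
+statements: SELECT ... FOR UPDATE sets row x-locks (LOCK_X, no consistent
+read), SELECT ... LOCK IN SHARE MODE sets row s-locks (LOCK_S, no
+consistent read), and a plain SELECT performs a consistent read without
+setting row locks. */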
+
+/*********************************************************************//**
+Parses a cursor declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+ sym_node_t* sym_node, /*!< in: cursor id node in the symbol
+ table */
+ sel_node_t* select_node) /*!< in: select node */
+{
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_CURSOR;
+ sym_node->cursor_def = select_node;
+
+ select_node->state = SEL_NODE_CLOSED;
+ select_node->explicit_cursor = sym_node;
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a function declaration.
+@return sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+ sym_node_t* sym_node) /*!< in: function id node in the symbol
+ table */
+{
+ sym_node->resolved = TRUE;
+ sym_node->token_type = SYM_FUNCTION;
+
+ /* Check that the function exists. */
+ ut_a(pars_info_get_user_func(pars_sym_tab_global->info,
+ sym_node->name));
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+ ibool is_delete, /*!< in: TRUE if delete */
+ sym_node_t* table_sym, /*!< in: table name node */
+ col_assign_node_t* col_assign_list)/*!< in: column assignment list, NULL
+ if delete */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(pars_sym_tab_global->heap);
+
+ node->is_delete = is_delete;
+
+ node->table_sym = table_sym;
+ node->col_assign_list = col_assign_list;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+ sym_node_t* column, /*!< in: column to assign */
+ que_node_t* exp) /*!< in: value to assign */
+{
+ col_assign_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap,
+ sizeof(col_assign_node_t));
+ node->common.type = QUE_NODE_COL_ASSIGNMENT;
+
+ node->col = column;
+ node->val = exp;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Processes an update node assignment list. */
+static
+void
+pars_process_assign_list(
+/*=====================*/
+ upd_node_t* node) /*!< in: update node */
+{
+ col_assign_node_t* col_assign_list;
+ sym_node_t* table_sym;
+ col_assign_node_t* assign_node;
+ upd_field_t* upd_field;
+ dict_index_t* clust_index;
+ sym_node_t* col_sym;
+ ulint changes_ord_field;
+ ulint changes_field_size;
+ ulint n_assigns;
+ ulint i;
+
+ table_sym = node->table_sym;
+ col_assign_list = node->col_assign_list;
+ clust_index = dict_table_get_first_index(node->table);
+
+ assign_node = col_assign_list;
+ n_assigns = 0;
+
+ while (assign_node) {
+ pars_resolve_exp_columns(table_sym, assign_node->col);
+ pars_resolve_exp_columns(table_sym, assign_node->val);
+ pars_resolve_exp_variables_and_types(NULL, assign_node->val);
+#if 0
+ ut_a(dtype_get_mtype(
+ dfield_get_type(que_node_get_val(
+ assign_node->col)))
+ == dtype_get_mtype(
+ dfield_get_type(que_node_get_val(
+ assign_node->val))));
+#endif
+
+ /* Add to the update node all the columns found in assignment
+ values as columns to copy: therefore, TRUE */
+
+ opt_find_all_cols(TRUE, clust_index, &(node->columns), NULL,
+ assign_node->val);
+ n_assigns++;
+
+ assign_node = que_node_get_next(assign_node);
+ }
+
+ node->update = upd_create(n_assigns, pars_sym_tab_global->heap);
+
+ assign_node = col_assign_list;
+
+ changes_field_size = UPD_NODE_NO_SIZE_CHANGE;
+
+ for (i = 0; i < n_assigns; i++) {
+ upd_field = upd_get_nth_field(node->update, i);
+
+ col_sym = assign_node->col;
+
+ upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos(
+ clust_index, col_sym->col_no),
+ clust_index, NULL);
+ upd_field->exp = assign_node->val;
+
+ if (!dict_col_get_fixed_size(
+ dict_index_get_nth_col(clust_index,
+ upd_field->field_no),
+ dict_table_is_comp(node->table))) {
+ changes_field_size = 0;
+ }
+
+ assign_node = que_node_get_next(assign_node);
+ }
+
+ /* Find out if the update can modify an ordering field in any index */
+
+ changes_ord_field = UPD_NODE_NO_ORD_CHANGE;
+
+ if (row_upd_changes_some_index_ord_field_binary(node->table,
+ node->update)) {
+ changes_ord_field = 0;
+ }
+
+ node->cmpl_info = changes_ord_field | changes_field_size;
+}
+
+/*********************************************************************//**
+Parses an update or delete statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+ upd_node_t* node, /*!< in: update node */
+ sym_node_t* cursor_sym, /*!< in: pointer to a cursor entry in
+ the symbol table or NULL */
+ que_node_t* search_cond) /*!< in: search condition or NULL */
+{
+ sym_node_t* table_sym;
+ sel_node_t* sel_node;
+ plan_t* plan;
+
+ table_sym = node->table_sym;
+
+ pars_retrieve_table_def(table_sym);
+ node->table = table_sym->table;
+
+ UT_LIST_INIT(node->columns);
+
+ /* Make the single table node into a list of table nodes of length 1 */
+
+ que_node_list_add_last(NULL, table_sym);
+
+ if (cursor_sym) {
+ pars_resolve_exp_variables_and_types(NULL, cursor_sym);
+
+ sel_node = cursor_sym->alias->cursor_def;
+
+ node->searched_update = FALSE;
+ } else {
+ sel_node = pars_select_list(NULL, NULL);
+
+ pars_select_statement(sel_node, table_sym, search_cond, NULL,
+ &pars_share_token, NULL);
+ node->searched_update = TRUE;
+ sel_node->common.parent = node;
+ }
+
+ node->select = sel_node;
+
+ ut_a(!node->is_delete || (node->col_assign_list == NULL));
+ ut_a(node->is_delete || (node->col_assign_list != NULL));
+
+ if (node->is_delete) {
+ node->cmpl_info = 0;
+ } else {
+ pars_process_assign_list(node);
+ }
+
+ if (node->searched_update) {
+ node->has_clust_rec_x_lock = TRUE;
+ sel_node->set_x_locks = TRUE;
+ sel_node->row_lock_mode = LOCK_X;
+ } else {
+ node->has_clust_rec_x_lock = sel_node->set_x_locks;
+ }
+
+ ut_a(sel_node->n_tables == 1);
+ ut_a(sel_node->consistent_read == FALSE);
+ ut_a(sel_node->order_by == NULL);
+ ut_a(sel_node->is_aggregate == FALSE);
+
+ sel_node->can_get_updated = TRUE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ plan = sel_node_get_nth_plan(sel_node, 0);
+
+ plan->no_prefetch = TRUE;
+
+ if (!dict_index_is_clust(plan->index)) {
+
+ plan->must_get_clust = TRUE;
+
+ node->pcur = &(plan->clust_pcur);
+ } else {
+ node->pcur = &(plan->pcur);
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an insert statement.
+@return own: update node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+ sym_node_t* table_sym, /*!< in: table name node */
+ que_node_t* values_list, /*!< in: value expression list or NULL */
+ sel_node_t* select) /*!< in: select condition or NULL */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ ulint ins_type;
+
+ ut_a(values_list || select);
+ ut_a(!values_list || !select);
+
+ if (values_list) {
+ ins_type = INS_VALUES;
+ } else {
+ ins_type = INS_SEARCHED;
+ }
+
+ pars_retrieve_table_def(table_sym);
+
+ node = ins_node_create(ins_type, table_sym->table,
+ pars_sym_tab_global->heap);
+
+ row = dtuple_create(pars_sym_tab_global->heap,
+ dict_table_get_n_cols(node->table));
+
+ dict_table_copy_types(row, table_sym->table);
+
+ ins_node_set_new_row(node, row);
+
+ node->select = select;
+
+ if (select) {
+ select->common.parent = node;
+
+ ut_a(que_node_list_get_len(select->select_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ node->values_list = values_list;
+
+ if (node->values_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, values_list);
+
+ ut_a(que_node_list_get_len(values_list)
+ == dict_table_get_n_user_cols(table_sym->table));
+ }
+
+ return(node);
+}
+
+/*********************************************************************//**
+Set the type of a dfield. */
+static
+void
+pars_set_dfield_type(
+/*=================*/
+ dfield_t* dfield, /*!< in: dfield */
+ pars_res_word_t* type, /*!< in: pointer to a type
+ token */
+ ulint len, /*!< in: length, or 0 */
+ ibool is_unsigned, /*!< in: if TRUE, column is
+ UNSIGNED. */
+ ibool is_not_null) /*!< in: if TRUE, column is
+ NOT NULL. */
+{
+ ulint flags = 0;
+
+ if (is_not_null) {
+ flags |= DATA_NOT_NULL;
+ }
+
+ if (is_unsigned) {
+ flags |= DATA_UNSIGNED;
+ }
+
+ if (type == &pars_int_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4);
+
+ } else if (type == &pars_char_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_VARCHAR,
+ DATA_ENGLISH | flags, 0);
+ } else if (type == &pars_binary_token) {
+ ut_a(len != 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_FIXBINARY,
+ DATA_BINARY_TYPE | flags, len);
+ } else if (type == &pars_blob_token) {
+ ut_a(len == 0);
+
+ dtype_set(dfield_get_type(dfield), DATA_BLOB,
+ DATA_BINARY_TYPE | flags, 0);
+ } else {
+ ut_error;
+ }
+}
+
+/*********************************************************************//**
+Parses a variable declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the variable */
+ pars_res_word_t* type) /*!< in: pointer to a type token */
+{
+ node->resolved = TRUE;
+ node->token_type = SYM_VAR;
+
+ node->param_type = PARS_NOT_PARAM;
+
+ pars_set_dfield_type(que_node_get_val(node), type, 0, FALSE, FALSE);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure parameter declaration.
+@return own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+ sym_node_t* node, /*!< in: symbol table node allocated for the
+ id of the parameter */
+ ulint param_type,
+ /*!< in: PARS_INPUT or PARS_OUTPUT */
+ pars_res_word_t* type) /*!< in: pointer to a type token */
+{
+ ut_a((param_type == PARS_INPUT) || (param_type == PARS_OUTPUT));
+
+ pars_variable_declaration(node, type);
+
+ node->param_type = param_type;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Sets the parent field in a query node list. */
+static
+void
+pars_set_parent_in_list(
+/*====================*/
+ que_node_t* node_list, /*!< in: first node in a list */
+ que_node_t* parent) /*!< in: parent value to set in all
+ nodes of the list */
+{
+ que_common_t* common;
+
+ common = node_list;
+
+ while (common) {
+ common->parent = parent;
+
+ common = que_node_get_next(common);
+ }
+}
+
+/*********************************************************************//**
+Parses an elsif element.
+@return elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ elsif_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(elsif_node_t));
+
+ node->common.type = QUE_NODE_ELSIF;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an if-statement.
+@return if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+ que_node_t* cond, /*!< in: if-condition */
+ que_node_t* stat_list, /*!< in: statement list */
+ que_node_t* else_part) /*!< in: else-part statement list
+ or elsif element list */
+{
+ if_node_t* node;
+ elsif_node_t* elsif_node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(if_node_t));
+
+ node->common.type = QUE_NODE_IF;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ if (else_part && (que_node_get_type(else_part) == QUE_NODE_ELSIF)) {
+
+ /* There is a list of elsif conditions */
+
+ node->else_part = NULL;
+ node->elsif_list = else_part;
+
+ elsif_node = else_part;
+
+ while (elsif_node) {
+ pars_set_parent_in_list(elsif_node->stat_list, node);
+
+ elsif_node = que_node_get_next(elsif_node);
+ }
+ } else {
+ node->else_part = else_part;
+ node->elsif_list = NULL;
+
+ pars_set_parent_in_list(else_part, node);
+ }
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a while-statement.
+@return while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+ que_node_t* cond, /*!< in: while-condition */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ while_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(while_node_t));
+
+ node->common.type = QUE_NODE_WHILE;
+
+ node->cond = cond;
+
+ pars_resolve_exp_variables_and_types(NULL, cond);
+
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+ sym_node_t* loop_var, /*!< in: loop variable */
+ que_node_t* loop_start_limit,/*!< in: loop start expression */
+ que_node_t* loop_end_limit, /*!< in: loop end expression */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ for_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t));
+
+ node->common.type = QUE_NODE_FOR;
+
+ pars_resolve_exp_variables_and_types(NULL, loop_var);
+ pars_resolve_exp_variables_and_types(NULL, loop_start_limit);
+ pars_resolve_exp_variables_and_types(NULL, loop_end_limit);
+
+ node->loop_var = loop_var->indirection;
+
+ ut_a(loop_var->indirection);
+
+ node->loop_start_limit = loop_start_limit;
+ node->loop_end_limit = loop_end_limit;
+
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ return(node);
+}
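+
+/* Illustrative note, not part of the original file: the statement parsers
+above back the control-flow constructs of InnoDB's internal SQL dialect.
+A minimal sketch of the forms they accept (the variable names are made up,
+and the authoritative grammar is the one defined in pars0grm.y):
+
+	IF x > 0 THEN
+		y := 1;
+	ELSIF x = 0 THEN
+		y := 0;
+	ELSE
+		y := 2;
+	END IF;
+
+	WHILE i < 10 LOOP
+		i := i + 1;
+	END LOOP;
+
+	FOR i IN 1 .. 10 LOOP
+		s := s + i;
+	END LOOP;
+
+Each construct maps to the corresponding *_node_t built by the functions
+above. */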
+
+/*********************************************************************//**
+Parses an exit statement.
+@return exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void)
+/*=====================*/
+{
+ exit_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t));
+ node->common.type = QUE_NODE_EXIT;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a return-statement.
+@return return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void)
+/*=======================*/
+{
+ return_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap,
+ sizeof(return_node_t));
+ node->common.type = QUE_NODE_RETURN;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses an assignment statement.
+@return assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+ sym_node_t* var, /*!< in: variable to assign */
+ que_node_t* val) /*!< in: value to assign */
+{
+ assign_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap,
+ sizeof(assign_node_t));
+ node->common.type = QUE_NODE_ASSIGNMENT;
+
+ node->var = var;
+ node->val = val;
+
+ pars_resolve_exp_variables_and_types(NULL, var);
+ pars_resolve_exp_variables_and_types(NULL, val);
+
+ ut_a(dtype_get_mtype(dfield_get_type(que_node_get_val(var)))
+ == dtype_get_mtype(dfield_get_type(que_node_get_val(val))));
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure call.
+@return function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+ que_node_t* res_word,/*!< in: procedure name reserved word */
+ que_node_t* args) /*!< in: argument list */
+{
+ func_node_t* node;
+
+ node = pars_func(res_word, args);
+
+ pars_resolve_exp_list_variables_and_types(NULL, args);
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+ sym_node_t* cursor, /*!< in: cursor node */
+ sym_node_t* into_list, /*!< in: variables to set, or NULL */
+ sym_node_t* user_func) /*!< in: user function name, or NULL */
+{
+ sym_node_t* cursor_decl;
+ fetch_node_t* node;
+
+ /* Logical XOR. */
+ ut_a(!into_list != !user_func);
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(fetch_node_t));
+
+ node->common.type = QUE_NODE_FETCH;
+
+ pars_resolve_exp_variables_and_types(NULL, cursor);
+
+ if (into_list) {
+ pars_resolve_exp_list_variables_and_types(NULL, into_list);
+ node->into_list = into_list;
+ node->func = NULL;
+ } else {
+ pars_resolve_exp_variables_and_types(NULL, user_func);
+
+ node->func = pars_info_get_user_func(pars_sym_tab_global->info,
+ user_func->name);
+ ut_a(node->func);
+
+ node->into_list = NULL;
+ }
+
+ cursor_decl = cursor->alias;
+
+ ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+ node->cursor_def = cursor_decl->cursor_def;
+
+ if (into_list) {
+ ut_a(que_node_list_get_len(into_list)
+ == que_node_list_get_len(node->cursor_def->select_list));
+ }
+
+ return(node);
+}
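+
+/* Illustrative note, not part of the original file: in the internal SQL
+dialect a cursor is declared, opened and fetched from roughly as follows
+(names are hypothetical; see pars0grm.y for the exact grammar):
+
+	DECLARE CURSOR c IS SELECT a FROM t WHERE a > 0;
+	...
+	OPEN c;
+	FETCH c INTO my_var;
+	CLOSE c;
+
+The alternative FETCH form hands each fetched row to a user callback that
+was registered with pars_info_add_function(), instead of an INTO list. */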
+
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return open or close cursor statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+ ulint type, /*!< in: ROW_SEL_OPEN_CURSOR
+ or ROW_SEL_CLOSE_CURSOR */
+ sym_node_t* cursor) /*!< in: cursor node */
+{
+ sym_node_t* cursor_decl;
+ open_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(open_node_t));
+
+ node->common.type = QUE_NODE_OPEN;
+
+ pars_resolve_exp_variables_and_types(NULL, cursor);
+
+ cursor_decl = cursor->alias;
+
+ ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+ node->op_type = type;
+ node->cursor_def = cursor_decl->cursor_def;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+ sel_node_t* sel_node) /*!< in: select node */
+{
+ row_printf_node_t* node;
+
+ node = mem_heap_alloc(pars_sym_tab_global->heap,
+ sizeof(row_printf_node_t));
+ node->common.type = QUE_NODE_ROW_PRINTF;
+
+ node->sel_node = sel_node;
+
+ sel_node->common.parent = node;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a commit statement.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void)
+/*=======================*/
+{
+ return(commit_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a rollback statement.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void)
+/*=========================*/
+{
+ return(roll_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+ sym_node_t* sym_node, /*!< in: column node in the
+ symbol table */
+ pars_res_word_t* type, /*!< in: data type */
+ sym_node_t* len, /*!< in: length of column, or
+ NULL */
+ void* is_unsigned, /*!< in: if not NULL, column
+ is of type UNSIGNED. */
+ void* is_not_null) /*!< in: if not NULL, column
+ is of type NOT NULL. */
+{
+ ulint len2;
+
+ if (len) {
+ len2 = eval_node_get_int_val(len);
+ } else {
+ len2 = 0;
+ }
+
+ pars_set_dfield_type(que_node_get_val(sym_node), type, len2,
+ is_unsigned != NULL, is_not_null != NULL);
+
+ return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a table creation operation.
+@return table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_defs, /*!< in: list of column names */
+ void* not_fit_in_memory __attribute__((unused)))
+					/*!< in: a non-NULL pointer means that
+					the table should be simulated as not
+					fitting in memory; the thread is put
+					to sleep to simulate disk accesses;
+					NOTE that this flag is not stored in
+					the data dictionary on disk, and the
+					database will forget about a non-NULL
+					value if it has to reload the table
+					definition from disk */
+{
+ dict_table_t* table;
+ sym_node_t* column;
+ tab_node_t* node;
+ const dtype_t* dtype;
+ ulint n_cols;
+
+ n_cols = que_node_list_get_len(column_defs);
+
+ /* As the InnoDB SQL parser is for internal use only,
+ for creating some system tables, this function will only
+ create tables in the old (not compact) record format. */
+ table = dict_mem_table_create(table_sym->name, 0, n_cols, 0);
+
+#ifdef UNIV_DEBUG
+ if (not_fit_in_memory != NULL) {
+ table->does_not_fit_in_memory = TRUE;
+ }
+#endif /* UNIV_DEBUG */
+ column = column_defs;
+
+ while (column) {
+ dtype = dfield_get_type(que_node_get_val(column));
+
+ dict_mem_table_add_col(table, table->heap,
+ column->name, dtype->mtype,
+ dtype->prtype, dtype->len);
+ column->resolved = TRUE;
+ column->token_type = SYM_COLUMN;
+
+ column = que_node_get_next(column);
+ }
+
+ node = tab_create_graph_create(table, pars_sym_tab_global->heap);
+
+ table_sym->resolved = TRUE;
+ table_sym->token_type = SYM_TABLE;
+
+ return(node);
+}
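+
+/* Illustrative note, not part of the original file: this parser only serves
+internal system-table creation, so the accepted column types are exactly the
+ones handled by pars_set_dfield_type(). A hypothetical statement of the
+accepted shape:
+
+	CREATE TABLE SYS_EXAMPLE (ID CHAR, N_USES INT UNSIGNED NOT NULL,
+				  KEY_HASH BINARY(8), DOC BLOB);
+
+All tables created this way use the old, non-compact record format, as
+noted in the function above. */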
+
+/*********************************************************************//**
+Parses an index creation operation.
+@return index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+ pars_res_word_t* unique_def, /*!< in: not NULL if a unique index */
+ pars_res_word_t* clustered_def, /*!< in: not NULL if a clustered index */
+ sym_node_t* index_sym, /*!< in: index name node in the symbol
+ table */
+ sym_node_t* table_sym, /*!< in: table name node in the symbol
+ table */
+ sym_node_t* column_list) /*!< in: list of column names */
+{
+ dict_index_t* index;
+ sym_node_t* column;
+ ind_node_t* node;
+ ulint n_fields;
+ ulint ind_type;
+
+ n_fields = que_node_list_get_len(column_list);
+
+ ind_type = 0;
+
+ if (unique_def) {
+ ind_type = ind_type | DICT_UNIQUE;
+ }
+
+ if (clustered_def) {
+ ind_type = ind_type | DICT_CLUSTERED;
+ }
+
+ index = dict_mem_index_create(table_sym->name, index_sym->name, 0,
+ ind_type, n_fields);
+ column = column_list;
+
+ while (column) {
+ dict_mem_index_add_field(index, column->name, 0);
+
+ column->resolved = TRUE;
+ column->token_type = SYM_COLUMN;
+
+ column = que_node_get_next(column);
+ }
+
+ node = ind_create_graph_create(index, pars_sym_tab_global->heap);
+
+ table_sym->resolved = TRUE;
+ table_sym->token_type = SYM_TABLE;
+
+ index_sym->resolved = TRUE;
+ index_sym->token_type = SYM_TABLE;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure definition.
+@return query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+ sym_node_t* sym_node, /*!< in: procedure id node in the symbol
+ table */
+ sym_node_t* param_list, /*!< in: parameter declaration list */
+ que_node_t* stat_list) /*!< in: statement list */
+{
+ proc_node_t* node;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ mem_heap_t* heap;
+
+ heap = pars_sym_tab_global->heap;
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap);
+ fork->trx = NULL;
+
+ thr = que_thr_create(fork, heap);
+
+ node = mem_heap_alloc(heap, sizeof(proc_node_t));
+
+ node->common.type = QUE_NODE_PROC;
+ node->common.parent = thr;
+
+ sym_node->token_type = SYM_PROCEDURE_NAME;
+ sym_node->resolved = TRUE;
+
+ node->proc_id = sym_node;
+ node->param_list = param_list;
+ node->stat_list = stat_list;
+
+ pars_set_parent_in_list(stat_list, node);
+
+ node->sym_tab = pars_sym_tab_global;
+
+ thr->child = node;
+
+ pars_sym_tab_global->query_graph = fork;
+
+ return(fork);
+}
+
+/*************************************************************//**
+Parses a stored procedure call when it is not within another stored
+procedure, that is, when the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return query graph */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+ sym_node_t* sym_node __attribute__((unused)))
+ /*!< in: stored procedure name */
+{
+ ut_error;
+ return(NULL);
+}
+
+/*************************************************************//**
+Retrieves characters for the lexical analyzer. */
+UNIV_INTERN
+void
+pars_get_lex_chars(
+/*===============*/
+	char*	buf,		/*!< in/out: buffer to copy to */
+ int* result, /*!< out: number of characters copied or EOF */
+ int max_size) /*!< in: maximum number of characters which fit
+ in the buffer */
+{
+ int len;
+
+ len = pars_sym_tab_global->string_len
+ - pars_sym_tab_global->next_char_pos;
+ if (len == 0) {
+#ifdef YYDEBUG
+ /* fputs("SQL string ends\n", stderr); */
+#endif
+ *result = 0;
+
+ return;
+ }
+
+ if (len > max_size) {
+ len = max_size;
+ }
+
+#ifdef UNIV_SQL_DEBUG
+ if (pars_print_lexed) {
+
+ if (len >= 5) {
+ len = 5;
+ }
+
+ fwrite(pars_sym_tab_global->sql_string
+ + pars_sym_tab_global->next_char_pos,
+ 1, len, stderr);
+ }
+#endif /* UNIV_SQL_DEBUG */
+
+ ut_memcpy(buf, pars_sym_tab_global->sql_string
+ + pars_sym_tab_global->next_char_pos, len);
+ *result = len;
+
+ pars_sym_tab_global->next_char_pos += len;
+}
+
+/*************************************************************//**
+Called by yyparse on error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+ const char* s __attribute__((unused)))
+ /*!< in: error message string */
+{
+ ut_ad(s);
+
+ fputs("PARSER ERROR: Syntax error in SQL string\n", stderr);
+
+ ut_error;
+}
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+ pars_info_t* info, /*!< in: extra information, or NULL */
+ const char* str) /*!< in: SQL string */
+{
+ sym_node_t* sym_node;
+ mem_heap_t* heap;
+ que_t* graph;
+
+ ut_ad(str);
+
+ heap = mem_heap_create(256);
+
+ /* Currently, the parser is not reentrant: */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+
+ pars_sym_tab_global = sym_tab_create(heap);
+
+ pars_sym_tab_global->string_len = strlen(str);
+ pars_sym_tab_global->sql_string = mem_heap_dup(
+ heap, str, pars_sym_tab_global->string_len + 1);
+ pars_sym_tab_global->next_char_pos = 0;
+ pars_sym_tab_global->info = info;
+
+ yyparse();
+
+ sym_node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+ while (sym_node) {
+ ut_a(sym_node->resolved);
+
+ sym_node = UT_LIST_GET_NEXT(sym_list, sym_node);
+ }
+
+ graph = pars_sym_tab_global->query_graph;
+
+ graph->sym_tab = pars_sym_tab_global;
+ graph->info = info;
+
+ /* fprintf(stderr, "SQL graph size %lu\n", mem_heap_get_size(heap)); */
+
+ return(graph);
+}
+
+/******************************************************************//**
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@return query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+ que_node_t* node, /*!< in: root node for an incomplete
+ query graph */
+ trx_t* trx, /*!< in: transaction handle */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ que_fork_t* fork;
+ que_thr_t* thr;
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ thr->child = node;
+
+ que_node_set_parent(node, thr);
+
+ trx->graph = NULL;
+
+ return(thr);
+}
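+
+/* Illustrative sketch, not part of the original file: how a caller such as
+que_eval_sql() typically combines pars_info_create(), pars_sql() and the
+query graph functions. The function name, the bound literal and the
+procedure string are hypothetical, and error handling is omitted; the code
+is kept under #if 0 so it does not affect compilation. */
+#if 0
+static
+void
+pars_sql_usage_example(
+/*===================*/
+	trx_t*		trx,		/*!< in: transaction handle */
+	const char*	procedure_string)/*!< in: "PROCEDURE P () IS ... END;" */
+{
+	pars_info_t*	info = pars_info_create();
+	que_t*		graph;
+	que_thr_t*	thr;
+
+	pars_info_add_str_literal(info, "table_name", "test/t1");
+
+	mutex_enter(&(dict_sys->mutex));	/* the parser is not reentrant */
+	graph = pars_sql(info, procedure_string);
+	mutex_exit(&(dict_sys->mutex));
+
+	graph->trx = trx;
+	trx->graph = NULL;
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+	thr = que_fork_start_command(graph);
+	que_run_threads(thr);
+
+	que_graph_free(graph);	/* by default also frees info (graph_owns_us) */
+}
+#endif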
+
+/****************************************************************//**
+Create parser info struct.
+@return own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void)
+/*==================*/
+{
+ pars_info_t* info;
+ mem_heap_t* heap;
+
+ heap = mem_heap_create(512);
+
+ info = mem_heap_alloc(heap, sizeof(*info));
+
+ info->heap = heap;
+ info->funcs = NULL;
+ info->bound_lits = NULL;
+ info->bound_ids = NULL;
+ info->graph_owns_us = TRUE;
+
+ return(info);
+}
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+ pars_info_t* info) /*!< in, own: info struct */
+{
+ mem_heap_free(info->heap);
+}
+
+/****************************************************************//**
+Add bound literal. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const void* address, /*!< in: address */
+ ulint length, /*!< in: length of data */
+ ulint type, /*!< in: type, e.g. DATA_FIXBINARY */
+ ulint prtype) /*!< in: precise type, e.g.
+ DATA_UNSIGNED */
+{
+ pars_bound_lit_t* pbl;
+
+ ut_ad(!pars_info_get_bound_lit(info, name));
+
+ pbl = mem_heap_alloc(info->heap, sizeof(*pbl));
+
+ pbl->name = name;
+ pbl->address = address;
+ pbl->length = length;
+ pbl->type = type;
+ pbl->prtype = prtype;
+
+ if (!info->bound_lits) {
+ info->bound_lits = ib_vector_create(info->heap, 8);
+ }
+
+ ib_vector_push(info->bound_lits, pbl);
+}
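+
+/* Illustrative note, not part of the original file: a literal bound under a
+given name is referenced from the SQL text with a ':' prefix. A hypothetical
+pairing (the parameter name is made up):
+
+	pars_info_add_str_literal(info, "table_name", "test/t1");
+
+	... SELECT ... WHERE NAME = :table_name; ...
+
+Note that only the pointer is stored here, so the address passed in must
+stay valid until the graph built by pars_sql() has been executed. */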
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* str) /*!< in: string */
+{
+ pars_info_add_literal(info, name, str, strlen(str),
+ DATA_VARCHAR, DATA_ENGLISH);
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ lint val) /*!< in: value */
+{
+ byte* buf = mem_heap_alloc(info->heap, 4);
+
+ mach_write_to_4(buf, val);
+ pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+}
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_dulint_literal(
+/*=========================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ dulint val) /*!< in: value */
+{
+ byte* buf = mem_heap_alloc(info->heap, 8);
+
+ mach_write_to_8(buf, val);
+
+ pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+}
+
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_add_function(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: function name */
+ pars_user_func_cb_t func, /*!< in: function address */
+ void* arg) /*!< in: user-supplied argument */
+{
+ pars_user_func_t* puf;
+
+ ut_ad(!pars_info_get_user_func(info, name));
+
+ puf = mem_heap_alloc(info->heap, sizeof(*puf));
+
+ puf->name = name;
+ puf->func = func;
+ puf->arg = arg;
+
+ if (!info->funcs) {
+ info->funcs = ib_vector_create(info->heap, 8);
+ }
+
+ ib_vector_push(info->funcs, puf);
+}
+
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name, /*!< in: name */
+ const char* id) /*!< in: id */
+{
+ pars_bound_id_t* bid;
+
+ ut_ad(!pars_info_get_bound_id(info, name));
+
+ bid = mem_heap_alloc(info->heap, sizeof(*bid));
+
+ bid->name = name;
+ bid->id = id;
+
+ if (!info->bound_ids) {
+ info->bound_ids = ib_vector_create(info->heap, 8);
+ }
+
+ ib_vector_push(info->bound_ids, bid);
+}
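+
+/* Illustrative note, not part of the original file: a bound id substitutes
+an identifier (for example a table name) into the SQL text, whereas a bound
+literal supplies a value. In the SQL string the bound id is referenced by
+name, here assumed to use the '$' prefix form handled by the lexer, e.g.
+
+	pars_info_add_id(info, "tbl", "test/t1");
+
+	... SELECT ... FROM $tbl ...
+
+and the lexer resolves it through sym_tab_add_bound_id(). */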
+
+/****************************************************************//**
+Get user function with the given name.
+@return user func, or NULL if not found */
+UNIV_INTERN
+pars_user_func_t*
+pars_info_get_user_func(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+	const char*		name)	/*!< in: function name to find */
+{
+ ulint i;
+ ib_vector_t* vec;
+
+ if (!info || !info->funcs) {
+ return(NULL);
+ }
+
+ vec = info->funcs;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_user_func_t* puf = ib_vector_get(vec, i);
+
+ if (strcmp(puf->name, name) == 0) {
+ return(puf);
+ }
+ }
+
+ return(NULL);
+}
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name) /*!< in: bound literal name to find */
+{
+ ulint i;
+ ib_vector_t* vec;
+
+ if (!info || !info->bound_lits) {
+ return(NULL);
+ }
+
+ vec = info->bound_lits;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_bound_lit_t* pbl = ib_vector_get(vec, i);
+
+ if (strcmp(pbl->name, name) == 0) {
+ return(pbl);
+ }
+ }
+
+ return(NULL);
+}
+
+/****************************************************************//**
+Get bound id with the given name.
+@return bound id, or NULL if not found */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+ pars_info_t* info, /*!< in: info struct */
+ const char* name) /*!< in: bound id name to find */
+{
+ ulint i;
+ ib_vector_t* vec;
+
+ if (!info || !info->bound_ids) {
+ return(NULL);
+ }
+
+ vec = info->bound_ids;
+
+ for (i = 0; i < ib_vector_size(vec); i++) {
+ pars_bound_id_t* bid = ib_vector_get(vec, i);
+
+ if (strcmp(bid->name, name) == 0) {
+ return(bid);
+ }
+ }
+
+ return(NULL);
+}
diff --git a/storage/xtradb/pars/pars0sym.c b/storage/xtradb/pars/pars0sym.c
new file mode 100644
index 00000000000..b56350116bb
--- /dev/null
+++ b/storage/xtradb/pars/pars0sym.c
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0sym.c
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0sym.h"
+
+#ifdef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#include "mem0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+#include "row0sel.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+ mem_heap_t* heap) /*!< in: memory heap where to create */
+{
+ sym_tab_t* sym_tab;
+
+ sym_tab = mem_heap_alloc(heap, sizeof(sym_tab_t));
+
+ UT_LIST_INIT(sym_tab->sym_list);
+ UT_LIST_INIT(sym_tab->func_node_list);
+
+ sym_tab->heap = heap;
+
+ return(sym_tab);
+}
+
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER the parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Also frees the SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+ sym_tab_t* sym_tab) /*!< in, own: symbol table */
+{
+ sym_node_t* sym;
+ func_node_t* func;
+
+ sym = UT_LIST_GET_FIRST(sym_tab->sym_list);
+
+ while (sym) {
+ eval_node_free_val_buf(sym);
+
+ if (sym->prefetch_buf) {
+ sel_col_prefetch_buf_free(sym->prefetch_buf);
+ }
+
+ if (sym->cursor_def) {
+ que_graph_free_recursive(sym->cursor_def);
+ }
+
+ sym = UT_LIST_GET_NEXT(sym_list, sym);
+ }
+
+ func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+
+ while (func) {
+ eval_node_free_val_buf(func);
+
+ func = UT_LIST_GET_NEXT(func_node_list, func);
+ }
+}
+
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ ulint val) /*!< in: integer value */
+{
+ sym_node_t* node;
+ byte* data;
+
+ node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dtype_set(dfield_get_type(&node->common.val), DATA_INT, 0, 4);
+
+ data = mem_heap_alloc(sym_tab->heap, 4);
+ mach_write_to_4(data, val);
+
+ dfield_set_data(&(node->common.val), data, 4);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* str, /*!< in: string with no quotes around
+ it */
+ ulint len) /*!< in: string length */
+{
+ sym_node_t* node;
+ byte* data;
+
+ node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dtype_set(dfield_get_type(&node->common.val),
+ DATA_VARCHAR, DATA_ENGLISH, 0);
+
+ if (len) {
+ data = mem_heap_alloc(sym_tab->heap, len);
+ ut_memcpy(data, str, len);
+ } else {
+ data = NULL;
+ }
+
+ dfield_set_data(&(node->common.val), data, len);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name, /*!< in: name of bound literal */
+ ulint* lit_type) /*!< out: type of literal (PARS_*_LIT) */
+{
+ sym_node_t* node;
+ pars_bound_lit_t* blit;
+ ulint len = 0;
+
+ blit = pars_info_get_bound_lit(sym_tab->info, name);
+ ut_a(blit);
+
+ node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ switch (blit->type) {
+ case DATA_FIXBINARY:
+ len = blit->length;
+ *lit_type = PARS_FIXBINARY_LIT;
+ break;
+
+ case DATA_BLOB:
+ *lit_type = PARS_BLOB_LIT;
+ break;
+
+ case DATA_VARCHAR:
+ *lit_type = PARS_STR_LIT;
+ break;
+
+ case DATA_CHAR:
+ ut_a(blit->length > 0);
+
+ len = blit->length;
+ *lit_type = PARS_STR_LIT;
+ break;
+
+ case DATA_INT:
+ ut_a(blit->length > 0);
+ ut_a(blit->length <= 8);
+
+ len = blit->length;
+ *lit_type = PARS_INT_LIT;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ dtype_set(dfield_get_type(&node->common.val),
+ blit->type, blit->prtype, len);
+
+ dfield_set_data(&(node->common.val), blit->address, blit->length);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+ sym_tab_t* sym_tab) /*!< in: symbol table */
+{
+ sym_node_t* node;
+
+ node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->resolved = TRUE;
+ node->token_type = SYM_LIT;
+
+ node->indirection = NULL;
+
+ dfield_get_type(&node->common.val)->mtype = DATA_ERROR;
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ byte* name, /*!< in: identifier name */
+ ulint len) /*!< in: identifier length */
+{
+ sym_node_t* node;
+
+ node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->resolved = FALSE;
+ node->indirection = NULL;
+
+ node->name = mem_heap_strdupl(sym_tab->heap, (char*) name, len);
+ node->name_len = len;
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+ sym_tab_t* sym_tab, /*!< in: symbol table */
+ const char* name) /*!< in: name of bound id */
+{
+ sym_node_t* node;
+ pars_bound_id_t* bid;
+
+ bid = pars_info_get_bound_id(sym_tab->info, name);
+ ut_a(bid);
+
+ node = mem_heap_alloc(sym_tab->heap, sizeof(sym_node_t));
+
+ node->common.type = QUE_NODE_SYMBOL;
+
+ node->resolved = FALSE;
+ node->indirection = NULL;
+
+ node->name = mem_heap_strdup(sym_tab->heap, bid->id);
+ node->name_len = strlen(node->name);
+
+ UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, node);
+
+ dfield_set_null(&node->common.val);
+
+ node->common.val_buf_size = 0;
+ node->prefetch_buf = NULL;
+ node->cursor_def = NULL;
+
+ node->sym_table = sym_tab;
+
+ return(node);
+}
diff --git a/storage/xtradb/plug.in b/storage/xtradb/plug.in
new file mode 100644
index 00000000000..37c895fb520
--- /dev/null
+++ b/storage/xtradb/plug.in
@@ -0,0 +1,228 @@
+#
+# Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+MYSQL_STORAGE_ENGINE(xtradb, xtradb, [XtraDB Storage Engine],
+ [XtraDB - a drop-in replacement for InnoDB], [max,max-no-ndb])
+MYSQL_PLUGIN_DIRECTORY(xtradb, [storage/xtradb])
+MYSQL_PLUGIN_STATIC(xtradb, [libxtradb.a])
+MYSQL_PLUGIN_DYNAMIC(xtradb, [ha_xtradb.la])
+MYSQL_PLUGIN_ACTIONS(xtradb, [
+ with_plugin_innobase=$with_plugin_xtradb # for legacy code in configure.in
+ AC_CHECK_LIB(rt, aio_read, [innodb_system_libs="-lrt"])
+ AC_SUBST(innodb_system_libs)
+ AC_CHECK_HEADERS(aio.h sched.h)
+ AC_CHECK_SIZEOF(int, 4)
+ AC_CHECK_SIZEOF(long, 4)
+ AC_CHECK_SIZEOF(void*, 4)
+ AC_CHECK_FUNCS(sched_yield fdatasync localtime_r)
+ AC_C_BIGENDIAN
+ case "$target_os" in
+ lin*)
+ CFLAGS="$CFLAGS -DUNIV_LINUX";;
+ hpux10*)
+ CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";;
+ hp*)
+ CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX";;
+ aix*)
+ CFLAGS="$CFLAGS -DUNIV_AIX";;
+ irix*|osf*|sysv5uw7*|openbsd*)
+ CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE";;
+ *solaris*|*SunOS*)
+ CFLAGS="$CFLAGS -DUNIV_SOLARIS";;
+ esac
+
+ INNODB_DYNAMIC_CFLAGS="-DMYSQL_DYNAMIC_PLUGIN"
+
+ case "$target_cpu" in
+ x86_64)
+ # The AMD64 ABI forbids absolute addresses in shared libraries
+ ;;
+ *86)
+ # Use absolute addresses on IA-32
+ INNODB_DYNAMIC_CFLAGS="$INNODB_DYNAMIC_CFLAGS -prefer-non-pic"
+ ;;
+ esac
+ AC_SUBST(INNODB_DYNAMIC_CFLAGS)
+
+ AC_MSG_CHECKING(whether GCC atomic builtins are available)
+ # either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
+ AC_TRY_RUN(
+ [
+ int main()
+ {
+ long x;
+ long y;
+ long res;
+ char c;
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x, y);
+ if (!res || x != y) {
+ return(1);
+ }
+
+ x = 10;
+ y = 123;
+ res = __sync_bool_compare_and_swap(&x, x + 1, y);
+ if (res || x != 10) {
+ return(1);
+ }
+
+ x = 10;
+ y = 123;
+ res = __sync_add_and_fetch(&x, y);
+ if (res != 123 + 10 || x != 123 + 10) {
+ return(1);
+ }
+
+ c = 10;
+ res = __sync_lock_test_and_set(&c, 123);
+ if (res != 10 || c != 123) {
+ return(1);
+ }
+
+ return(0);
+ }
+ ],
+ [
+ AC_DEFINE([HAVE_IB_GCC_ATOMIC_BUILTINS], [1],
+ [GCC atomic builtins are available])
+ AC_MSG_RESULT(yes)
+ ],
+ [
+ AC_MSG_RESULT(no)
+ ]
+ )
+
+ AC_MSG_CHECKING(whether pthread_t can be used by GCC atomic builtins)
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
+ AC_TRY_RUN(
+ [
+ #include <pthread.h>
+ #include <string.h>
+
+ int main(int argc, char** argv) {
+ pthread_t x1;
+ pthread_t x2;
+ pthread_t x3;
+
+ memset(&x1, 0x0, sizeof(x1));
+ memset(&x2, 0x0, sizeof(x2));
+ memset(&x3, 0x0, sizeof(x3));
+
+ __sync_bool_compare_and_swap(&x1, x2, x3);
+
+ return(0);
+ }
+ ],
+ [
+ AC_DEFINE([HAVE_IB_ATOMIC_PTHREAD_T_GCC], [1],
+ [pthread_t can be used by GCC atomic builtins])
+ AC_MSG_RESULT(yes)
+ ],
+ [
+ AC_MSG_RESULT(no)
+ ]
+ )
+
+ AC_MSG_CHECKING(whether Solaris libc atomic functions are available)
+ # either define HAVE_IB_SOLARIS_ATOMICS or not
+ AC_CHECK_FUNCS(atomic_add_long \
+ atomic_cas_32 \
+ atomic_cas_64 \
+ atomic_cas_ulong,
+
+ AC_DEFINE([HAVE_IB_SOLARIS_ATOMICS], [1],
+ [Define to 1 if Solaris libc atomic functions \
+ are available])
+ )
+
+ AC_MSG_CHECKING(whether pthread_t can be used by Solaris libc atomic functions)
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
+ AC_TRY_RUN(
+ [
+ #include <pthread.h>
+ #include <string.h>
+
+ int main(int argc, char** argv) {
+ pthread_t x1;
+ pthread_t x2;
+ pthread_t x3;
+
+ memset(&x1, 0x0, sizeof(x1));
+ memset(&x2, 0x0, sizeof(x2));
+ memset(&x3, 0x0, sizeof(x3));
+
+ if (sizeof(pthread_t) == 4) {
+
+ atomic_cas_32(&x1, x2, x3);
+
+ } else if (sizeof(pthread_t) == 8) {
+
+ atomic_cas_64(&x1, x2, x3);
+
+ } else {
+
+ return(1);
+ }
+
+ return(0);
+ }
+ ],
+ [
+ AC_DEFINE([HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS], [1],
+ [pthread_t can be used by solaris atomics])
+ AC_MSG_RESULT(yes)
+ ],
+ [
+ AC_MSG_RESULT(no)
+ ]
+ )
+
+ # this is needed to know which one of atomic_cas_32() or atomic_cas_64()
+ # to use in the source
+ AC_CHECK_SIZEOF([pthread_t], [], [#include <pthread.h>])
+
+ # Check for x86 PAUSE instruction
+ AC_MSG_CHECKING(for x86 PAUSE instruction)
+ # We have to actually try running the test program, because of a bug
+ # in Solaris on x86_64, where it wrongly reports that PAUSE is not
+ # supported when trying to run an application. See
+ # http://bugs.opensolaris.org/bugdatabase/printableBug.do?bug_id=6478684
+	# We use the ib_ prefix to avoid collisions if this code is added to
+	# MySQL's configure.in.
+ AC_TRY_RUN(
+ [
+ int main() {
+ __asm__ __volatile__ ("pause");
+ return(0);
+ }
+ ],
+ [
+ AC_DEFINE([HAVE_IB_PAUSE_INSTRUCTION], [1], [Does x86 PAUSE instruction exist])
+ AC_MSG_RESULT(yes)
+ ],
+ [
+ AC_MSG_RESULT(no)
+ ],
+ [
+ AC_MSG_RESULT(no)
+ ]
+ )
+ ])
+
+# vim: set ft=config:
diff --git a/storage/xtradb/que/que0que.c b/storage/xtradb/que/que0que.c
new file mode 100644
index 00000000000..5c85a04d139
--- /dev/null
+++ b/storage/xtradb/que/que0que.c
@@ -0,0 +1,1450 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file que/que0que.c
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+#ifdef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#include "srv0que.h"
+#include "usr0sess.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "row0undo.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0purge.h"
+#include "dict0crea.h"
+#include "log0log.h"
+#include "eval0proc.h"
+#include "eval0eval.h"
+#include "pars0types.h"
+
+#define QUE_PARALLELIZE_LIMIT (64 * 256 * 256 * 256)
+#define QUE_ROUND_ROBIN_LIMIT (64 * 256 * 256 * 256)
+#define QUE_MAX_LOOPS_WITHOUT_CHECK 16
+
+#ifdef UNIV_DEBUG
+/* If the following flag is set to TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+UNIV_INTERN ibool que_trace_on = FALSE;
+#endif /* UNIV_DEBUG */
+
+/* Short introduction to query graphs
+ ==================================
+
+A query graph consists of nodes linked to each other in various ways. The
+execution starts at que_run_threads() which takes a que_thr_t parameter.
+que_thr_t contains two fields that control query graph execution: run_node
+and prev_node. run_node is the next node to execute and prev_node is the
+last node executed.
+
+Each node has a pointer to a 'next' statement, i.e., its brother, and a
+pointer to its parent node. The next pointer is NULL in the last statement
+of a block.
+
+Loop nodes contain a link to the first statement of the enclosed statement
+list. While the loop runs, que_thr_step() checks whether execution arrived at
+the loop node from its parent or from one of the statement nodes in the loop.
+If it came from the parent of the loop node, it starts executing the first
+statement node in the loop. If it came from one of the statement nodes in
+the loop, then it checks if the statement node has another statement node
+following it, and runs it if so.
+
+To signify loop ending, the loop statements (see e.g. while_step()) set
+que_thr_t->run_node to the loop node's parent node. This is noticed on the
+next call of que_thr_step() and execution proceeds to the node pointed to by
+the loop node's 'next' pointer.
+
+For example, the code:
+
+X := 1;
+WHILE X < 5 LOOP
+ X := X + 1;
+ X := X + 1;
+END LOOP;
+X := 5;
+
+will result in the following node hierarchy, with the X-axis indicating
+'next' links and the Y-axis indicating parent/child links:
+
+A - W - A
+ |
+ |
+ A - A
+
+A = assign_node_t, W = while_node_t. */
+
+/* How is a stored procedure containing COMMIT or ROLLBACK commands
+executed?
+
+The commit or rollback can be seen as a subprocedure call.
+The problem is that if there are several query threads
+currently running within the transaction, their action could
+mess up the commit or rollback operation. Or, at the least, the
+operation would be difficult to visualize and keep under control.
+
+Therefore the query thread requesting a commit or a rollback
+sends to the transaction a signal, which moves the transaction
+to TRX_QUE_SIGNALED state. All running query threads of the
+transaction will eventually notice that the transaction is now in
+this state and voluntarily suspend themselves. Only the last
+query thread which suspends itself will trigger handling of
+the signal.
+
+When the transaction starts to handle a rollback or commit
+signal, it builds a query graph which, when executed, will
+roll back or commit the incomplete transaction. The transaction
+is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
+If specified, the SQL cursors opened by the transaction are closed.
+When the execution of the graph completes, it is like returning
+from a subprocedure: the query thread which requested the operation
+starts running again. */
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction.
+***NOTE***: This is the only function in which such a transition is allowed
+to happen! */
+static
+void
+que_thr_move_to_run_state(
+/*======================*/
+	que_thr_t*	thr);	/*!< in: a query thread */
+
+/***********************************************************************//**
+Adds a query graph to the session's list of graphs. */
+UNIV_INTERN
+void
+que_graph_publish(
+/*==============*/
+ que_t* graph, /*!< in: graph */
+ sess_t* sess) /*!< in: session */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ UT_LIST_ADD_LAST(graphs, sess->graphs, graph);
+}
+
+/***********************************************************************//**
+Creates a query graph fork node.
+@return own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+ que_t* graph, /*!< in: graph, if NULL then this
+ fork node is assumed to be the
+ graph root */
+ que_node_t* parent, /*!< in: parent node */
+ ulint fork_type, /*!< in: fork type */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ que_fork_t* fork;
+
+ ut_ad(heap);
+
+ fork = mem_heap_alloc(heap, sizeof(que_fork_t));
+
+ fork->common.type = QUE_NODE_FORK;
+ fork->n_active_thrs = 0;
+
+ fork->state = QUE_FORK_COMMAND_WAIT;
+
+ if (graph != NULL) {
+ fork->graph = graph;
+ } else {
+ fork->graph = fork;
+ }
+
+ fork->common.parent = parent;
+ fork->fork_type = fork_type;
+
+ fork->caller = NULL;
+
+ UT_LIST_INIT(fork->thrs);
+
+ fork->sym_tab = NULL;
+ fork->info = NULL;
+
+ fork->heap = heap;
+
+ return(fork);
+}
+
+/***********************************************************************//**
+Creates a query graph thread node.
+@return own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+ que_fork_t* parent, /*!< in: parent node, i.e., a fork node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ que_thr_t* thr;
+
+ ut_ad(parent && heap);
+
+ thr = mem_heap_alloc(heap, sizeof(que_thr_t));
+
+ thr->common.type = QUE_NODE_THR;
+ thr->common.parent = parent;
+
+ thr->magic_n = QUE_THR_MAGIC_N;
+
+ thr->graph = parent->graph;
+
+ thr->state = QUE_THR_COMMAND_WAIT;
+
+ thr->is_active = FALSE;
+
+ thr->run_node = NULL;
+ thr->resource = 0;
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+ UT_LIST_ADD_LAST(thrs, parent->thrs, thr);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and may release
+a single worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion. */
+UNIV_INTERN
+void
+que_thr_end_wait(
+/*=============*/
+ que_thr_t* thr, /*!< in: query thread in the
+ QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if NULL is passed
+ as the parameter, it is ignored */
+{
+ ibool was_active;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(thr);
+ ut_ad((thr->state == QUE_THR_LOCK_WAIT)
+ || (thr->state == QUE_THR_PROCEDURE_WAIT)
+ || (thr->state == QUE_THR_SIG_REPLY_WAIT));
+ ut_ad(thr->run_node);
+
+ thr->prev_node = thr->run_node;
+
+ was_active = thr->is_active;
+
+ que_thr_move_to_run_state(thr);
+
+ if (was_active) {
+
+ return;
+ }
+
+ if (next_thr && *next_thr == NULL) {
+ *next_thr = thr;
+ } else {
+ ut_a(0);
+ srv_que_task_enqueue_low(thr);
+ }
+}
+
+/**********************************************************************//**
+Same as que_thr_end_wait(), but without the next_thr parameter. */
+UNIV_INTERN
+void
+que_thr_end_wait_no_next_thr(
+/*=========================*/
+ que_thr_t* thr) /*!< in: query thread in the QUE_THR_LOCK_WAIT,
+ or QUE_THR_PROCEDURE_WAIT, or
+ QUE_THR_SIG_REPLY_WAIT state */
+{
+ ibool was_active;
+
+ ut_a(thr->state == QUE_THR_LOCK_WAIT); /* In MySQL this is the
+ only possible state here */
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(thr);
+ ut_ad((thr->state == QUE_THR_LOCK_WAIT)
+ || (thr->state == QUE_THR_PROCEDURE_WAIT)
+ || (thr->state == QUE_THR_SIG_REPLY_WAIT));
+
+ was_active = thr->is_active;
+
+ que_thr_move_to_run_state(thr);
+
+ if (was_active) {
+
+ return;
+ }
+
+	/* In MySQL we let the OS thread (not just the query thread) wait
+	for the lock to be released: */
+
+ srv_release_mysql_thread_if_suspended(thr);
+
+ /* srv_que_task_enqueue_low(thr); */
+}
+
+/**********************************************************************//**
+Initializes a query thread for a command. */
+UNIV_INLINE
+void
+que_thr_init_command(
+/*=================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ que_thr_move_to_run_state(thr);
+}
+
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+ que_fork_t* fork) /*!< in: a query fork */
+{
+ que_thr_t* thr;
+ que_thr_t* suspended_thr = NULL;
+ que_thr_t* completed_thr = NULL;
+
+ fork->state = QUE_FORK_ACTIVE;
+
+ fork->last_sel_node = NULL;
+
+ suspended_thr = NULL;
+ completed_thr = NULL;
+
+ /* Choose the query thread to run: usually there is just one thread,
+ but in a parallelized select, which necessarily is non-scrollable,
+ there may be several to choose from */
+
+ /* First we try to find a query thread in the QUE_THR_COMMAND_WAIT
+ state. Then we try to find a query thread in the QUE_THR_SUSPENDED
+ state, finally we try to find a query thread in the QUE_THR_COMPLETED
+ state */
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ /* We make a single pass over the thr list within which we note which
+ threads are ready to run. */
+ while (thr) {
+ switch (thr->state) {
+ case QUE_THR_COMMAND_WAIT:
+
+ /* We have to send the initial message to query thread
+ to start it */
+
+ que_thr_init_command(thr);
+
+ return(thr);
+
+ case QUE_THR_SUSPENDED:
+			/* In this case the execution of the thread was
+			suspended: no initial message is needed because
+			execution can continue from where it left off */
+ if (!suspended_thr) {
+ suspended_thr = thr;
+ }
+
+ break;
+
+ case QUE_THR_COMPLETED:
+ if (!completed_thr) {
+ completed_thr = thr;
+ }
+
+ break;
+
+ case QUE_THR_LOCK_WAIT:
+ ut_error;
+
+ }
+
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ if (suspended_thr) {
+
+ thr = suspended_thr;
+ que_thr_move_to_run_state(thr);
+
+ } else if (completed_thr) {
+
+ thr = completed_thr;
+ que_thr_init_command(thr);
+ }
+
+ return(thr);
+}
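+
+/* The selection order used above (QUE_THR_COMMAND_WAIT first, then
+QUE_THR_SUSPENDED, then QUE_THR_COMPLETED) is easy to miss in the switch, so
+here is a standalone, non-compiled sketch over a plain array of states; the
+enum values are stand-ins, not the real QUE_THR_* constants. */
+#if 0
+#include <stddef.h>
+
+enum thr_state { THR_COMMAND_WAIT, THR_SUSPENDED, THR_COMPLETED, THR_OTHER };
+
+/* Returns the index of the thread that would be picked, or -1 if none: a
+COMMAND_WAIT thread wins immediately, otherwise the first SUSPENDED one,
+otherwise the first COMPLETED one, mirroring que_fork_start_command(). */
+static int
+pick_thread(const enum thr_state* states, size_t n)
+{
+	int	suspended = -1;
+	int	completed = -1;
+	size_t	i;
+
+	for (i = 0; i < n; i++) {
+		switch (states[i]) {
+		case THR_COMMAND_WAIT:
+			return((int) i);
+		case THR_SUSPENDED:
+			if (suspended < 0) {
+				suspended = (int) i;
+			}
+			break;
+		case THR_COMPLETED:
+			if (completed < 0) {
+				completed = (int) i;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	return(suspended >= 0 ? suspended : completed);
+}
+#endif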
+
+/**********************************************************************//**
+After signal handling is finished, returns control to a query graph error
+handling routine. (Currently, just returns control to the root of the
+graph so that the graph can communicate an error message to the client.) */
+UNIV_INTERN
+void
+que_fork_error_handle(
+/*==================*/
+ trx_t* trx __attribute__((unused)), /*!< in: trx */
+ que_t* fork) /*!< in: query graph which was run before signal
+ handling started, NULL not allowed */
+{
+ que_thr_t* thr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->sess->state == SESS_ERROR);
+ ut_ad(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ while (thr != NULL) {
+ ut_ad(!thr->is_active);
+ ut_ad(thr->state != QUE_THR_SIG_REPLY_WAIT);
+ ut_ad(thr->state != QUE_THR_LOCK_WAIT);
+
+ thr->run_node = thr;
+ thr->prev_node = thr->child;
+ thr->state = QUE_THR_COMPLETED;
+
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ que_thr_move_to_run_state(thr);
+
+ ut_a(0);
+ srv_que_task_enqueue_low(thr);
+}
+
+/****************************************************************//**
+Tests if all the query threads in the same fork have a given state.
+@return TRUE if all the query threads in the same fork are in the
+given state */
+UNIV_INLINE
+ibool
+que_fork_all_thrs_in_state(
+/*=======================*/
+ que_fork_t* fork, /*!< in: query fork */
+ ulint state) /*!< in: state */
+{
+ que_thr_t* thr_node;
+
+ thr_node = UT_LIST_GET_FIRST(fork->thrs);
+
+ while (thr_node != NULL) {
+ if (thr_node->state != state) {
+
+ return(FALSE);
+ }
+
+ thr_node = UT_LIST_GET_NEXT(thrs, thr_node);
+ }
+
+ return(TRUE);
+}
+
+/**********************************************************************//**
+Calls que_graph_free_recursive for statements in a statement list. */
+static
+void
+que_graph_free_stat_list(
+/*=====================*/
+ que_node_t* node) /*!< in: first query graph node in the list */
+{
+ while (node) {
+ que_graph_free_recursive(node);
+
+ node = que_node_get_next(node);
+ }
+}
+
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations; they are freed in que_graph_free. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ que_fork_t* fork;
+ que_thr_t* thr;
+ undo_node_t* undo;
+ sel_node_t* sel;
+ ins_node_t* ins;
+ upd_node_t* upd;
+ tab_node_t* cre_tab;
+ ind_node_t* cre_ind;
+ purge_node_t* purge;
+
+ if (node == NULL) {
+
+ return;
+ }
+
+ switch (que_node_get_type(node)) {
+
+ case QUE_NODE_FORK:
+ fork = node;
+
+ thr = UT_LIST_GET_FIRST(fork->thrs);
+
+ while (thr) {
+ que_graph_free_recursive(thr);
+
+ thr = UT_LIST_GET_NEXT(thrs, thr);
+ }
+
+ break;
+ case QUE_NODE_THR:
+
+ thr = node;
+
+ if (thr->magic_n != QUE_THR_MAGIC_N) {
+ fprintf(stderr,
+ "que_thr struct appears corrupt;"
+ " magic n %lu\n",
+ (unsigned long) thr->magic_n);
+ mem_analyze_corruption(thr);
+ ut_error;
+ }
+
+ thr->magic_n = QUE_THR_MAGIC_FREED;
+
+ que_graph_free_recursive(thr->child);
+
+ break;
+ case QUE_NODE_UNDO:
+
+ undo = node;
+
+ mem_heap_free(undo->heap);
+
+ break;
+ case QUE_NODE_SELECT:
+
+ sel = node;
+
+ sel_node_free_private(sel);
+
+ break;
+ case QUE_NODE_INSERT:
+
+ ins = node;
+
+ que_graph_free_recursive(ins->select);
+
+ mem_heap_free(ins->entry_sys_heap);
+
+ break;
+ case QUE_NODE_PURGE:
+ purge = node;
+
+ mem_heap_free(purge->heap);
+
+ break;
+
+ case QUE_NODE_UPDATE:
+
+ upd = node;
+
+ if (upd->in_mysql_interface) {
+
+ btr_pcur_free_for_mysql(upd->pcur);
+ }
+
+ que_graph_free_recursive(upd->cascade_node);
+
+ if (upd->cascade_heap) {
+ mem_heap_free(upd->cascade_heap);
+ }
+
+ que_graph_free_recursive(upd->select);
+
+ mem_heap_free(upd->heap);
+
+ break;
+ case QUE_NODE_CREATE_TABLE:
+ cre_tab = node;
+
+ que_graph_free_recursive(cre_tab->tab_def);
+ que_graph_free_recursive(cre_tab->col_def);
+ que_graph_free_recursive(cre_tab->commit_node);
+
+ mem_heap_free(cre_tab->heap);
+
+ break;
+ case QUE_NODE_CREATE_INDEX:
+ cre_ind = node;
+
+ que_graph_free_recursive(cre_ind->ind_def);
+ que_graph_free_recursive(cre_ind->field_def);
+ if (srv_use_sys_stats_table)
+ que_graph_free_recursive(cre_ind->stats_def);
+ que_graph_free_recursive(cre_ind->commit_node);
+
+ mem_heap_free(cre_ind->heap);
+
+ break;
+ case QUE_NODE_INSERT_STATS:
+ cre_ind = node;
+
+ que_graph_free_recursive(cre_ind->stats_def);
+ que_graph_free_recursive(cre_ind->commit_node);
+
+ mem_heap_free(cre_ind->heap);
+ break;
+ case QUE_NODE_PROC:
+ que_graph_free_stat_list(((proc_node_t*)node)->stat_list);
+
+ break;
+ case QUE_NODE_IF:
+ que_graph_free_stat_list(((if_node_t*)node)->stat_list);
+ que_graph_free_stat_list(((if_node_t*)node)->else_part);
+ que_graph_free_stat_list(((if_node_t*)node)->elsif_list);
+
+ break;
+ case QUE_NODE_ELSIF:
+ que_graph_free_stat_list(((elsif_node_t*)node)->stat_list);
+
+ break;
+ case QUE_NODE_WHILE:
+ que_graph_free_stat_list(((while_node_t*)node)->stat_list);
+
+ break;
+ case QUE_NODE_FOR:
+ que_graph_free_stat_list(((for_node_t*)node)->stat_list);
+
+ break;
+
+ case QUE_NODE_ASSIGNMENT:
+ case QUE_NODE_EXIT:
+ case QUE_NODE_RETURN:
+ case QUE_NODE_COMMIT:
+ case QUE_NODE_ROLLBACK:
+ case QUE_NODE_LOCK:
+ case QUE_NODE_FUNC:
+ case QUE_NODE_ORDER:
+ case QUE_NODE_ROW_PRINTF:
+ case QUE_NODE_OPEN:
+ case QUE_NODE_FETCH:
+ /* No need to do anything */
+
+ break;
+ default:
+ fprintf(stderr,
+ "que_node struct appears corrupt; type %lu\n",
+ (unsigned long) que_node_get_type(node));
+ mem_analyze_corruption(node);
+ ut_error;
+ }
+}
+
+/**********************************************************************//**
+Frees a query graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+ que_t* graph) /*!< in: query graph; we assume that the memory
+ heap where this graph was created is private
+ to this graph: if not, then use
+ que_graph_free_recursive and free the heap
+ afterwards! */
+{
+ ut_ad(graph);
+
+ if (graph->sym_tab) {
+ /* The following call frees dynamic memory allocated
+ for variables etc. during execution. Frees also explicit
+ cursor definitions. */
+
+ sym_tab_free_private(graph->sym_tab);
+ }
+
+ if (graph->info && graph->info->graph_owns_us) {
+ pars_info_free(graph->info);
+ }
+
+ que_graph_free_recursive(graph);
+
+ mem_heap_free(graph->heap);
+}
+
+/****************************************************************//**
+Performs an execution step on a thr node.
+@return query thread to run next, or NULL if none */
+static
+que_thr_t*
+que_thr_node_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread where run_node must
+ be the thread node itself */
+{
+ ut_ad(thr->run_node == thr);
+
+ if (thr->prev_node == thr->common.parent) {
+ /* If control to the node came from above, it is just passed
+ on */
+
+ thr->run_node = thr->child;
+
+ return(thr);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ if (que_thr_peek_stop(thr)) {
+
+ mutex_exit(&kernel_mutex);
+
+ return(thr);
+ }
+
+ /* Thread execution completed */
+
+ thr->state = QUE_THR_COMPLETED;
+
+ mutex_exit(&kernel_mutex);
+
+ return(NULL);
+}
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active.
+***NOTE***: This and ..._mysql are the only functions in which such a
+transition is allowed to happen! */
+static
+void
+que_thr_move_to_run_state(
+/*======================*/
+	que_thr_t*	thr)	/*!< in: a query thread */
+{
+ trx_t* trx;
+
+ ut_ad(thr->state != QUE_THR_RUNNING);
+
+ trx = thr_get_trx(thr);
+
+ if (!thr->is_active) {
+
+ (thr->graph)->n_active_thrs++;
+
+ trx->n_active_thrs++;
+
+ thr->is_active = TRUE;
+
+ ut_ad((thr->graph)->n_active_thrs == 1);
+ ut_ad(trx->n_active_thrs == 1);
+ }
+
+ thr->state = QUE_THR_RUNNING;
+}
+
+/**********************************************************************//**
+Decrements the query thread reference counts in the query graph and the
+transaction. May start signal handling, e.g., a rollback.
+*** NOTE ***:
+This and que_thr_stop_for_mysql are the only functions where the reference
+count can be decremented and this function may only be called from inside
+que_run_threads or que_thr_check_if_switch! These restrictions exist to make
+the rollback code easier to maintain. */
+static
+void
+que_thr_dec_refer_count(
+/*====================*/
+ que_thr_t* thr, /*!< in: query thread */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ que_fork_t* fork;
+ trx_t* trx;
+ ulint fork_type;
+ ibool stopped;
+
+ fork = thr->common.parent;
+ trx = thr_get_trx(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ ut_a(thr->is_active);
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ stopped = que_thr_stop(thr);
+
+ if (!stopped) {
+ /* The reason for the thr suspension or wait was
+ already canceled before we came here: continue
+ running the thread */
+
+ /* fputs("!!!!!!!! Wait already ended: continue thr\n",
+ stderr); */
+
+ if (next_thr && *next_thr == NULL) {
+ /* Normally srv_suspend_mysql_thread resets
+ the state to DB_SUCCESS before waiting, but
+ in this case we have to do it here,
+ otherwise nobody does it. */
+ trx->error_state = DB_SUCCESS;
+
+ *next_thr = thr;
+ } else {
+ ut_error;
+ srv_que_task_enqueue_low(thr);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+ }
+
+ ut_ad(fork->n_active_thrs == 1);
+ ut_ad(trx->n_active_thrs == 1);
+
+ fork->n_active_thrs--;
+ trx->n_active_thrs--;
+
+ thr->is_active = FALSE;
+
+ if (trx->n_active_thrs > 0) {
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+
+ fork_type = fork->fork_type;
+
+ /* Check if all query threads in the same fork are completed */
+
+ if (que_fork_all_thrs_in_state(fork, QUE_THR_COMPLETED)) {
+
+ switch (fork_type) {
+ case QUE_FORK_ROLLBACK:
+ /* This is really the undo graph used in rollback,
+ no roll_node in this graph */
+
+ ut_ad(UT_LIST_GET_LEN(trx->signals) > 0);
+ ut_ad(trx->handling_signals == TRUE);
+
+ trx_finish_rollback_off_kernel(fork, trx, next_thr);
+ break;
+
+ case QUE_FORK_PURGE:
+ case QUE_FORK_RECOVERY:
+ case QUE_FORK_MYSQL_INTERFACE:
+
+ /* Do nothing */
+ break;
+
+ default:
+ ut_error; /*!< not used in MySQL */
+ }
+ }
+
+ if (UT_LIST_GET_LEN(trx->signals) > 0 && trx->n_active_thrs == 0) {
+
+ /* If the trx is signaled and its query thread count drops to
+ zero, then we start processing a signal; from it we may get
+ a new query thread to run */
+
+ trx_sig_start_handle(trx, next_thr);
+ }
+
+ if (trx->handling_signals && UT_LIST_GET_LEN(trx->signals) == 0) {
+
+ trx_end_signal_handling(trx);
+ }
+
+ mutex_exit(&kernel_mutex);
+}
+
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
+to be reserved.
+@return TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ que_t* graph;
+ ibool ret = TRUE;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ graph = thr->graph;
+ trx = graph->trx;
+
+ if (graph->state == QUE_FORK_COMMAND_WAIT) {
+ thr->state = QUE_THR_SUSPENDED;
+
+ } else if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+ UT_LIST_ADD_FIRST(trx_thrs, trx->wait_thrs, thr);
+ thr->state = QUE_THR_LOCK_WAIT;
+
+ } else if (trx->error_state != DB_SUCCESS
+ && trx->error_state != DB_LOCK_WAIT) {
+
+ /* Error handling built for the MySQL interface */
+ thr->state = QUE_THR_COMPLETED;
+
+ } else if (UT_LIST_GET_LEN(trx->signals) > 0
+ && graph->fork_type != QUE_FORK_ROLLBACK) {
+
+ thr->state = QUE_THR_SUSPENDED;
+ } else {
+ ut_ad(graph->state == QUE_FORK_ACTIVE);
+
+ ret = FALSE;
+ }
+
+ return(ret);
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
+query thread is stopped and made inactive, except in the case where
+it was put into the lock wait state in lock0lock.c, but the lock has already
+been granted or the transaction chosen as a victim in deadlock resolution. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ if (trx->error_state != DB_SUCCESS
+ && trx->error_state != DB_LOCK_WAIT) {
+
+ /* Error handling built for the MySQL interface */
+ thr->state = QUE_THR_COMPLETED;
+ } else {
+ /* It must have been a lock wait but the lock was
+ already released, or this transaction was chosen
+ as a victim in selective deadlock resolution */
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+ }
+
+ ut_ad(thr->is_active == TRUE);
+ ut_ad(trx->n_active_thrs == 1);
+ ut_ad(thr->graph->n_active_thrs == 1);
+
+ thr->is_active = FALSE;
+ (thr->graph)->n_active_thrs--;
+
+ trx->n_active_thrs--;
+
+ mutex_exit(&kernel_mutex);
+}
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+	que_thr_t*	thr,	/*!< in: a query thread */
+ trx_t* trx) /*!< in: transaction */
+{
+ if (thr->magic_n != QUE_THR_MAGIC_N) {
+ fprintf(stderr,
+ "que_thr struct appears corrupt; magic n %lu\n",
+ (unsigned long) thr->magic_n);
+
+ mem_analyze_corruption(thr);
+
+ ut_error;
+ }
+
+ if (!thr->is_active) {
+
+ thr->graph->n_active_thrs++;
+
+ trx->n_active_thrs++;
+
+ thr->is_active = TRUE;
+ }
+
+ thr->state = QUE_THR_RUNNING;
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+ que_thr_t* thr, /*!< in: query thread */
+ trx_t* trx) /*!< in: transaction */
+{
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_ad(thr->is_active == TRUE);
+ ut_ad(trx->n_active_thrs == 1);
+ ut_ad(thr->graph->n_active_thrs == 1);
+
+ if (thr->magic_n != QUE_THR_MAGIC_N) {
+ fprintf(stderr,
+ "que_thr struct appears corrupt; magic n %lu\n",
+ (unsigned long) thr->magic_n);
+
+ mem_analyze_corruption(thr);
+
+ ut_error;
+ }
+
+ thr->state = QUE_THR_COMPLETED;
+
+ thr->is_active = FALSE;
+ (thr->graph)->n_active_thrs--;
+
+ trx->n_active_thrs--;
+}
+
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+ que_node_t* node) /*!< in: node */
+{
+ ut_ad(node);
+
+ for (;;) {
+ ulint type;
+
+ node = que_node_get_parent(node);
+
+ if (!node) {
+ break;
+ }
+
+ type = que_node_get_type(node);
+
+ if ((type == QUE_NODE_FOR) || (type == QUE_NODE_WHILE)) {
+ break;
+ }
+ }
+
+ return(node);
+}
+
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+ que_node_t* node) /*!< in: query graph node */
+{
+ ulint type;
+ const char* str;
+
+ type = que_node_get_type(node);
+
+ if (type == QUE_NODE_SELECT) {
+ str = "SELECT";
+ } else if (type == QUE_NODE_INSERT) {
+ str = "INSERT";
+ } else if (type == QUE_NODE_UPDATE) {
+ str = "UPDATE";
+ } else if (type == QUE_NODE_WHILE) {
+ str = "WHILE";
+ } else if (type == QUE_NODE_ASSIGNMENT) {
+ str = "ASSIGNMENT";
+ } else if (type == QUE_NODE_IF) {
+ str = "IF";
+ } else if (type == QUE_NODE_FETCH) {
+ str = "FETCH";
+ } else if (type == QUE_NODE_OPEN) {
+ str = "OPEN";
+ } else if (type == QUE_NODE_PROC) {
+ str = "STORED PROCEDURE";
+ } else if (type == QUE_NODE_FUNC) {
+ str = "FUNCTION";
+ } else if (type == QUE_NODE_LOCK) {
+ str = "LOCK";
+ } else if (type == QUE_NODE_THR) {
+ str = "QUERY THREAD";
+ } else if (type == QUE_NODE_COMMIT) {
+ str = "COMMIT";
+ } else if (type == QUE_NODE_UNDO) {
+ str = "UNDO ROW";
+ } else if (type == QUE_NODE_PURGE) {
+ str = "PURGE ROW";
+ } else if (type == QUE_NODE_ROLLBACK) {
+ str = "ROLLBACK";
+ } else if (type == QUE_NODE_CREATE_TABLE) {
+ str = "CREATE TABLE";
+ } else if (type == QUE_NODE_CREATE_INDEX) {
+ str = "CREATE INDEX";
+ } else if (type == QUE_NODE_INSERT_STATS) {
+ str = "INSERT TO SYS_STATS";
+ } else if (type == QUE_NODE_FOR) {
+ str = "FOR LOOP";
+ } else if (type == QUE_NODE_RETURN) {
+ str = "RETURN";
+ } else if (type == QUE_NODE_EXIT) {
+ str = "EXIT";
+ } else {
+ str = "UNKNOWN NODE TYPE";
+ }
+
+ fprintf(stderr, "Node type %lu: %s, address %p\n",
+ (ulong) type, str, (void*) node);
+}
+
+/**********************************************************************//**
+Performs an execution step on a query thread.
+@return query thread to run next: it may differ from the input
+parameter if, e.g., a subprocedure call is made */
+UNIV_INLINE
+que_thr_t*
+que_thr_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ que_node_t* node;
+ que_thr_t* old_thr;
+ trx_t* trx;
+ ulint type;
+
+ trx = thr_get_trx(thr);
+
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ thr->resource++;
+
+ node = thr->run_node;
+ type = que_node_get_type(node);
+
+ old_thr = thr;
+
+#ifdef UNIV_DEBUG
+ if (que_trace_on) {
+ fputs("To execute: ", stderr);
+ que_node_print_info(node);
+ }
+#endif
+ if (type & QUE_NODE_CONTROL_STAT) {
+ if ((thr->prev_node != que_node_get_parent(node))
+ && que_node_get_next(thr->prev_node)) {
+
+ /* The control statements, like WHILE, always pass the
+ control to the next child statement if there is any
+ child left */
+
+ thr->run_node = que_node_get_next(thr->prev_node);
+
+ } else if (type == QUE_NODE_IF) {
+ if_step(thr);
+ } else if (type == QUE_NODE_FOR) {
+ for_step(thr);
+ } else if (type == QUE_NODE_PROC) {
+
+			/* We can access trx->undo_no without reserving
+			trx->undo_mutex, because there cannot be active query
+			threads updating or inserting at the moment! */
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ trx->last_sql_stat_start.least_undo_no
+ = trx->undo_no;
+ }
+
+ proc_step(thr);
+ } else if (type == QUE_NODE_WHILE) {
+ while_step(thr);
+ } else {
+ ut_error;
+ }
+ } else if (type == QUE_NODE_ASSIGNMENT) {
+ assign_step(thr);
+ } else if (type == QUE_NODE_SELECT) {
+ thr = row_sel_step(thr);
+ } else if (type == QUE_NODE_INSERT) {
+ thr = row_ins_step(thr);
+ } else if (type == QUE_NODE_UPDATE) {
+ thr = row_upd_step(thr);
+ } else if (type == QUE_NODE_FETCH) {
+ thr = fetch_step(thr);
+ } else if (type == QUE_NODE_OPEN) {
+ thr = open_step(thr);
+ } else if (type == QUE_NODE_FUNC) {
+ proc_eval_step(thr);
+
+ } else if (type == QUE_NODE_LOCK) {
+
+ ut_error;
+ /*
+ thr = que_lock_step(thr);
+ */
+ } else if (type == QUE_NODE_THR) {
+ thr = que_thr_node_step(thr);
+ } else if (type == QUE_NODE_COMMIT) {
+ thr = trx_commit_step(thr);
+ } else if (type == QUE_NODE_UNDO) {
+ thr = row_undo_step(thr);
+ } else if (type == QUE_NODE_PURGE) {
+ thr = row_purge_step(thr);
+ } else if (type == QUE_NODE_RETURN) {
+ thr = return_step(thr);
+ } else if (type == QUE_NODE_EXIT) {
+ thr = exit_step(thr);
+ } else if (type == QUE_NODE_ROLLBACK) {
+ thr = trx_rollback_step(thr);
+ } else if (type == QUE_NODE_CREATE_TABLE) {
+ thr = dict_create_table_step(thr);
+ } else if (type == QUE_NODE_CREATE_INDEX) {
+ thr = dict_create_index_step(thr);
+ } else if (type == QUE_NODE_INSERT_STATS) {
+ thr = dict_insert_stats_step(thr);
+ } else if (type == QUE_NODE_ROW_PRINTF) {
+ thr = row_printf_step(thr);
+ } else {
+ ut_error;
+ }
+
+ if (type == QUE_NODE_EXIT) {
+ old_thr->prev_node = que_node_get_containing_loop_node(node);
+ } else {
+ old_thr->prev_node = node;
+ }
+
+ if (thr) {
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Run a query thread until it finishes or encounters e.g. a lock wait. */
+static
+void
+que_run_threads_low(
+/*================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ que_thr_t* next_thr;
+ ulint cumul_resource;
+ ulint loop_count;
+
+ ut_ad(thr->state == QUE_THR_RUNNING);
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ ut_ad(!mutex_own(&kernel_mutex));
+
+	/* cumul_resource counts how many resources the OS thread (NOT the
+	query thread) has spent in this function */
+
+ loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK;
+ cumul_resource = 0;
+loop:
+	/* Check that there is enough space in the log to accommodate
+	possible log entries by this query step; if the operation can touch
+	more than about 4 pages, checks must also be made within the query
+	step! */
+
+ log_free_check();
+
+ /* Perform the actual query step: note that the query thread
+ may change if, e.g., a subprocedure call is made */
+
+ /*-------------------------*/
+ next_thr = que_thr_step(thr);
+ /*-------------------------*/
+
+ ut_a(!next_thr || (thr_get_trx(next_thr)->error_state == DB_SUCCESS));
+
+ loop_count++;
+
+ if (next_thr != thr) {
+ ut_a(next_thr == NULL);
+
+ /* This can change next_thr to a non-NULL value if there was
+ a lock wait that already completed. */
+ que_thr_dec_refer_count(thr, &next_thr);
+
+ if (next_thr == NULL) {
+
+ return;
+ }
+
+ loop_count = QUE_MAX_LOOPS_WITHOUT_CHECK;
+
+ thr = next_thr;
+ }
+
+ goto loop;
+}
+
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+loop:
+ ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+ que_run_threads_low(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ switch (thr->state) {
+
+ case QUE_THR_RUNNING:
+ /* There probably was a lock wait, but it already ended
+ before we came here: continue running thr */
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+
+ case QUE_THR_LOCK_WAIT:
+ mutex_exit(&kernel_mutex);
+
+		/* The ..._mysql_... function also works for InnoDB's
+		internal threads. Let us wait until the lock wait ends. */
+
+ srv_suspend_mysql_thread(thr);
+
+ if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
+ /* thr was chosen as a deadlock victim or there was
+ a lock wait timeout */
+
+ que_thr_dec_refer_count(thr, NULL);
+
+ return;
+ }
+
+ goto loop;
+
+ case QUE_THR_COMPLETED:
+ case QUE_THR_COMMAND_WAIT:
+ /* Do nothing */
+ break;
+
+ default:
+ ut_error;
+ }
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Evaluate the given SQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+que_eval_sql(
+/*=========*/
+ pars_info_t* info, /*!< in: info struct, or NULL */
+ const char* sql, /*!< in: SQL string */
+ ibool reserve_dict_mutex,
+ /*!< in: if TRUE, acquire/release
+ dict_sys->mutex around call to pars_sql. */
+ trx_t* trx) /*!< in: trx */
+{
+ que_thr_t* thr;
+ que_t* graph;
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ if (reserve_dict_mutex) {
+ mutex_enter(&dict_sys->mutex);
+ }
+
+ graph = pars_sql(info, sql);
+
+ if (reserve_dict_mutex) {
+ mutex_exit(&dict_sys->mutex);
+ }
+
+ ut_a(graph);
+
+ graph->trx = trx;
+ trx->graph = NULL;
+
+ graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+
+ ut_a(thr = que_fork_start_command(graph));
+
+ que_run_threads(thr);
+
+ que_graph_free(graph);
+
+ return(trx->error_state);
+}
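+
+/* A hedged usage sketch for que_eval_sql(); it is non-compiled (#if 0) and
+only meant to show the shape of a call. The SQL text is a made-up example,
+and trx_allocate_for_background(), trx_commit_for_mysql() and
+trx_free_for_background() are assumed to be the usual helpers from
+trx0trx.c; the wrapper name itself is hypothetical. */
+#if 0
+static ulint
+que_eval_sql_example(void)
+{
+	trx_t*	trx;
+	ulint	err;
+
+	trx = trx_allocate_for_background();	/* assumed helper */
+
+	err = que_eval_sql(NULL,	/* no bound pars_info_t */
+			   "PROCEDURE EXAMPLE_PROC () IS\n"
+			   "BEGIN\n"
+			   "DELETE FROM SYS_FOO WHERE ID = 42;\n"
+			   "END;\n",	/* made-up table name */
+			   TRUE,	/* take dict_sys->mutex for parsing */
+			   trx);
+
+	trx_commit_for_mysql(trx);		/* assumed helper */
+	trx_free_for_background(trx);		/* assumed helper */
+
+	return(err);
+}
+#endif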
diff --git a/storage/xtradb/read/read0read.c b/storage/xtradb/read/read0read.c
new file mode 100644
index 00000000000..85adae4ddff
--- /dev/null
+++ b/storage/xtradb/read/read0read.c
@@ -0,0 +1,540 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file read/read0read.c
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#include "read0read.h"
+
+#ifdef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#include "srv0srv.h"
+#include "trx0sys.h"
+
+/*
+-------------------------------------------------------------------------------
+FACT A: A cursor read view on a secondary index sees only committed versions
+-------
+of the records in the secondary index, or those versions of rows created by
+the transaction which created the cursor before the cursor was created, even
+if the transaction which created the cursor has changed that clustered index
+page.
+
+PROOF: We must show that the read always goes to the clustered index record
+to see whether that record is visible in the cursor read view. Consider e.g.
+the following table and SQL clauses:
+
+create table t1(a int not null, b int, primary key(a), index(b));
+insert into t1 values (1,1),(2,2);
+commit;
+
+Now consider that we have a cursor for a query
+
+select b from t1 where b >= 1;
+
+This query will use the secondary key on the table t1. Now if, after the
+first fetch on this cursor, we do an update:
+
+update t1 set b = 5 where b = 2;
+
+Now the second fetch of the cursor should not see record (2,5); instead it
+should see record (2,2).
+
+We should also show that if we execute delete from t1 where b = 5; we can
+still see record (2,2).
+
+When we access a secondary key record, the maximum transaction id is fetched
+from this record and this trx_id is compared to up_limit_id in the view.
+If the trx_id in the record is greater than or equal to up_limit_id in the
+view, the clustered record is accessed. Because the trx_id of the creating
+transaction was stored, when this view was created, in the list of trx_ids
+not seen by this read view, a previous version of the record is requested
+to be built. This is built using the clustered record. If the secondary key
+record is delete marked, its corresponding clustered record can already have
+been purged only if the record's trx_id < low_limit_no. Purge cannot remove
+any record deleted by a transaction which was active when the cursor was
+created. We may thus still have a deleted secondary key record but no
+clustered record; this is not a problem, because this case is handled in the
+row_sel_get_clust_rec() function, which is called whenever we note that this
+read view does not see the trx_id in the record. Thus, we see the correct
+version. Q.E.D.
+
+-------------------------------------------------------------------------------
+FACT B: A cursor read view on a clustered index sees only committed versions
+-------
+of the records in the clustered index, or those versions of rows created by
+the transaction which created the cursor before the cursor was created, even
+if the transaction which created the cursor has changed that clustered index
+page.
+
+PROOF: Consider e.g. the following table and SQL clauses:
+
+create table t1(a int not null, primary key(a));
+insert into t1 values (1),(2);
+commit;
+
+Now consider that we have a cursor for a query
+
+select a from t1 where a >= 1;
+
+This query will use the clustered key on the table t1. Now if, after the
+first fetch on this cursor, we do an update:
+
+update t1 set a = 5 where a = 2;
+
+Now the second fetch of the cursor should not see record (5); instead it
+should see record (2).
+
+We should also show that if we execute delete from t1 where a = 5; after
+the cursor is opened, we can still see record (2).
+
+When accessing a clustered record we always check whether this read view
+sees the trx_id stored in the clustered record. By default we do not see
+any changes if the record trx_id >= low_limit_id, i.e. the change was made
+by a transaction which started after the transaction which created the
+cursor. If the row was changed by such a future transaction, a previous
+version of the clustered record is created. Thus we see only a committed
+version in this case. We see all changes made by committed transactions,
+i.e. record trx_id < up_limit_id. In this case we do not need to do
+anything; we already see the correct version of the record. We do not see
+any changes made by an active transaction except the creating transaction.
+We stored the trx_id of the creating transaction in the list of trx_ids
+when this view was created. Thus we can easily see whether this record was
+changed by the creating transaction. Because we already have the clustered
+record we can access roll_ptr, and using this roll_ptr we can fetch the
+undo record. We can now check that the undo_no of the undo record is less
+than the undo_no of the transaction which created the view when the cursor
+was created. We see this clustered record only when the record undo_no is
+less than the undo_no in the view. If this is not true, we build, based on
+the undo_rec, a previous version of the record. This record is found
+because purge cannot remove records accessed by an active transaction.
+Thus we see the correct version. Q.E.D.
+-------------------------------------------------------------------------------
+FACT C: Purge does not remove any delete marked row that is visible
+-------
+to cursor view.
+
+TODO: prove this
+
+*/
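+
+/* The visibility rule that the proofs above rely on can be condensed into a
+small standalone sketch. It is non-compiled (#if 0) and uses a simplified
+stand-in for read_view_t with plain integer ids instead of dulints; the
+struct and function names are hypothetical. */
+#if 0
+#include <stddef.h>
+
+struct simple_view {
+	unsigned long		low_limit_id;	/* no trx with id >= this
+						is seen */
+	unsigned long		up_limit_id;	/* every trx with id < this
+						is seen */
+	const unsigned long*	trx_ids;	/* ids active when the view
+						was created */
+	size_t			n_trx_ids;
+};
+
+/* Returns nonzero if a change made by trx_id is visible in the view:
+older than every transaction that was active -> visible; not yet started
+when the view was taken -> invisible; otherwise visible unless the id was
+in the active list at view creation time. */
+static int
+simple_view_sees(const struct simple_view* v, unsigned long trx_id)
+{
+	size_t	i;
+
+	if (trx_id < v->up_limit_id) {
+		return(1);
+	}
+
+	if (trx_id >= v->low_limit_id) {
+		return(0);
+	}
+
+	for (i = 0; i < v->n_trx_ids; i++) {
+		if (v->trx_ids[i] == trx_id) {
+			return(0);
+		}
+	}
+
+	return(1);
+}
+#endif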
+
+/*********************************************************************//**
+Creates a read view object.
+@return own: read view struct */
+UNIV_INLINE
+read_view_t*
+read_view_create_low(
+/*=================*/
+ ulint n, /*!< in: number of cells in the trx_ids array */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ read_view_t* view;
+
+ view = mem_heap_alloc(heap, sizeof(read_view_t));
+
+ view->n_trx_ids = n;
+ view->trx_ids = mem_heap_alloc(heap, n * sizeof *view->trx_ids);
+
+ return(view);
+}
+
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, with the exception that the
+creating trx of the oldest view is also set as not visible in the 'copied'
+view. Opens a new view if no views currently exist. The view must be closed
+with ..._close. This is used in purge.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_oldest_copy_or_open_new(
+/*==============================*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or ut_dulint_zero
+ used in purge */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ allocated */
+{
+ read_view_t* old_view;
+ read_view_t* view_copy;
+ ibool needs_insert = TRUE;
+ ulint insert_done = 0;
+ ulint n;
+ ulint i;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ old_view = UT_LIST_GET_LAST(trx_sys->view_list);
+
+ if (old_view == NULL) {
+
+ return(read_view_open_now(cr_trx_id, heap));
+ }
+
+ n = old_view->n_trx_ids;
+
+ if (!ut_dulint_is_zero(old_view->creator_trx_id)) {
+ n++;
+ } else {
+ needs_insert = FALSE;
+ }
+
+ view_copy = read_view_create_low(n, heap);
+
+	/* Insert the id of the creator at the right place in the descending
+	array of ids, if needs_insert is TRUE: */
+
+ i = 0;
+ while (i < n) {
+ if (needs_insert
+ && (i >= old_view->n_trx_ids
+ || ut_dulint_cmp(old_view->creator_trx_id,
+ read_view_get_nth_trx_id(old_view, i))
+ > 0)) {
+
+ read_view_set_nth_trx_id(view_copy, i,
+ old_view->creator_trx_id);
+ needs_insert = FALSE;
+ insert_done = 1;
+ } else {
+ read_view_set_nth_trx_id(view_copy, i,
+ read_view_get_nth_trx_id(
+ old_view,
+ i - insert_done));
+ }
+
+ i++;
+ }
+
+ view_copy->creator_trx_id = cr_trx_id;
+
+ view_copy->low_limit_no = old_view->low_limit_no;
+ view_copy->low_limit_id = old_view->low_limit_id;
+
+
+ if (n > 0) {
+ /* The last active transaction has the smallest id: */
+ view_copy->up_limit_id = read_view_get_nth_trx_id(
+ view_copy, n - 1);
+ } else {
+ view_copy->up_limit_id = old_view->up_limit_id;
+ }
+
+ UT_LIST_ADD_LAST(view_list, trx_sys->view_list, view_copy);
+
+ return(view_copy);
+}
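+
+/* The single pass above that merges the creator id into the descending id
+array is the trickiest part of this function, so here is a non-compiled
+standalone sketch of the same bookkeeping on plain integers; the function
+name is hypothetical. old[] holds n_old ids in descending order and out[]
+must have room for n_old + 1 entries. */
+#if 0
+#include <stddef.h>
+
+/* Copies old[0..n_old) into out[], inserting extra_id at the position that
+keeps out[] in descending order, just like the needs_insert/insert_done
+bookkeeping in read_view_oldest_copy_or_open_new(). */
+static void
+copy_with_insert_desc(
+	const unsigned long*	old,
+	size_t			n_old,
+	unsigned long		extra_id,
+	unsigned long*		out)
+{
+	size_t	i;
+	size_t	shift = 0;
+	int	needs_insert = 1;
+
+	for (i = 0; i < n_old + 1; i++) {
+		if (needs_insert
+		    && (i >= n_old || extra_id > old[i])) {
+
+			out[i] = extra_id;
+			needs_insert = 0;
+			shift = 1;
+		} else {
+			out[i] = old[i - shift];
+		}
+	}
+}
+#endif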
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+ trx_id_t cr_trx_id, /*!< in: trx_id of creating
+ transaction, or ut_dulint_zero
+ used in purge */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ allocated */
+{
+ read_view_t* view;
+ trx_t* trx;
+ ulint n;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ view = read_view_create_low(UT_LIST_GET_LEN(trx_sys->trx_list), heap);
+
+ view->creator_trx_id = cr_trx_id;
+ view->type = VIEW_NORMAL;
+ view->undo_no = ut_dulint_zero;
+
+ /* No future transactions should be visible in the view */
+
+ view->low_limit_no = trx_sys->max_trx_id;
+ view->low_limit_id = view->low_limit_no;
+
+ n = 0;
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ /* No active transaction should be visible, except cr_trx */
+
+ while (trx) {
+ if (ut_dulint_cmp(trx->id, cr_trx_id) != 0
+ && (trx->conc_state == TRX_ACTIVE
+ || trx->conc_state == TRX_PREPARED)) {
+
+ read_view_set_nth_trx_id(view, n, trx->id);
+
+ n++;
+
+ /* NOTE that a transaction whose trx number is <
+ trx_sys->max_trx_id can still be active, if it is
+ in the middle of its commit! Note that when a
+ transaction starts, we initialize trx->no to
+ ut_dulint_max. */
+
+ if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) {
+
+ view->low_limit_no = trx->no;
+ }
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ view->n_trx_ids = n;
+
+ if (n > 0) {
+ /* The last active transaction has the smallest id: */
+ view->up_limit_id = read_view_get_nth_trx_id(view, n - 1);
+ } else {
+ view->up_limit_id = view->low_limit_id;
+ }
+
+
+ UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+
+ return(view);
+}
+
+/*********************************************************************//**
+Closes a read view. */
+UNIV_INTERN
+void
+read_view_close(
+/*============*/
+ read_view_t* view) /*!< in: read view */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
+}
+
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+ trx_t* trx) /*!< in: trx which has a read view */
+{
+ ut_a(trx->global_read_view);
+
+ mutex_enter(&kernel_mutex);
+
+ read_view_close(trx->global_read_view);
+
+ mem_heap_empty(trx->global_read_view_heap);
+
+ trx->read_view = NULL;
+ trx->global_read_view = NULL;
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Prints a read view to stderr. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+ const read_view_t* view) /*!< in: read view */
+{
+ ulint n_ids;
+ ulint i;
+
+ if (view->type == VIEW_HIGH_GRANULARITY) {
+ fprintf(stderr,
+ "High-granularity read view undo_n:o %lu %lu\n",
+ (ulong) ut_dulint_get_high(view->undo_no),
+ (ulong) ut_dulint_get_low(view->undo_no));
+ } else {
+ fprintf(stderr, "Normal read view\n");
+ }
+
+ fprintf(stderr, "Read view low limit trx n:o %lu %lu\n",
+ (ulong) ut_dulint_get_high(view->low_limit_no),
+ (ulong) ut_dulint_get_low(view->low_limit_no));
+
+ fprintf(stderr, "Read view up limit trx id " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(view->up_limit_id));
+
+ fprintf(stderr, "Read view low limit trx id " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(view->low_limit_id));
+
+ fprintf(stderr, "Read view individually stored trx ids:\n");
+
+ n_ids = view->n_trx_ids;
+
+ for (i = 0; i < n_ids; i++) {
+ fprintf(stderr, "Read view trx id " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(
+ read_view_get_nth_trx_id(view, i)));
+ }
+}
+
+/*********************************************************************//**
+Create a high-granularity consistent cursor view for mysql to be used
+in cursors. In this consistent read view, modifications done by the
+creating transaction after the cursor is created, or by future
+transactions, are not visible. */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+ trx_t* cr_trx) /*!< in: trx where cursor view is created */
+{
+ cursor_view_t* curview;
+ read_view_t* view;
+ mem_heap_t* heap;
+ trx_t* trx;
+ ulint n;
+
+ ut_a(cr_trx);
+
+	/* Use a larger heap than in trx_create when creating a read_view,
+	because cursors are quite long. */
+
+ heap = mem_heap_create(512);
+
+ curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(cursor_view_t));
+ curview->heap = heap;
+
+	/* Drop cursor tables from consideration when evaluating the need
+	for auto-commit */
+ curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use;
+ cr_trx->n_mysql_tables_in_use = 0;
+
+ mutex_enter(&kernel_mutex);
+
+ curview->read_view = read_view_create_low(
+ UT_LIST_GET_LEN(trx_sys->trx_list), curview->heap);
+
+ view = curview->read_view;
+ view->creator_trx_id = cr_trx->id;
+ view->type = VIEW_HIGH_GRANULARITY;
+ view->undo_no = cr_trx->undo_no;
+
+ /* No future transactions should be visible in the view */
+
+ view->low_limit_no = trx_sys->max_trx_id;
+ view->low_limit_id = view->low_limit_no;
+
+ n = 0;
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ /* No active transaction should be visible */
+
+ while (trx) {
+
+ if (trx->conc_state == TRX_ACTIVE
+ || trx->conc_state == TRX_PREPARED) {
+
+ read_view_set_nth_trx_id(view, n, trx->id);
+
+ n++;
+
+ /* NOTE that a transaction whose trx number is <
+ trx_sys->max_trx_id can still be active, if it is
+ in the middle of its commit! Note that when a
+ transaction starts, we initialize trx->no to
+ ut_dulint_max. */
+
+ if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) {
+
+ view->low_limit_no = trx->no;
+ }
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ view->n_trx_ids = n;
+
+ if (n > 0) {
+ /* The last active transaction has the smallest id: */
+ view->up_limit_id = read_view_get_nth_trx_id(view, n - 1);
+ } else {
+ view->up_limit_id = view->low_limit_id;
+ }
+
+ UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+
+ mutex_exit(&kernel_mutex);
+
+ return(curview);
+}
+
+/*********************************************************************//**
+Close a given consistent cursor view for mysql and restore global read view
+back to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+ trx_t* trx, /*!< in: trx */
+ cursor_view_t* curview)/*!< in: cursor view to be closed */
+{
+ ut_a(curview);
+ ut_a(curview->read_view);
+ ut_a(curview->heap);
+
+ /* Add cursor's tables to the global count of active tables that
+ belong to this transaction */
+ trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use;
+
+ mutex_enter(&kernel_mutex);
+
+ read_view_close(curview->read_view);
+ trx->read_view = trx->global_read_view;
+
+ mutex_exit(&kernel_mutex);
+
+ mem_heap_free(curview->heap);
+}
+
+/*********************************************************************//**
+This function sets the given consistent cursor view as the transaction's
+read view if the given cursor view is not NULL. Otherwise, it restores the
+global read view as the transaction's read view. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+ trx_t* trx, /*!< in: transaction where cursor is set */
+ cursor_view_t* curview)/*!< in: consistent cursor view to be set */
+{
+ ut_a(trx);
+
+ mutex_enter(&kernel_mutex);
+
+ if (UNIV_LIKELY(curview != NULL)) {
+ trx->read_view = curview->read_view;
+ } else {
+ trx->read_view = trx->global_read_view;
+ }
+
+ mutex_exit(&kernel_mutex);
+}
diff --git a/storage/xtradb/rem/rem0cmp.c b/storage/xtradb/rem/rem0cmp.c
new file mode 100644
index 00000000000..8ee434f85da
--- /dev/null
+++ b/storage/xtradb/rem/rem0cmp.c
@@ -0,0 +1,1204 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file rem/rem0cmp.c
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#include "rem0cmp.h"
+
+#ifdef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#include "srv0srv.h"
+
+/* ALPHABETICAL ORDER
+ ==================
+
+The records are put into alphabetical order in the following way: let F be
+the first field where the two records disagree. If there is a character at
+some position n in F where the records disagree, the order is determined by
+a comparison of the characters at position n, possibly after a collating
+transformation. If there is no such character, but the corresponding fields
+have different lengths, then, if the data type of the fields is paddable,
+the shorter field is padded with a padding character. If the data type is
+not paddable, the longer field is considered greater. Finally, an SQL NULL
+is smaller than any other value (see cmp_data_data_slow() below).
+
+At present, the comparison functions return 0 in the case where two records
+disagree only in that one has more fields than the other. */
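+
+/* A non-compiled sketch of the padding rule described above, assuming a
+paddable type whose pad character is the space 0x20 (as for latin1 CHAR
+types); it mirrors the byte loop of cmp_data_data_slow() below, minus SQL
+NULL handling and collation. The function name is hypothetical. */
+#if 0
+#include <stddef.h>
+
+/* Compares two byte strings as if the shorter one were padded with spaces;
+returns 1, 0 or -1 like the comparison functions in this file. */
+static int
+cmp_padded_bytes(
+	const unsigned char*	a,
+	size_t			a_len,
+	const unsigned char*	b,
+	size_t			b_len)
+{
+	size_t		n = (a_len > b_len) ? a_len : b_len;
+	size_t		i;
+	unsigned char	ca;
+	unsigned char	cb;
+
+	for (i = 0; i < n; i++) {
+		ca = (i < a_len) ? a[i] : 0x20;
+		cb = (i < b_len) ? b[i] : 0x20;
+
+		if (ca != cb) {
+			return(ca > cb ? 1 : -1);
+		}
+	}
+
+	return(0);
+}
+#endif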
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields then rec must have either m >= n fields, or it must
+differ from dtuple in some of the m fields rec has.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields);/*!< in/out: number of already
+ completely matched fields; when function
+ returns, contains the value for current
+ comparison */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. The prototype here
+must be a copy of the one in ha_innobase.cc!
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+extern
+int
+innobase_mysql_cmp(
+/*===============*/
+ int mysql_type, /*!< in: MySQL type */
+ uint charset_number, /*!< in: number of the charset */
+ const unsigned char* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const unsigned char* b, /*!< in: data field */
+ unsigned int b_length); /*!< in: data field length,
+ not UNIV_SQL_NULL */
+/*********************************************************************//**
+Transforms the character code so that it is ordered appropriately for the
+language. This is only used for the latin1 char set. MySQL does the
+comparisons for other char sets.
+@return collation order position */
+UNIV_INLINE
+ulint
+cmp_collate(
+/*========*/
+ ulint code) /*!< in: code of a character stored in database record */
+{
+ return((ulint) srv_latin1_ordering[code]);
+}
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+ const dict_col_t* col1, /*!< in: column 1 */
+ const dict_col_t* col2, /*!< in: column 2 */
+ ibool check_charsets)
+ /*!< in: whether to check charsets */
+{
+ if (dtype_is_non_binary_string_type(col1->mtype, col1->prtype)
+ && dtype_is_non_binary_string_type(col2->mtype, col2->prtype)) {
+
+ /* Both are non-binary string types: they can be compared if
+ and only if the charset-collation is the same */
+
+ if (check_charsets) {
+ return(dtype_get_charset_coll(col1->prtype)
+ == dtype_get_charset_coll(col2->prtype));
+ } else {
+ return(TRUE);
+ }
+ }
+
+ if (dtype_is_binary_string_type(col1->mtype, col1->prtype)
+ && dtype_is_binary_string_type(col2->mtype, col2->prtype)) {
+
+ /* Both are binary string types: they can be compared */
+
+ return(TRUE);
+ }
+
+ if (col1->mtype != col2->mtype) {
+
+ return(FALSE);
+ }
+
+ if (col1->mtype == DATA_INT
+ && (col1->prtype & DATA_UNSIGNED)
+ != (col2->prtype & DATA_UNSIGNED)) {
+
+ /* The storage format of an unsigned integer is different
+ from a signed integer: in a signed integer we OR
+ 0x8000... to the value of positive integers. */
+
+ return(FALSE);
+ }
+
+ return(col1->mtype != DATA_INT || col1->len == col2->len);
+}
+
+/*************************************************************//**
+Innobase uses this function to compare two data fields for which the data type
+is such that we must compare whole fields or call MySQL to do the comparison
+@return 1, 0, -1, if a is greater, equal, less than b, respectively */
+static
+int
+cmp_whole_field(
+/*============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* a, /*!< in: data field */
+ unsigned int a_length, /*!< in: data field length,
+ not UNIV_SQL_NULL */
+ const byte* b, /*!< in: data field */
+ unsigned int b_length) /*!< in: data field length,
+ not UNIV_SQL_NULL */
+{
+ float f_1;
+ float f_2;
+ double d_1;
+ double d_2;
+ int swap_flag = 1;
+
+ switch (mtype) {
+
+ case DATA_DECIMAL:
+ /* Remove preceding spaces */
+ for (; a_length && *a == ' '; a++, a_length--);
+ for (; b_length && *b == ' '; b++, b_length--);
+
+ if (*a == '-') {
+ if (*b != '-') {
+ return(-1);
+ }
+
+ a++; b++;
+ a_length--;
+ b_length--;
+
+ swap_flag = -1;
+
+ } else if (*b == '-') {
+
+ return(1);
+ }
+
+ while (a_length > 0 && (*a == '+' || *a == '0')) {
+ a++; a_length--;
+ }
+
+ while (b_length > 0 && (*b == '+' || *b == '0')) {
+ b++; b_length--;
+ }
+
+ if (a_length != b_length) {
+ if (a_length < b_length) {
+ return(-swap_flag);
+ }
+
+ return(swap_flag);
+ }
+
+ while (a_length > 0 && *a == *b) {
+
+ a++; b++; a_length--;
+ }
+
+ if (a_length == 0) {
+
+ return(0);
+ }
+
+ if (*a > *b) {
+ return(swap_flag);
+ }
+
+ return(-swap_flag);
+ case DATA_DOUBLE:
+ d_1 = mach_double_read(a);
+ d_2 = mach_double_read(b);
+
+ if (d_1 > d_2) {
+ return(1);
+ } else if (d_2 > d_1) {
+ return(-1);
+ }
+
+ return(0);
+
+ case DATA_FLOAT:
+ f_1 = mach_float_read(a);
+ f_2 = mach_float_read(b);
+
+ if (f_1 > f_2) {
+ return(1);
+ } else if (f_2 > f_1) {
+ return(-1);
+ }
+
+ return(0);
+ case DATA_BLOB:
+ if (prtype & DATA_BINARY_TYPE) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: comparing a binary BLOB"
+ " with a character set sensitive\n"
+ "InnoDB: comparison!\n");
+ }
+ /* fall through */
+ case DATA_VARMYSQL:
+ case DATA_MYSQL:
+ return(innobase_mysql_cmp(
+ (int)(prtype & DATA_MYSQL_TYPE_MASK),
+ (uint)dtype_get_charset_coll(prtype),
+ a, a_length, b, b_length));
+ default:
+ fprintf(stderr,
+ "InnoDB: unknown type number %lu\n",
+ (ulong) mtype);
+ ut_error;
+ }
+
+ return(0);
+}
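+
+/* The DATA_DECIMAL branch above compares decimals stored as ASCII strings.
+The non-compiled sketch below redoes the same steps on NUL-terminated C
+strings: skip leading spaces, handle a leading '-', strip '+' and leading
+zeros, then decide by length and finally byte by byte. The function name is
+hypothetical. */
+#if 0
+#include <string.h>
+
+/* Returns 1, 0 or -1 if a is greater than, equal to or less than b. */
+static int
+cmp_ascii_decimal(const char* a, const char* b)
+{
+	int	swap_flag = 1;	/* flipped when both numbers are negative */
+	size_t	a_len;
+	size_t	b_len;
+
+	while (*a == ' ') a++;
+	while (*b == ' ') b++;
+
+	if (*a == '-') {
+		if (*b != '-') {
+			return(-1);
+		}
+		a++; b++;
+		swap_flag = -1;
+	} else if (*b == '-') {
+		return(1);
+	}
+
+	while (*a == '+' || *a == '0') a++;
+	while (*b == '+' || *b == '0') b++;
+
+	a_len = strlen(a);
+	b_len = strlen(b);
+
+	if (a_len != b_len) {
+		/* More significant digits win (or lose, for negatives) */
+		return(a_len < b_len ? -swap_flag : swap_flag);
+	}
+
+	while (*a != '\0' && *a == *b) {
+		a++; b++;
+	}
+
+	if (*a == '\0') {
+		return(0);
+	}
+
+	return(*a > *b ? swap_flag : -swap_flag);
+}
+#endif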
+
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return 1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ const byte* data1, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len1, /*!< in: data field length or UNIV_SQL_NULL */
+ const byte* data2, /*!< in: data field (== a pointer to a memory
+ buffer) */
+ ulint len2) /*!< in: data field length or UNIV_SQL_NULL */
+{
+ ulint data1_byte;
+ ulint data2_byte;
+ ulint cur_bytes;
+
+ if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) {
+
+ if (len1 == len2) {
+
+ return(0);
+ }
+
+ if (len1 == UNIV_SQL_NULL) {
+ /* We define the SQL null to be the smallest possible
+ value of a field in the alphabetical order */
+
+ return(-1);
+ }
+
+ return(1);
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+ return(cmp_whole_field(mtype, prtype,
+ data1, (unsigned) len1,
+ data2, (unsigned) len2));
+ }
+
+	/* Then compare the fields */
+
+ cur_bytes = 0;
+
+ for (;;) {
+ if (len1 <= cur_bytes) {
+ if (len2 <= cur_bytes) {
+
+ return(0);
+ }
+
+ data1_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (data1_byte == ULINT_UNDEFINED) {
+
+ return(-1);
+ }
+ } else {
+ data1_byte = *data1;
+ }
+
+ if (len2 <= cur_bytes) {
+ data2_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (data2_byte == ULINT_UNDEFINED) {
+
+ return(1);
+ }
+ } else {
+ data2_byte = *data2;
+ }
+
+ if (data1_byte == data2_byte) {
+ /* If the bytes are equal, they will remain such even
+ after the collation transformation below */
+
+ goto next_byte;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE))) {
+
+ data1_byte = cmp_collate(data1_byte);
+ data2_byte = cmp_collate(data2_byte);
+ }
+
+ if (data1_byte > data2_byte) {
+
+ return(1);
+ } else if (data1_byte < data2_byte) {
+
+ return(-1);
+ }
+next_byte:
+ /* Next byte */
+ cur_bytes++;
+ data1++;
+ data2++;
+ }
+
+ return(0); /* Not reached */
+}
+
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only the first dtuple->n_fields_cmp fields of the data tuple are taken into
+account! If we denote by n = n_fields_cmp, then rec must have either
+m >= n fields, or it must differ from dtuple in some of the m fields rec
+has. If rec has an externally stored field, we do not compare it but return
+0 if such a comparison would have to be made.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields, /*!< in/out: number of already completely
+ matched fields; when function returns,
+ contains the value for current comparison */
+ ulint* matched_bytes) /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when function returns, contains the
+ value for current comparison */
+{
+ const dfield_t* dtuple_field; /* current field in logical record */
+ ulint dtuple_f_len; /* the length of the current field
+ in the logical record */
+ const byte* dtuple_b_ptr; /* pointer to the current byte in
+ logical field data */
+ ulint dtuple_byte; /* value of current byte to be compared
+ in dtuple*/
+ ulint rec_f_len; /* length of current field in rec */
+ const byte* rec_b_ptr; /* pointer to the current byte in
+ rec field */
+ ulint rec_byte; /* value of current byte to be
+ compared in rec */
+ ulint cur_field; /* current field number */
+ ulint cur_bytes; /* number of already matched bytes
+ in current field */
+ int ret = 3333; /* return value */
+
+ ut_ad(dtuple && rec && matched_fields && matched_bytes);
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ cur_field = *matched_fields;
+ cur_bytes = *matched_bytes;
+
+ ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple));
+ ut_ad(cur_field <= rec_offs_n_fields(offsets));
+
+ if (cur_bytes == 0 && cur_field == 0) {
+ ulint rec_info = rec_get_info_bits(rec,
+ rec_offs_comp(offsets));
+ ulint tup_info = dtuple_get_info_bits(dtuple);
+
+ if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {
+ ret = !(tup_info & REC_INFO_MIN_REC_FLAG);
+ goto order_resolved;
+ } else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+ goto order_resolved;
+ }
+ }
+
+ /* Match fields in a loop; stop if we run out of fields in dtuple
+ or find an externally stored field */
+
+ while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+
+ ulint mtype;
+ ulint prtype;
+
+ dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+ {
+ const dtype_t* type
+ = dfield_get_type(dtuple_field);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+ }
+
+ dtuple_f_len = dfield_get_len(dtuple_field);
+
+ rec_b_ptr = rec_get_nth_field(rec, offsets,
+ cur_field, &rec_f_len);
+
+		/* If we have so far matched 0 bytes, it may be that one or
+		both of the fields are SQL null, that the record or dtuple
+		is the predefined minimum record, or that the field is
+		stored externally */
+
+ if (UNIV_LIKELY(cur_bytes == 0)) {
+ if (rec_offs_nth_extern(offsets, cur_field)) {
+ /* We do not compare to an externally
+ stored field */
+
+ ret = 0;
+
+ goto order_resolved;
+ }
+
+ if (dtuple_f_len == UNIV_SQL_NULL) {
+ if (rec_f_len == UNIV_SQL_NULL) {
+
+ goto next_field;
+ }
+
+ ret = -1;
+ goto order_resolved;
+ } else if (rec_f_len == UNIV_SQL_NULL) {
+ /* We define the SQL null to be the
+ smallest possible value of a field
+ in the alphabetical order */
+
+ ret = 1;
+ goto order_resolved;
+ }
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+ ret = cmp_whole_field(mtype, prtype,
+ dfield_get_data(dtuple_field),
+ (unsigned) dtuple_f_len,
+ rec_b_ptr, (unsigned) rec_f_len);
+
+ if (ret != 0) {
+ cur_bytes = 0;
+
+ goto order_resolved;
+ } else {
+ goto next_field;
+ }
+ }
+
+ /* Set the pointers at the current byte */
+
+ rec_b_ptr = rec_b_ptr + cur_bytes;
+ dtuple_b_ptr = (byte*)dfield_get_data(dtuple_field)
+ + cur_bytes;
+		/* Then compare the fields */
+
+ for (;;) {
+ if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) {
+ if (dtuple_f_len <= cur_bytes) {
+
+ goto next_field;
+ }
+
+ rec_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec_byte == ULINT_UNDEFINED) {
+ ret = 1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec_byte = *rec_b_ptr;
+ }
+
+ if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) {
+ dtuple_byte = dtype_get_pad_char(mtype,
+ prtype);
+
+ if (dtuple_byte == ULINT_UNDEFINED) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ } else {
+ dtuple_byte = *dtuple_b_ptr;
+ }
+
+ if (dtuple_byte == rec_byte) {
+ /* If the bytes are equal, they will
+ remain such even after the collation
+ transformation below */
+
+ goto next_byte;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && !(prtype & DATA_BINARY_TYPE))) {
+
+ rec_byte = cmp_collate(rec_byte);
+ dtuple_byte = cmp_collate(dtuple_byte);
+ }
+
+ ret = (int) (dtuple_byte - rec_byte);
+ if (UNIV_LIKELY(ret)) {
+ if (ret < 0) {
+ ret = -1;
+ goto order_resolved;
+ } else {
+ ret = 1;
+ goto order_resolved;
+ }
+ }
+next_byte:
+ /* Next byte */
+ cur_bytes++;
+ rec_b_ptr++;
+ dtuple_b_ptr++;
+ }
+
+next_field:
+ cur_field++;
+ cur_bytes = 0;
+ }
+
+ ut_ad(cur_bytes == 0);
+
+ ret = 0; /* If we ran out of fields, dtuple was equal to rec
+ up to the common fields */
+order_resolved:
+ ut_ad((ret >= - 1) && (ret <= 1));
+ ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets,
+ matched_fields));
+ ut_ad(*matched_fields == cur_field); /* In the debug version, the
+ above cmp_debug_... sets
+ *matched_fields to a value */
+ *matched_fields = cur_field;
+ *matched_bytes = cur_bytes;
+
+ return(ret);
+}
+
+/**************************************************************//**
+Compares a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint matched_fields = 0;
+ ulint matched_bytes = 0;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ &matched_fields, &matched_bytes));
+}
+
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint n_fields;
+ ulint matched_fields = 0;
+ ulint matched_bytes = 0;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ n_fields = dtuple_get_n_fields(dtuple);
+
+ if (n_fields > rec_offs_n_fields(offsets)) {
+
+ return(FALSE);
+ }
+
+ cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+ &matched_fields, &matched_bytes);
+ if (matched_fields == n_fields) {
+
+ return(TRUE);
+ }
+
+ if (matched_fields == n_fields - 1
+ && matched_bytes == dfield_get_len(
+ dtuple_get_nth_field(dtuple, n_fields - 1))) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+ const dict_index_t* index, /*!< in: data dictionary index */
+ ibool* null_eq)/*!< out: set to TRUE if
+ found matching null values */
+{
+ ulint rec1_f_len; /*!< length of current field in rec1 */
+ const byte* rec1_b_ptr; /*!< pointer to the current byte
+ in rec1 field */
+ ulint rec1_byte; /*!< value of current byte to be
+ compared in rec1 */
+ ulint rec2_f_len; /*!< length of current field in rec2 */
+ const byte* rec2_b_ptr; /*!< pointer to the current byte
+ in rec2 field */
+ ulint rec2_byte; /*!< value of current byte to be
+ compared in rec2 */
+ ulint cur_field; /*!< current field number */
+ ulint n_uniq;
+
+ n_uniq = dict_index_get_n_unique(index);
+ ut_ad(rec_offs_n_fields(offsets1) >= n_uniq);
+ ut_ad(rec_offs_n_fields(offsets2) >= n_uniq);
+
+ ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+ for (cur_field = 0; cur_field < n_uniq; cur_field++) {
+
+ ulint cur_bytes;
+ ulint mtype;
+ ulint prtype;
+
+ {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, cur_field);
+
+ mtype = col->mtype;
+ prtype = col->prtype;
+ }
+
+ ut_ad(!rec_offs_nth_extern(offsets1, cur_field));
+ ut_ad(!rec_offs_nth_extern(offsets2, cur_field));
+
+ rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
+ cur_field, &rec1_f_len);
+ rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
+ cur_field, &rec2_f_len);
+
+ if (rec1_f_len == UNIV_SQL_NULL
+ || rec2_f_len == UNIV_SQL_NULL) {
+
+ if (rec1_f_len == rec2_f_len) {
+ if (null_eq) {
+ *null_eq = TRUE;
+ }
+
+ goto next_field;
+
+ } else if (rec2_f_len == UNIV_SQL_NULL) {
+
+ /* We define the SQL null to be the
+ smallest possible value of a field
+ in the alphabetical order */
+
+ return(1);
+ } else {
+ return(-1);
+ }
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+ int ret = cmp_whole_field(mtype, prtype,
+ rec1_b_ptr,
+ (unsigned) rec1_f_len,
+ rec2_b_ptr,
+ (unsigned) rec2_f_len);
+ if (ret) {
+ return(ret);
+ }
+
+ goto next_field;
+ }
+
+ /* Compare the fields */
+ for (cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) {
+ if (rec2_f_len <= cur_bytes) {
+
+ if (rec1_f_len <= cur_bytes) {
+
+ goto next_field;
+ }
+
+ rec2_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec2_byte == ULINT_UNDEFINED) {
+ return(1);
+ }
+ } else {
+ rec2_byte = *rec2_b_ptr;
+ }
+
+ if (rec1_f_len <= cur_bytes) {
+ rec1_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec1_byte == ULINT_UNDEFINED) {
+ return(-1);
+ }
+ } else {
+ rec1_byte = *rec1_b_ptr;
+ }
+
+ if (rec1_byte == rec2_byte) {
+ /* If the bytes are equal, they will remain
+ such even after the collation transformation
+ below */
+
+ continue;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && !(prtype & DATA_BINARY_TYPE))) {
+
+ rec1_byte = cmp_collate(rec1_byte);
+ rec2_byte = cmp_collate(rec2_byte);
+ }
+
+ if (rec1_byte < rec2_byte) {
+ return(-1);
+ } else if (rec1_byte > rec2_byte) {
+ return(1);
+ }
+ }
+next_field:
+ continue;
+ }
+
+ /* If we ran out of fields, rec1 was equal to rec2. */
+ return(0);
+}
+
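+/* The following is an illustrative sketch of the padded, byte-by-byte
+comparison performed by the loop in cmp_rec_rec_simple() above and by
+the other comparison loops in this file, reduced to plain byte arrays
+and a single pad character; the collation transformation done by
+cmp_collate() is omitted here.  The name cmp_bytes_padded is
+hypothetical and the function is not used by the comparison routines
+in this file. */
+static
+int
+cmp_bytes_padded(
+/*=============*/
+	const byte*	a,	/*!< in: first field data */
+	ulint		a_len,	/*!< in: length of a, in bytes */
+	const byte*	b,	/*!< in: second field data */
+	ulint		b_len,	/*!< in: length of b, in bytes */
+	ulint		pad)	/*!< in: pad character, or
+				ULINT_UNDEFINED if the type has none */
+{
+	ulint	i;
+	ulint	n = (a_len > b_len) ? a_len : b_len;
+
+	for (i = 0; i < n; i++) {
+		ulint	ca;
+		ulint	cb;
+
+		if (i < a_len) {
+			ca = a[i];
+		} else if (pad == ULINT_UNDEFINED) {
+			/* no pad character: the shorter field
+			orders lower */
+			return(-1);
+		} else {
+			ca = pad;
+		}
+
+		if (i < b_len) {
+			cb = b[i];
+		} else if (pad == ULINT_UNDEFINED) {
+			return(1);
+		} else {
+			cb = pad;
+		}
+
+		if (ca != cb) {
+			return(ca < cb ? -1 : 1);
+		}
+	}
+
+	return(0);
+}
+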
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+ const rec_t* rec1, /*!< in: physical record */
+ const rec_t* rec2, /*!< in: physical record */
+ const ulint* offsets1,/*!< in: rec_get_offsets(rec1, index) */
+ const ulint* offsets2,/*!< in: rec_get_offsets(rec2, index) */
+ dict_index_t* index, /*!< in: data dictionary index */
+ ulint* matched_fields, /*!< in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current
+				comparison */
+ ulint* matched_bytes, /*!< in/out: number of already matched
+ bytes within the first field not completely
+ matched; when the function returns, contains
+ the value for the current comparison */
+ ulint stats_method)
+{
+	ulint		rec1_n_fields;	/* the number of fields in rec1 */
+	ulint		rec1_f_len;	/* length of current field in rec1 */
+	const byte*	rec1_b_ptr;	/* pointer to the current byte
+					in the rec1 field */
+	ulint		rec1_byte;	/* value of current byte to be
+					compared in rec1 */
+	ulint		rec2_n_fields;	/* the number of fields in rec2 */
+	ulint		rec2_f_len;	/* length of current field in rec2 */
+	const byte*	rec2_b_ptr;	/* pointer to the current byte
+					in the rec2 field */
+	ulint		rec2_byte;	/* value of current byte to be
+					compared in rec2 */
+ ulint cur_field; /* current field number */
+ ulint cur_bytes; /* number of already matched
+ bytes in current field */
+ int ret = 0; /* return value */
+ ulint comp;
+
+ ut_ad(rec1 && rec2 && index);
+ ut_ad(rec_offs_validate(rec1, index, offsets1));
+ ut_ad(rec_offs_validate(rec2, index, offsets2));
+ ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+ comp = rec_offs_comp(offsets1);
+ rec1_n_fields = rec_offs_n_fields(offsets1);
+ rec2_n_fields = rec_offs_n_fields(offsets2);
+
+ cur_field = *matched_fields;
+ cur_bytes = *matched_bytes;
+
+ /* Match fields in a loop */
+
+ while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) {
+
+ ulint mtype;
+ ulint prtype;
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ /* This is for the insert buffer B-tree. */
+ mtype = DATA_BINARY;
+ prtype = 0;
+ } else {
+ const dict_col_t* col
+ = dict_index_get_nth_col(index, cur_field);
+
+ mtype = col->mtype;
+ prtype = col->prtype;
+ }
+
+ rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
+ cur_field, &rec1_f_len);
+ rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
+ cur_field, &rec2_f_len);
+
+ if (cur_bytes == 0) {
+ if (cur_field == 0) {
+ /* Test if rec is the predefined minimum
+ record */
+ if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+
+ if (!(rec_get_info_bits(rec2, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+ }
+
+ goto order_resolved;
+
+ } else if (UNIV_UNLIKELY
+ (rec_get_info_bits(rec2, comp)
+ & REC_INFO_MIN_REC_FLAG)) {
+
+ ret = 1;
+
+ goto order_resolved;
+ }
+ }
+
+ if (rec_offs_nth_extern(offsets1, cur_field)
+ || rec_offs_nth_extern(offsets2, cur_field)) {
+ /* We do not compare to an externally
+ stored field */
+
+ goto order_resolved;
+ }
+
+ if (rec1_f_len == UNIV_SQL_NULL
+ || rec2_f_len == UNIV_SQL_NULL) {
+
+ if (rec1_f_len == rec2_f_len) {
+
+					if (stats_method
+					    == SRV_STATS_METHOD_NULLS_EQUAL) {
+ goto next_field;
+ } else {
+ ret = -1;
+ }
+
+ } else if (rec2_f_len == UNIV_SQL_NULL) {
+
+ /* We define the SQL null to be the
+ smallest possible value of a field
+ in the alphabetical order */
+
+ ret = 1;
+ } else {
+ ret = -1;
+ }
+
+ goto order_resolved;
+ }
+ }
+
+ if (mtype >= DATA_FLOAT
+ || (mtype == DATA_BLOB
+ && 0 == (prtype & DATA_BINARY_TYPE)
+ && dtype_get_charset_coll(prtype)
+ != DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+ ret = cmp_whole_field(mtype, prtype,
+ rec1_b_ptr,
+ (unsigned) rec1_f_len,
+ rec2_b_ptr,
+ (unsigned) rec2_f_len);
+ if (ret != 0) {
+ cur_bytes = 0;
+
+ goto order_resolved;
+ } else {
+ goto next_field;
+ }
+ }
+
+ /* Set the pointers at the current byte */
+ rec1_b_ptr = rec1_b_ptr + cur_bytes;
+ rec2_b_ptr = rec2_b_ptr + cur_bytes;
+
+		/* Then compare the fields */
+ for (;;) {
+ if (rec2_f_len <= cur_bytes) {
+
+ if (rec1_f_len <= cur_bytes) {
+
+ goto next_field;
+ }
+
+ rec2_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec2_byte == ULINT_UNDEFINED) {
+ ret = 1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec2_byte = *rec2_b_ptr;
+ }
+
+ if (rec1_f_len <= cur_bytes) {
+ rec1_byte = dtype_get_pad_char(mtype, prtype);
+
+ if (rec1_byte == ULINT_UNDEFINED) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ } else {
+ rec1_byte = *rec1_b_ptr;
+ }
+
+ if (rec1_byte == rec2_byte) {
+ /* If the bytes are equal, they will remain
+ such even after the collation transformation
+ below */
+
+ goto next_byte;
+ }
+
+ if (mtype <= DATA_CHAR
+ || (mtype == DATA_BLOB
+ && !(prtype & DATA_BINARY_TYPE))) {
+
+ rec1_byte = cmp_collate(rec1_byte);
+ rec2_byte = cmp_collate(rec2_byte);
+ }
+
+ if (rec1_byte < rec2_byte) {
+ ret = -1;
+ goto order_resolved;
+ } else if (rec1_byte > rec2_byte) {
+ ret = 1;
+ goto order_resolved;
+ }
+next_byte:
+ /* Next byte */
+
+ cur_bytes++;
+ rec1_b_ptr++;
+ rec2_b_ptr++;
+ }
+
+next_field:
+ cur_field++;
+ cur_bytes = 0;
+ }
+
+ ut_ad(cur_bytes == 0);
+
+ /* If we ran out of fields, rec1 was equal to rec2 up
+ to the common fields */
+ ut_ad(ret == 0);
+order_resolved:
+
+ ut_ad((ret >= - 1) && (ret <= 1));
+
+ *matched_fields = cur_field;
+ *matched_bytes = cur_bytes;
+
+ return(ret);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields, then rec must have either m >= n fields, or it must
+differ from dtuple in some of the m fields rec has. If the function
+encounters an externally stored field, it returns 0.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ const rec_t* rec, /*!< in: physical record which differs from
+ dtuple in some of the common fields, or which
+ has an equal number or more fields than
+ dtuple */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint* matched_fields) /*!< in/out: number of already
+ completely matched fields; when function
+ returns, contains the value for current
+ comparison */
+{
+ const dfield_t* dtuple_field; /* current field in logical record */
+ ulint dtuple_f_len; /* the length of the current field
+ in the logical record */
+ const byte* dtuple_f_data; /* pointer to the current logical
+ field data */
+ ulint rec_f_len; /* length of current field in rec */
+ const byte* rec_f_data; /* pointer to the current rec field */
+ int ret = 3333; /* return value */
+ ulint cur_field; /* current field number */
+
+ ut_ad(dtuple && rec && matched_fields);
+ ut_ad(dtuple_check_typed(dtuple));
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple));
+ ut_ad(*matched_fields <= rec_offs_n_fields(offsets));
+
+ cur_field = *matched_fields;
+
+ if (cur_field == 0) {
+ if (UNIV_UNLIKELY
+ (rec_get_info_bits(rec, rec_offs_comp(offsets))
+ & REC_INFO_MIN_REC_FLAG)) {
+
+ ret = !(dtuple_get_info_bits(dtuple)
+ & REC_INFO_MIN_REC_FLAG);
+
+ goto order_resolved;
+ }
+
+ if (UNIV_UNLIKELY
+ (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG)) {
+ ret = -1;
+
+ goto order_resolved;
+ }
+ }
+
+ /* Match fields in a loop; stop if we run out of fields in dtuple */
+
+ while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+
+ ulint mtype;
+ ulint prtype;
+
+ dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+ {
+ const dtype_t* type
+ = dfield_get_type(dtuple_field);
+
+ mtype = type->mtype;
+ prtype = type->prtype;
+ }
+
+ dtuple_f_data = dfield_get_data(dtuple_field);
+ dtuple_f_len = dfield_get_len(dtuple_field);
+
+ rec_f_data = rec_get_nth_field(rec, offsets,
+ cur_field, &rec_f_len);
+
+ if (rec_offs_nth_extern(offsets, cur_field)) {
+ /* We do not compare to an externally stored field */
+
+ ret = 0;
+
+ goto order_resolved;
+ }
+
+ ret = cmp_data_data(mtype, prtype, dtuple_f_data, dtuple_f_len,
+ rec_f_data, rec_f_len);
+ if (ret != 0) {
+ goto order_resolved;
+ }
+
+ cur_field++;
+ }
+
+ ret = 0; /* If we ran out of fields, dtuple was equal to rec
+ up to the common fields */
+order_resolved:
+ ut_ad((ret >= - 1) && (ret <= 1));
+
+ *matched_fields = cur_field;
+
+ return(ret);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.c
new file mode 100644
index 00000000000..37ba8ca2ffe
--- /dev/null
+++ b/storage/xtradb/rem/rem0rec.c
@@ -0,0 +1,1774 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file rem/rem0rec.c
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "rem0rec.h"
+
+#ifdef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+
+/* PHYSICAL RECORD (OLD STYLE)
+ ===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(in the diagram below, lower addresses and more significant bits
+inside a byte appear on higher text lines):
+
+| offset of the end of the last field of data, the most significant
+ bit is set to 1 if and only if the field is SQL-null,
+ if the offset is 2-byte, then the second most significant
+ bit is set to 1 if the field is stored on another page:
+ mostly this will occur in the case of big BLOB fields |
+...
+| offset of the end of the first field of data + the SQL-null bit |
+| 4 bits used to delete mark a record, and mark a predefined
+ minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+ (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+ heap of the index page |
+| 10 bits giving the number of fields in this record |
+| 1 bit which is set to 1 if the offsets above are given in
+ one byte format, 0 if in two byte format |
+| two bytes giving an absolute pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in inverted order because
+the offsets of the first fields are then near the origin, which may
+give a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte unsigned integers
+if the record contains at most 127 bytes of data and no externally
+stored fields, and as two-byte unsigned integers otherwise. The most
+significant bit is not part of the offset; instead, it indicates SQL
+NULL when set to 1. */
+
+/* PHYSICAL RECORD (NEW STYLE)
+ ===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(in the diagram below, lower addresses and more significant bits
+inside a byte appear on higher text lines):
+
+| length of the last non-null variable-length field of data:
+  if the maximum length is at most 255 bytes, one byte; otherwise,
+ 0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes,
+ length=128..16383, extern storage flag) |
+...
+| length of first variable-length field of data |
+| SQL-null flags (1 bit per nullable field), padded to full bytes |
+| 4 bits used to delete mark a record, and mark a predefined
+ minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+ (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+ heap of the index page |
+| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree),
+ 010=infimum, 011=supremum, 1xx=reserved |
+| two bytes giving a relative pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The lengths and null flags described above are stored before
+the fixed-size header, in inverted order, so that the information for
+the first fields is nearest the origin, which may give a better
+processor cache hit rate in searches.
+
+The lengths of the variable-length fields are stored in one or two
+bytes as described above; SQL NULL values are indicated by the
+null-flag bitmap rather than by a bit in the stored lengths. */
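+
+/* The following is an illustrative sketch of the one/two-byte length
+decoding described above, reduced to a single helper for one
+variable-length column whose maximum length may exceed 255 bytes.
+The name len_decode_new_style is hypothetical and the function is not
+used by the record manager; the real decoding is done inline in
+rec_init_offsets() and related functions below. */
+static
+ulint
+len_decode_new_style(
+/*=================*/
+	const byte**	lenp,	/*!< in/out: pointer to the current
+				length byte; advanced downwards, as
+				the length bytes are stored in
+				inverted order */
+	ibool*		is_ext)	/*!< out: TRUE if the extern storage
+				flag (the "e" bit) is set */
+{
+	ulint	len = *(*lenp)--;
+
+	*is_ext = FALSE;
+
+	if (len & 0x80) {
+		/* 1exxxxxxx xxxxxxxx: a two-byte length */
+		len <<= 8;
+		len |= *(*lenp)--;
+
+		*is_ext = (len & 0x4000) != 0;
+		len &= 0x3fff;
+	}
+
+	/* otherwise 0xxxxxxx: a one-byte length, 0..127 */
+
+	return(len);
+}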
+
+/* CANONICAL COORDINATES. A record can be seen as a single
+string of 'characters' in the following way: catenate the bytes
+in each field, in the order of fields. An SQL-null field
+is taken to be an empty sequence of bytes. Then after
+the position of each field insert in the string
+the 'character' <FIELD-END>, except that after an SQL-null field
+insert <NULL-FIELD-END>. Now the ordinal position of each
+byte in this canonical string is its canonical coordinate.
+So, for the record ("AA", SQL-NULL, "BB", ""), the canonical
+string is "AA<FIELD-END><NULL-FIELD-END>BB<FIELD-END><FIELD-END>".
+We identify prefixes (= initial segments) of a record
+with prefixes of the canonical string. The canonical
+length of the prefix is the length of the corresponding
+prefix of the canonical string. The canonical length of
+a record is the length of its canonical string.
+
+For example, the maximal common prefix of records
+("AA", SQL-NULL, "BB", "C") and ("AA", SQL-NULL, "B", "C")
+is "AA<FIELD-END><NULL-FIELD-END>B", and its canonical
+length is 5.
+
+A complete-field prefix of a record is a prefix which ends at the
+end of some field (containing also <FIELD-END>).
+A record is a complete-field prefix of another record, if
+the corresponding canonical strings have the same property. */
+
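+/* The following is an illustrative sketch of the canonical length
+defined above, computed from an array of field lengths in which
+UNIV_SQL_NULL stands for an SQL NULL field.  For the record
+("AA", SQL-NULL, "BB", "") above it yields 8.  The name
+canonical_length is hypothetical and the function is not used
+elsewhere in this file. */
+static
+ulint
+canonical_length(
+/*=============*/
+	const ulint*	lens,		/*!< in: field lengths;
+					UNIV_SQL_NULL means SQL NULL */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	ulint	i;
+	ulint	clen = 0;
+
+	for (i = 0; i < n_fields; i++) {
+		/* an SQL NULL field contributes only its
+		<NULL-FIELD-END> marker; any other field contributes
+		its data bytes plus one <FIELD-END> marker */
+		if (lens[i] != UNIV_SQL_NULL) {
+			clen += lens[i];
+		}
+
+		clen += 1;
+	}
+
+	return(clen);
+}
+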
+/* this is used to fool the compiler in rec_validate */
+UNIV_INTERN ulint rec_dummy;
+
+/***************************************************************//**
+Validates the consistency of an old-style physical record.
+@return TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+ const rec_t* rec); /*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+ const rec_t* rec, /*!< in: compact physical record */
+ dict_index_t* index, /*!< in: record descriptor */
+ ulint n) /*!< in: number of columns to scan */
+{
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ ulint n_extern;
+ ulint i;
+
+ ut_ad(dict_table_is_comp(index->table));
+ ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+ ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index));
+
+ if (n == ULINT_UNDEFINED) {
+ n = dict_index_get_n_fields(index);
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ null_mask = 1;
+ n_extern = 0;
+ i = 0;
+
+ /* read the lengths of fields 0..n */
+ do {
+ ulint len;
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields. */
+ continue;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ if (len & 0x40) {
+ n_extern++;
+ }
+ lens--;
+ }
+ }
+ }
+ } while (++i < n);
+
+ return(n_extern);
+}
+
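+/* The following is an illustrative sketch of the null-flag lookup
+that the loop above (and rec_init_offsets() below) performs
+incrementally with nulls-- and null_mask: one bit per nullable field,
+eight flags per byte, with the flag bytes growing towards lower
+addresses.  The name null_bit_is_set is hypothetical and the function
+is not used by the record manager. */
+static
+ibool
+null_bit_is_set(
+/*============*/
+	const byte*	nulls,		/*!< in: first null-flag byte,
+					rec - (REC_N_NEW_EXTRA_BYTES + 1)
+					for an ordinary record */
+	ulint		nullable_no)	/*!< in: ordinal number of the
+					field among the nullable fields,
+					starting from 0 */
+{
+	return((nulls[-(long) (nullable_no / 8)]
+		>> (nullable_no % 8)) & 1);
+}
+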
+/******************************************************//**
+Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT. This is a special case of
+rec_init_offsets() and rec_get_offsets_func(). */
+UNIV_INTERN
+void
+rec_init_offsets_comp_ordinary(
+/*===========================*/
+ const rec_t* rec, /*!< in: physical record in
+ ROW_FORMAT=COMPACT */
+ ulint extra, /*!< in: number of bytes to reserve
+ between the record header and
+ the data payload
+ (usually REC_N_NEW_EXTRA_BYTES) */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+{
+ ulint i = 0;
+ ulint offs = 0;
+ ulint any_ext = 0;
+ const byte* nulls = rec - (extra + 1);
+ const byte* lens = nulls
+ - UT_BITS_IN_BYTES(index->n_nullable);
+ dict_field_t* field;
+ ulint null_mask = 1;
+
+#ifdef UNIV_DEBUG
+	/* We cannot invoke rec_offs_make_valid() here, because extra may
+	differ from REC_N_NEW_EXTRA_BYTES.  Similarly, rec_offs_validate()
+	would fail in that case, because it invokes rec_get_status(). */
+ offsets[2] = (ulint) rec;
+ offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+ /* read the lengths of fields 0..n */
+ do {
+ ulint len;
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype
+ & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = offs | REC_OFFS_SQL_NULL;
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens--;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype
+ == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ len <<= 8;
+ len |= *lens--;
+
+ offs += len & 0x3fff;
+ if (UNIV_UNLIKELY(len
+ & 0x4000)) {
+ ut_ad(dict_index_is_clust
+ (index));
+ any_ext = REC_OFFS_EXTERNAL;
+ len = offs
+ | REC_OFFS_EXTERNAL;
+ } else {
+ len = offs;
+ }
+
+ goto resolved;
+ }
+ }
+
+ len = offs += len;
+ } else {
+ len = offs += field->fixed_len;
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ *rec_offs_base(offsets)
+ = (rec - (lens + 1)) | REC_OFFS_COMPACT | any_ext;
+}
+
+/******************************************************//**
+The following function determines the offsets to each field in the
+record. The offsets are written to a previously allocated array of
+ulint, where rec_offs_n_fields(offsets) has been initialized to the
+number of fields in the record. The rest of the array will be
+initialized by this function. rec_offs_base(offsets)[0] will be set
+to the extra size (if REC_OFFS_COMPACT is set, the record is in the
+new format; if REC_OFFS_EXTERNAL is set, the record contains externally
+stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to
+offsets past the end of fields 0..n_fields, or to the beginning of
+fields 1..n_fields+1. When the high-order bit of the offset at [i+1]
+is set (REC_OFFS_SQL_NULL), the field i is NULL. When the second
+high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the
+field i is being stored externally. */
+static
+void
+rec_init_offsets(
+/*=============*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets)/*!< in/out: array of offsets;
+ in: n=rec_offs_n_fields(offsets) */
+{
+ ulint i = 0;
+ ulint offs;
+
+ rec_offs_make_valid(rec, index, offsets);
+
+ if (dict_table_is_comp(index->table)) {
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ ulint status = rec_get_status(rec);
+ ulint n_node_ptr_field = ULINT_UNDEFINED;
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* the field is 8 bytes long */
+ rec_offs_base(offsets)[0]
+ = REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT;
+ rec_offs_base(offsets)[1] = 8;
+ return;
+ case REC_STATUS_NODE_PTR:
+ n_node_ptr_field
+ = dict_index_get_n_unique_in_tree(index);
+ break;
+ case REC_STATUS_ORDINARY:
+ rec_init_offsets_comp_ordinary(rec,
+ REC_N_NEW_EXTRA_BYTES,
+ index, offsets);
+ return;
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ offs = 0;
+ null_mask = 1;
+
+ /* read the lengths of fields 0..n */
+ do {
+ ulint len;
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ len = offs += 4;
+ goto resolved;
+ }
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype
+ & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = offs | REC_OFFS_SQL_NULL;
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens--;
+ /* If the maximum length of the field
+ is up to 255 bytes, the actual length
+ is always stored in one byte. If the
+ maximum length is more than 255 bytes,
+ the actual length is stored in one
+ byte for 0..127. The length will be
+ encoded in two bytes when it is 128 or
+ more, or when the field is stored
+ externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype
+ == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+
+ len <<= 8;
+ len |= *lens--;
+
+ /* B-tree node pointers
+ must not contain externally
+ stored columns. Thus
+ the "e" flag must be 0. */
+ ut_a(!(len & 0x4000));
+ offs += len & 0x3fff;
+ len = offs;
+
+ goto resolved;
+ }
+ }
+
+ len = offs += len;
+ } else {
+ len = offs += field->fixed_len;
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ *rec_offs_base(offsets)
+ = (rec - (lens + 1)) | REC_OFFS_COMPACT;
+ } else {
+ /* Old-style record: determine extra size and end offsets */
+ offs = REC_N_OLD_EXTRA_BYTES;
+ if (rec_get_1byte_offs_flag(rec)) {
+ offs += rec_offs_n_fields(offsets);
+ *rec_offs_base(offsets) = offs;
+ /* Determine offsets to fields */
+ do {
+ offs = rec_1_get_field_end_info(rec, i);
+ if (offs & REC_1BYTE_SQL_NULL_MASK) {
+ offs &= ~REC_1BYTE_SQL_NULL_MASK;
+ offs |= REC_OFFS_SQL_NULL;
+ }
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < rec_offs_n_fields(offsets));
+ } else {
+ offs += 2 * rec_offs_n_fields(offsets);
+ *rec_offs_base(offsets) = offs;
+ /* Determine offsets to fields */
+ do {
+ offs = rec_2_get_field_end_info(rec, i);
+ if (offs & REC_2BYTE_SQL_NULL_MASK) {
+ offs &= ~REC_2BYTE_SQL_NULL_MASK;
+ offs |= REC_OFFS_SQL_NULL;
+ }
+ if (offs & REC_2BYTE_EXTERN_MASK) {
+ offs &= ~REC_2BYTE_EXTERN_MASK;
+ offs |= REC_OFFS_EXTERNAL;
+ *rec_offs_base(offsets) |= REC_OFFS_EXTERNAL;
+ }
+ rec_offs_base(offsets)[1 + i] = offs;
+ } while (++i < rec_offs_n_fields(offsets));
+ }
+ }
+}
+
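+/* The following is an illustrative sketch of how the end offsets
+written by rec_init_offsets() above translate into a (start offset,
+length) pair for one field.  Here "ends" stands for the part of the
+offsets array that holds the field end offsets, and null_mask/ext_mask
+stand for the REC_OFFS_SQL_NULL and REC_OFFS_EXTERNAL flag bits; the
+name field_start_and_len is hypothetical and the function is not used
+by the record manager. */
+static
+ulint
+field_start_and_len(
+/*================*/
+	const ulint*	ends,		/*!< in: end offsets of fields
+					0..i, flags in the high bits */
+	ulint		i,		/*!< in: field number */
+	ulint		null_mask,	/*!< in: bit flagging SQL NULL */
+	ulint		ext_mask,	/*!< in: bit flagging external
+					storage */
+	ulint*		len)		/*!< out: field length, or
+					UNIV_SQL_NULL if SQL NULL */
+{
+	ulint	start = i ? (ends[i - 1] & ~(null_mask | ext_mask)) : 0;
+	ulint	end = ends[i] & ~(null_mask | ext_mask);
+
+	*len = (ends[i] & null_mask) ? UNIV_SQL_NULL : end - start;
+
+	/* the start offset is relative to the record origin */
+	return(start);
+}
+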
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously returned array.
+@return the new offsets */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint* offsets,/*!< in/out: array consisting of
+ offsets[0] allocated elements,
+ or an array from rec_get_offsets(),
+ or NULL */
+ ulint n_fields,/*!< in: maximum number of
+ initialized fields
+ (ULINT_UNDEFINED if all fields) */
+ mem_heap_t** heap, /*!< in/out: memory heap */
+ const char* file, /*!< in: file name where called */
+ ulint line) /*!< in: line number where called */
+{
+ ulint n;
+ ulint size;
+
+ ut_ad(rec);
+ ut_ad(index);
+ ut_ad(heap);
+
+ if (dict_table_is_comp(index->table)) {
+ switch (UNIV_EXPECT(rec_get_status(rec),
+ REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ n = dict_index_get_n_fields(index);
+ break;
+ case REC_STATUS_NODE_PTR:
+ n = dict_index_get_n_unique_in_tree(index) + 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record */
+ n = 1;
+ break;
+ default:
+ ut_error;
+ return(NULL);
+ }
+ } else {
+ n = rec_get_n_fields_old(rec);
+ }
+
+ if (UNIV_UNLIKELY(n_fields < n)) {
+ n = n_fields;
+ }
+
+ size = n + (1 + REC_OFFS_HEADER_SIZE);
+
+ if (UNIV_UNLIKELY(!offsets)
+ || UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) {
+ if (UNIV_UNLIKELY(!*heap)) {
+ *heap = mem_heap_create_func(size * sizeof(ulint),
+ MEM_HEAP_DYNAMIC,
+ file, line);
+ }
+ offsets = mem_heap_alloc(*heap, size * sizeof(ulint));
+ rec_offs_set_n_alloc(offsets, size);
+ }
+
+ rec_offs_set_n_fields(offsets, n);
+ rec_init_offsets(rec, index, offsets);
+ return(offsets);
+}
+
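+/* The following is an illustrative sketch of the usual calling
+pattern for rec_get_offsets(): a small on-stack array that is replaced
+by a heap allocation only when the record has more fields than
+REC_OFFS_NORMAL_SIZE can hold.  The name print_rec_lens is
+hypothetical and the function is not used by the record manager. */
+static
+void
+print_rec_lens(
+/*===========*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+{
+	mem_heap_t*	heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	ulint		i;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		ulint	len;
+
+		/* we only need the length here */
+		rec_get_nth_field(rec, offsets, i, &len);
+
+		if (len == UNIV_SQL_NULL) {
+			fprintf(stderr, "field %lu: SQL NULL\n", (ulong) i);
+		} else {
+			fprintf(stderr, "field %lu: len %lu\n",
+				(ulong) i, (ulong) len);
+		}
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+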
+/******************************************************//**
+The following function determines the offsets to each field
+in the record. It can reuse a previously allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+ const byte* extra, /*!< in: the extra bytes of a
+ compact record in reverse order,
+ excluding the fixed-size
+ REC_N_NEW_EXTRA_BYTES */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint node_ptr,/*!< in: nonzero=node pointer,
+ 0=leaf node */
+ ulint* offsets)/*!< in/out: array consisting of
+ offsets[0] allocated elements */
+{
+ ulint n;
+ ulint i;
+ ulint offs;
+ ulint any_ext;
+ const byte* nulls;
+ const byte* lens;
+ dict_field_t* field;
+ ulint null_mask;
+ ulint n_node_ptr_field;
+
+ ut_ad(extra);
+ ut_ad(index);
+ ut_ad(offsets);
+ ut_ad(dict_table_is_comp(index->table));
+
+ if (UNIV_UNLIKELY(node_ptr)) {
+ n_node_ptr_field = dict_index_get_n_unique_in_tree(index);
+ n = n_node_ptr_field + 1;
+ } else {
+ n_node_ptr_field = ULINT_UNDEFINED;
+ n = dict_index_get_n_fields(index);
+ }
+
+ ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE));
+ rec_offs_set_n_fields(offsets, n);
+
+ nulls = extra;
+ lens = nulls + UT_BITS_IN_BYTES(index->n_nullable);
+ i = offs = 0;
+ null_mask = 1;
+ any_ext = 0;
+
+ /* read the lengths of fields 0..n */
+ do {
+ ulint len;
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ len = offs += 4;
+ goto resolved;
+ }
+
+ field = dict_index_get_nth_field(index, i);
+ if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls++;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ /* No length is stored for NULL fields.
+ We do not advance offs, and we set
+ the length to zero and enable the
+ SQL NULL flag in offsets[]. */
+ len = offs | REC_OFFS_SQL_NULL;
+ goto resolved;
+ }
+ null_mask <<= 1;
+ }
+
+ if (UNIV_UNLIKELY(!field->fixed_len)) {
+ /* Variable-length field: read the length */
+ const dict_col_t* col
+ = dict_field_get_col(field);
+ len = *lens++;
+ /* If the maximum length of the field is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the field is stored externally. */
+ if (UNIV_UNLIKELY(col->len > 255)
+ || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
+ if (len & 0x80) {
+ /* 1exxxxxxx xxxxxxxx */
+ len <<= 8;
+ len |= *lens++;
+
+ offs += len & 0x3fff;
+ if (UNIV_UNLIKELY(len & 0x4000)) {
+ any_ext = REC_OFFS_EXTERNAL;
+ len = offs | REC_OFFS_EXTERNAL;
+ } else {
+ len = offs;
+ }
+
+ goto resolved;
+ }
+ }
+
+ len = offs += len;
+ } else {
+ len = offs += field->fixed_len;
+ }
+resolved:
+ rec_offs_base(offsets)[i + 1] = len;
+ } while (++i < rec_offs_n_fields(offsets));
+
+ ut_ad(lens >= extra);
+ *rec_offs_base(offsets) = (lens - extra + REC_N_NEW_EXTRA_BYTES)
+ | REC_OFFS_COMPACT | any_ext;
+}
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return offset to the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: record */
+ ulint n, /*!< in: index of the field */
+ ulint* len) /*!< out: length of the field;
+ UNIV_SQL_NULL if SQL null */
+{
+ ulint os;
+ ulint next_os;
+
+ ut_ad(len);
+ ut_a(rec);
+ ut_a(n < rec_get_n_fields_old(rec));
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ os = rec_1_get_field_start_offs(rec, n);
+
+ next_os = rec_1_get_field_end_info(rec, n);
+
+ if (next_os & REC_1BYTE_SQL_NULL_MASK) {
+ *len = UNIV_SQL_NULL;
+
+ return(os);
+ }
+
+ next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ os = rec_2_get_field_start_offs(rec, n);
+
+ next_os = rec_2_get_field_end_info(rec, n);
+
+ if (next_os & REC_2BYTE_SQL_NULL_MASK) {
+ *len = UNIV_SQL_NULL;
+
+ return(os);
+ }
+
+ next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK
+ | REC_2BYTE_EXTERN_MASK);
+ }
+
+ *len = next_os - os;
+
+ ut_ad(*len < UNIV_PAGE_SIZE);
+
+ return(os);
+}
+
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint i;
+ ut_ad(index);
+ ut_ad(fields);
+ ut_ad(n_fields > 0);
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+
+ extra_size = REC_N_NEW_EXTRA_BYTES
+ + UT_BITS_IN_BYTES(index->n_nullable);
+ data_size = 0;
+
+ /* read the lengths of fields 0..n */
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ ulint len;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ len = dfield_get_len(&fields[i]);
+ col = dict_field_get_col(field);
+
+ ut_ad(dict_col_type_assert_equal(col,
+ dfield_get_type(&fields[i])));
+
+ if (dfield_is_null(&fields[i])) {
+ /* No length is stored for NULL fields. */
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ }
+
+ ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
+
+ if (field->fixed_len) {
+ ut_ad(len == field->fixed_len);
+ /* dict_index_add_col() should guarantee this */
+ ut_ad(!field->prefix_len
+ || field->fixed_len == field->prefix_len);
+ } else if (dfield_is_ext(&fields[i])) {
+ ut_ad(col->len >= 256 || col->mtype == DATA_BLOB);
+ extra_size += 2;
+ } else if (len < 128
+ || (col->len < 256 && col->mtype != DATA_BLOB)) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+ if (UNIV_LIKELY_NULL(extra)) {
+ *extra = extra_size;
+ }
+
+ return(extra_size + data_size);
+}
+
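+/* The following is an illustrative sketch of the number of length
+bytes that the size computation above charges for one variable-length,
+non-NULL column (fixed-length columns are charged none).  The name
+n_len_bytes is hypothetical and the function is not used by the record
+manager. */
+static
+ulint
+n_len_bytes(
+/*========*/
+	ulint	len,	/*!< in: actual stored length, in bytes */
+	ulint	max_len,/*!< in: maximum column length, in bytes */
+	ibool	is_blob,/*!< in: TRUE for a DATA_BLOB column */
+	ibool	is_ext)	/*!< in: TRUE if stored externally */
+{
+	if (is_ext) {
+		/* externally stored columns always use the
+		two-byte encoding */
+		return(2);
+	}
+
+	if (len < 128 || (max_len < 256 && !is_blob)) {
+		return(1);
+	}
+
+	return(2);
+}
+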
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+ const dict_index_t* index, /*!< in: record descriptor;
+ dict_table_is_comp() is
+ assumed to hold, even if
+ it does not */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields,/*!< in: number of data fields */
+ ulint* extra) /*!< out: extra size */
+{
+ ulint size;
+ ut_ad(index);
+ ut_ad(fields);
+ ut_ad(n_fields > 0);
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields == dict_index_get_n_fields(index));
+ size = 0;
+ break;
+ case REC_STATUS_NODE_PTR:
+ n_fields--;
+ ut_ad(n_fields == dict_index_get_n_unique_in_tree(index));
+ ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE);
+ size = REC_NODE_PTR_SIZE; /* child page number */
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record, 8 data bytes */
+ if (UNIV_LIKELY_NULL(extra)) {
+ *extra = REC_N_NEW_EXTRA_BYTES;
+ }
+ return(REC_N_NEW_EXTRA_BYTES + 8);
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ }
+
+ return(size + rec_get_converted_size_comp_prefix(index, fields,
+ n_fields, extra));
+}
+
+/***********************************************************//**
+Sets the value of the ith field SQL null bit of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint i, /*!< in: ith field */
+ ibool val) /*!< in: value to set */
+{
+ ulint info;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+
+ info = rec_1_get_field_end_info(rec, i);
+
+ if (val) {
+ info = info | REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ info = info & ~REC_1BYTE_SQL_NULL_MASK;
+ }
+
+ rec_1_set_field_end_info(rec, i, info);
+
+ return;
+ }
+
+ info = rec_2_get_field_end_info(rec, i);
+
+ if (val) {
+ info = info | REC_2BYTE_SQL_NULL_MASK;
+ } else {
+ info = info & ~REC_2BYTE_SQL_NULL_MASK;
+ }
+
+ rec_2_set_field_end_info(rec, i, info);
+}
+
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+ rec_t* rec, /*!< in: record */
+ ulint n) /*!< in: index of the field */
+{
+ ulint offset;
+
+ offset = rec_get_field_start_offs(rec, n);
+
+ data_write_sql_null(rec + offset, rec_get_nth_field_size(rec, n));
+
+ rec_set_nth_field_null_bit(rec, n, TRUE);
+}
+
+/*********************************************************//**
+Builds an old-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_old(
+/*==========================*/
+ byte* buf, /*!< in: start address of the physical record */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of externally stored columns */
+{
+ const dfield_t* field;
+ ulint n_fields;
+ ulint data_size;
+ rec_t* rec;
+ ulint end_offset;
+ ulint ored_offset;
+ ulint len;
+ ulint i;
+
+ ut_ad(buf && dtuple);
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(dtuple_check_typed(dtuple));
+
+ n_fields = dtuple_get_n_fields(dtuple);
+ data_size = dtuple_get_data_size(dtuple, 0);
+
+ ut_ad(n_fields > 0);
+
+ /* Calculate the offset of the origin in the physical record */
+
+ rec = buf + rec_get_converted_extra_size(data_size, n_fields, n_ext);
+#ifdef UNIV_DEBUG
+ /* Suppress Valgrind warnings of ut_ad()
+ in mach_write_to_1(), mach_write_to_2() et al. */
+ memset(buf, 0xff, rec - buf + data_size);
+#endif /* UNIV_DEBUG */
+ /* Store the number of fields */
+ rec_set_n_fields_old(rec, n_fields);
+
+ /* Set the info bits of the record */
+ rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple)
+ & REC_INFO_BITS_MASK);
+
+ /* Store the data and the offsets */
+
+ end_offset = 0;
+
+ if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+ rec_set_1byte_offs_flag(rec, TRUE);
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(dtuple, i);
+
+ if (dfield_is_null(field)) {
+ len = dtype_get_sql_null_size(
+ dfield_get_type(field), 0);
+ data_write_sql_null(rec + end_offset, len);
+
+ end_offset += len;
+ ored_offset = end_offset
+ | REC_1BYTE_SQL_NULL_MASK;
+ } else {
+ /* If the data is not SQL null, store it */
+ len = dfield_get_len(field);
+
+ memcpy(rec + end_offset,
+ dfield_get_data(field), len);
+
+ end_offset += len;
+ ored_offset = end_offset;
+ }
+
+ rec_1_set_field_end_info(rec, i, ored_offset);
+ }
+ } else {
+ rec_set_1byte_offs_flag(rec, FALSE);
+
+ for (i = 0; i < n_fields; i++) {
+
+ field = dtuple_get_nth_field(dtuple, i);
+
+ if (dfield_is_null(field)) {
+ len = dtype_get_sql_null_size(
+ dfield_get_type(field), 0);
+ data_write_sql_null(rec + end_offset, len);
+
+ end_offset += len;
+ ored_offset = end_offset
+ | REC_2BYTE_SQL_NULL_MASK;
+ } else {
+ /* If the data is not SQL null, store it */
+ len = dfield_get_len(field);
+
+ memcpy(rec + end_offset,
+ dfield_get_data(field), len);
+
+ end_offset += len;
+ ored_offset = end_offset;
+
+ if (dfield_is_ext(field)) {
+ ored_offset |= REC_2BYTE_EXTERN_MASK;
+ }
+ }
+
+ rec_2_set_field_end_info(rec, i, ored_offset);
+ }
+ }
+
+ return(rec);
+}
+
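+/* The following is an illustrative sketch of the rule used by
+rec_convert_dtuple_to_rec_old() above for choosing between the
+one-byte and the two-byte offset format.  The name uses_1byte_offsets
+is hypothetical and the function is not used by the record manager. */
+static
+ibool
+uses_1byte_offsets(
+/*===============*/
+	ulint	data_size,	/*!< in: total data size of the tuple */
+	ulint	n_ext)		/*!< in: number of externally stored
+				columns */
+{
+	/* externally stored columns require the two-byte format,
+	because only it has room for the extern storage bit */
+	return(!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT);
+}
+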
+/*********************************************************//**
+Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_rec_comp(
+/*===========================*/
+ rec_t* rec, /*!< in: origin of record */
+ ulint extra, /*!< in: number of bytes to
+ reserve between the record
+ header and the data payload
+ (normally REC_N_NEW_EXTRA_BYTES) */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint status, /*!< in: status bits of the record */
+ const dfield_t* fields, /*!< in: array of data fields */
+ ulint n_fields)/*!< in: number of data fields */
+{
+ const dfield_t* field;
+ const dtype_t* type;
+ byte* end;
+ byte* nulls;
+ byte* lens;
+ ulint len;
+ ulint i;
+ ulint n_node_ptr_field;
+ ulint fixed_len;
+ ulint null_mask = 1;
+ ut_ad(extra == 0 || dict_table_is_comp(index->table));
+ ut_ad(extra == 0 || extra == REC_N_NEW_EXTRA_BYTES);
+ ut_ad(n_fields > 0);
+
+ switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ n_node_ptr_field = ULINT_UNDEFINED;
+ break;
+ case REC_STATUS_NODE_PTR:
+ ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1);
+ n_node_ptr_field = n_fields - 1;
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ ut_ad(n_fields == 1);
+ n_node_ptr_field = ULINT_UNDEFINED;
+ break;
+ default:
+ ut_error;
+ return;
+ }
+
+ end = rec;
+ nulls = rec - (extra + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ /* clear the SQL-null flags */
+ memset(lens + 1, 0, nulls - lens);
+
+ /* Store the data and the offsets */
+
+ for (i = 0, field = fields; i < n_fields; i++, field++) {
+ const dict_field_t* ifield;
+
+ type = dfield_get_type(field);
+ len = dfield_get_len(field);
+
+ if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+ ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL);
+ ut_ad(len == 4);
+ memcpy(end, dfield_get_data(field), len);
+ end += 4;
+ break;
+ }
+
+ if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) {
+ /* nullable field */
+ ut_ad(index->n_nullable > 0);
+
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ ut_ad(*nulls < null_mask);
+
+ /* set the null flag if necessary */
+ if (dfield_is_null(field)) {
+ *nulls |= null_mask;
+ null_mask <<= 1;
+ continue;
+ }
+
+ null_mask <<= 1;
+ }
+ /* only nullable fields can be null */
+ ut_ad(!dfield_is_null(field));
+
+ ifield = dict_index_get_nth_field(index, i);
+ fixed_len = ifield->fixed_len;
+ /* If the maximum length of a variable-length field
+ is up to 255 bytes, the actual length is always stored
+ in one byte. If the maximum length is more than 255
+ bytes, the actual length is stored in one byte for
+ 0..127. The length will be encoded in two bytes when
+ it is 128 or more, or when the field is stored externally. */
+ if (fixed_len) {
+ ut_ad(len == fixed_len);
+ ut_ad(!dfield_is_ext(field));
+ } else if (dfield_is_ext(field)) {
+ ut_ad(ifield->col->len >= 256
+ || ifield->col->mtype == DATA_BLOB);
+ ut_ad(len <= REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ *lens-- = (byte) (len >> 8) | 0xc0;
+ *lens-- = (byte) len;
+ } else {
+ ut_ad(len <= dtype_get_len(type)
+ || dtype_get_mtype(type) == DATA_BLOB);
+ if (len < 128
+ || (dtype_get_len(type) < 256
+ && dtype_get_mtype(type) != DATA_BLOB)) {
+
+ *lens-- = (byte) len;
+ } else {
+ ut_ad(len < 16384);
+ *lens-- = (byte) (len >> 8) | 0x80;
+ *lens-- = (byte) len;
+ }
+ }
+
+ memcpy(end, dfield_get_data(field), len);
+ end += len;
+ }
+}
+
+/*********************************************************//**
+Builds a new-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_new(
+/*==========================*/
+ byte* buf, /*!< in: start address of
+ the physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple) /*!< in: data tuple */
+{
+ ulint extra_size;
+ ulint status;
+ rec_t* rec;
+
+ status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK;
+ rec_get_converted_size_comp(index, status,
+ dtuple->fields, dtuple->n_fields,
+ &extra_size);
+ rec = buf + extra_size;
+
+ rec_convert_dtuple_to_rec_comp(
+ rec, REC_N_NEW_EXTRA_BYTES, index, status,
+ dtuple->fields, dtuple->n_fields);
+
+ /* Set the info bits of the record */
+ rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple));
+
+ return(rec);
+}
+
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+ byte* buf, /*!< in: start address of the
+ physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ const dtuple_t* dtuple, /*!< in: data tuple */
+ ulint n_ext) /*!< in: number of
+ externally stored columns */
+{
+ rec_t* rec;
+
+ ut_ad(buf && index && dtuple);
+ ut_ad(dtuple_validate(dtuple));
+ ut_ad(dtuple_check_typed(dtuple));
+
+ if (dict_table_is_comp(index->table)) {
+ rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple);
+ } else {
+ rec = rec_convert_dtuple_to_rec_old(buf, dtuple, n_ext);
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ ulint i;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index,
+ offsets_, ULINT_UNDEFINED, &heap);
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(dtuple_get_n_fields(dtuple)
+ == rec_offs_n_fields(offsets));
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ ut_ad(!dfield_is_ext(dtuple_get_nth_field(dtuple, i))
+ == !rec_offs_nth_extern(offsets, i));
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+#endif /* UNIV_DEBUG */
+ return(rec);
+}
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a data tuple. The fields
+are copied to the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+ dtuple_t* tuple, /*!< out: data tuple */
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint i;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap);
+
+ ut_ad(rec_validate(rec, offsets));
+ ut_ad(dtuple_check_typed(tuple));
+
+ dtuple_set_info_bits(tuple, rec_get_info_bits(
+ rec, dict_table_is_comp(index->table)));
+
+ for (i = 0; i < n_fields; i++) {
+ dfield_t* field;
+ const byte* data;
+ ulint len;
+
+ field = dtuple_get_nth_field(tuple, i);
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (len != UNIV_SQL_NULL) {
+ dfield_set_data(field,
+ mem_heap_dup(heap, data, len), len);
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ } else {
+ dfield_set_null(field);
+ }
+ }
+}
+
+/**************************************************************//**
+Copies the first n fields of an old-style physical record
+to a new physical record in a buffer.
+@return own: copied record */
+static
+rec_t*
+rec_copy_prefix_to_buf_old(
+/*=======================*/
+ const rec_t* rec, /*!< in: physical record */
+ ulint n_fields, /*!< in: number of fields to copy */
+ ulint area_end, /*!< in: end of the prefix data */
+ byte** buf, /*!< in/out: memory buffer for
+ the copied prefix, or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+{
+ rec_t* copy_rec;
+ ulint area_start;
+ ulint prefix_len;
+
+ if (rec_get_1byte_offs_flag(rec)) {
+ area_start = REC_N_OLD_EXTRA_BYTES + n_fields;
+ } else {
+ area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields;
+ }
+
+ prefix_len = area_start + area_end;
+
+ if ((*buf == NULL) || (*buf_size < prefix_len)) {
+ if (*buf != NULL) {
+ mem_free(*buf);
+ }
+
+ *buf = mem_alloc2(prefix_len, buf_size);
+ }
+
+ ut_memcpy(*buf, rec - area_start, prefix_len);
+
+ copy_rec = *buf + area_start;
+
+ rec_set_n_fields_old(copy_rec, n_fields);
+
+ return(copy_rec);
+}
+
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+ const rec_t* rec, /*!< in: physical record */
+ const dict_index_t* index, /*!< in: record descriptor */
+ ulint n_fields, /*!< in: number of fields
+ to copy */
+ byte** buf, /*!< in/out: memory buffer
+ for the copied prefix,
+ or NULL */
+ ulint* buf_size) /*!< in/out: buffer size */
+{
+ const byte* nulls;
+ const byte* lens;
+ ulint i;
+ ulint prefix_len;
+ ulint null_mask;
+ ulint status;
+
+ UNIV_PREFETCH_RW(*buf);
+
+ if (!dict_table_is_comp(index->table)) {
+ ut_ad(rec_validate_old(rec));
+ return(rec_copy_prefix_to_buf_old(
+ rec, n_fields,
+ rec_get_field_start_offs(rec, n_fields),
+ buf, buf_size));
+ }
+
+ status = rec_get_status(rec);
+
+ switch (status) {
+ case REC_STATUS_ORDINARY:
+ ut_ad(n_fields <= dict_index_get_n_fields(index));
+ break;
+ case REC_STATUS_NODE_PTR:
+ /* it doesn't make sense to copy the child page number field */
+ ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index));
+ break;
+ case REC_STATUS_INFIMUM:
+ case REC_STATUS_SUPREMUM:
+ /* infimum or supremum record: no sense to copy anything */
+ default:
+ ut_error;
+ return(NULL);
+ }
+
+ nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+ lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+ UNIV_PREFETCH_R(lens);
+ prefix_len = 0;
+ null_mask = 1;
+
+ /* read the lengths of fields 0..n */
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+
+ if (!(col->prtype & DATA_NOT_NULL)) {
+ /* nullable field => read the null flag */
+ if (UNIV_UNLIKELY(!(byte) null_mask)) {
+ nulls--;
+ null_mask = 1;
+ }
+
+ if (*nulls & null_mask) {
+ null_mask <<= 1;
+ continue;
+ }
+
+ null_mask <<= 1;
+ }
+
+ if (field->fixed_len) {
+ prefix_len += field->fixed_len;
+ } else {
+ ulint len = *lens--;
+ /* If the maximum length of the column is up
+ to 255 bytes, the actual length is always
+ stored in one byte. If the maximum length is
+ more than 255 bytes, the actual length is
+ stored in one byte for 0..127. The length
+ will be encoded in two bytes when it is 128 or
+ more, or when the column is stored externally. */
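+ /* Illustrative example (not part of the original
+ comment): for a column whose maximum length exceeds
+ 255 bytes and whose actual length is 200 bytes, the
+ two length bytes read here are 0x80 (high bit set,
+ upper length bits zero) followed by 0xc8, giving
+ len = ((0x80 & 0x3f) << 8) | 0xc8 = 200. */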
+ if (col->len > 255 || col->mtype == DATA_BLOB) {
+ if (len & 0x80) {
+ /* 1exxxxxx */
+ len &= 0x3f;
+ len <<= 8;
+ len |= *lens--;
+ UNIV_PREFETCH_R(lens);
+ }
+ }
+ prefix_len += len;
+ }
+ }
+
+ UNIV_PREFETCH_R(rec + prefix_len);
+
+ prefix_len += rec - (lens + 1);
+
+ if ((*buf == NULL) || (*buf_size < prefix_len)) {
+ if (*buf != NULL) {
+ mem_free(*buf);
+ }
+
+ *buf = mem_alloc2(prefix_len, buf_size);
+ }
+
+ memcpy(*buf, lens + 1, prefix_len);
+
+ return(*buf + (rec - (lens + 1)));
+}
+
+/***************************************************************//**
+Validates the consistency of an old-style physical record.
+@return TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+ const rec_t* rec) /*!< in: physical record */
+{
+ const byte* data;
+ ulint len;
+ ulint n_fields;
+ ulint len_sum = 0;
+ ulint sum = 0;
+ ulint i;
+
+ ut_a(rec);
+ n_fields = rec_get_n_fields_old(rec);
+
+ if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+ fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
+ (ulong) n_fields);
+ return(FALSE);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
+ fprintf(stderr,
+ "InnoDB: Error: record field %lu len %lu\n",
+ (ulong) i,
+ (ulong) len);
+ return(FALSE);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ len_sum += len;
+ sum += *(data + len -1); /* dereference the
+ end of the field to
+ cause a memory trap
+ if possible */
+ } else {
+ len_sum += rec_get_nth_field_size(rec, i);
+ }
+ }
+
+ if (len_sum != rec_get_data_size_old(rec)) {
+ fprintf(stderr,
+ "InnoDB: Error: record len should be %lu, len %lu\n",
+ (ulong) len_sum,
+ (ulong) rec_get_data_size_old(rec));
+ return(FALSE);
+ }
+
+ rec_dummy = sum; /* This is here only to fool the compiler */
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ const byte* data;
+ ulint len;
+ ulint n_fields;
+ ulint len_sum = 0;
+ ulint sum = 0;
+ ulint i;
+
+ ut_a(rec);
+ n_fields = rec_offs_n_fields(offsets);
+
+ if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+ fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
+ (ulong) n_fields);
+ return(FALSE);
+ }
+
+ ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec));
+
+ for (i = 0; i < n_fields; i++) {
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
+ fprintf(stderr,
+ "InnoDB: Error: record field %lu len %lu\n",
+ (ulong) i,
+ (ulong) len);
+ return(FALSE);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ len_sum += len;
+ sum += *(data + len -1); /* dereference the
+ end of the field to
+ cause a memory trap
+ if possible */
+ } else if (!rec_offs_comp(offsets)) {
+ len_sum += rec_get_nth_field_size(rec, i);
+ }
+ }
+
+ if (len_sum != rec_offs_data_size(offsets)) {
+ fprintf(stderr,
+ "InnoDB: Error: record len should be %lu, len %lu\n",
+ (ulong) len_sum,
+ (ulong) rec_offs_data_size(offsets));
+ return(FALSE);
+ }
+
+ rec_dummy = sum; /* This is here only to fool the compiler */
+
+ if (!rec_offs_comp(offsets)) {
+ ut_a(rec_validate_old(rec));
+ }
+
+ return(TRUE);
+}
+
+/***************************************************************//**
+Prints an old-style physical record. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec) /*!< in: physical record */
+{
+ const byte* data;
+ ulint len;
+ ulint n;
+ ulint i;
+
+ ut_ad(rec);
+
+ n = rec_get_n_fields_old(rec);
+
+ fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+ " %u-byte offsets; info bits %lu\n",
+ (ulong) n,
+ rec_get_1byte_offs_flag(rec) ? 1 : 2,
+ (ulong) rec_get_info_bits(rec, FALSE));
+
+ for (i = 0; i < n; i++) {
+
+ data = rec_get_nth_field_old(rec, i, &len);
+
+ fprintf(file, " %lu:", (ulong) i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total %lu bytes)",
+ (ulong) len);
+ }
+ } else {
+ fprintf(file, " SQL NULL, size %lu ",
+ rec_get_nth_field_size(rec, i));
+ }
+
+ putc(';', file);
+ putc('\n', file);
+ }
+
+ rec_validate_old(rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT. Ignores the
+record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ulint i;
+
+ for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+ const byte* data;
+ ulint len;
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ fprintf(file, " %lu:", (ulong) i);
+
+ if (len != UNIV_SQL_NULL) {
+ if (len <= 30) {
+
+ ut_print_buf(file, data, len);
+ } else {
+ ut_print_buf(file, data, 30);
+
+ fprintf(file, " (total %lu bytes)",
+ (ulong) len);
+ }
+ } else {
+ fputs(" SQL NULL", file);
+ }
+ putc(';', file);
+ putc('\n', file);
+ }
+}
+
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ const ulint* offsets)/*!< in: array returned by rec_get_offsets() */
+{
+ ut_ad(rec);
+ ut_ad(offsets);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (!rec_offs_comp(offsets)) {
+ rec_print_old(file, rec);
+ return;
+ }
+
+ fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+ " compact format; info bits %lu\n",
+ (ulong) rec_offs_n_fields(offsets),
+ (ulong) rec_get_info_bits(rec, TRUE));
+
+ rec_print_comp(file, rec, offsets);
+ rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+ FILE* file, /*!< in: file where to print */
+ const rec_t* rec, /*!< in: physical record */
+ dict_index_t* index) /*!< in: record descriptor */
+{
+ ut_ad(index);
+
+ if (!dict_table_is_comp(index->table)) {
+ rec_print_old(file, rec);
+ return;
+ } else {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ rec_print_new(file, rec,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/row/row0ext.c b/storage/xtradb/row/row0ext.c
new file mode 100644
index 00000000000..7320f5b1dca
--- /dev/null
+++ b/storage/xtradb/row/row0ext.c
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.c
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+
+#ifdef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#include "btr0cur.h"
+
+/********************************************************************//**
+Fills the column prefix cache of an externally stored column. */
+static
+void
+row_ext_cache_fill(
+/*===============*/
+ row_ext_t* ext, /*!< in/out: column prefix cache */
+ ulint i, /*!< in: index of ext->ext[] */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ const dfield_t* dfield) /*!< in: data field */
+{
+ const byte* field = dfield_get_data(dfield);
+ ulint f_len = dfield_get_len(dfield);
+ byte* buf = ext->buf + i * REC_MAX_INDEX_COL_LEN;
+
+ ut_ad(i < ext->n_ext);
+ ut_ad(dfield_is_ext(dfield));
+ ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+ if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+ field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE))) {
+ /* The BLOB pointer is not set: we cannot fetch it */
+ ext->len[i] = 0;
+ } else {
+ /* Fetch at most REC_MAX_INDEX_COL_LEN of the column.
+ The column should be non-empty. However,
+ trx_rollback_or_clean_all_recovered() may try to
+ access a half-deleted BLOB if the server previously
+ crashed during the execution of
+ btr_free_externally_stored_field(). */
+ ext->len[i] = btr_copy_externally_stored_field_prefix(
+ buf, REC_MAX_INDEX_COL_LEN, zip_size, field, f_len);
+ }
+}
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+ ulint n_ext, /*!< in: number of externally stored columns */
+ const ulint* ext, /*!< in: col_no's of externally stored columns
+ in the InnoDB table object, as reported by
+ dict_col_get_no(); NOT relative to the records
+ in the clustered index */
+ const dtuple_t* tuple, /*!< in: data tuple containing the field
+ references of the externally stored
+ columns; must be indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch
+ to prevent deletion (rollback or purge). */
+ ulint zip_size,/*!< compressed page size in bytes, or 0 */
+ mem_heap_t* heap) /*!< in: heap where created */
+{
+ ulint i;
+ row_ext_t* ret = mem_heap_alloc(heap, (sizeof *ret)
+ + (n_ext - 1) * sizeof ret->len);
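+ /* Note (assumption based on the declaration in row0ext.h): len[]
+ is a one-element array, so sizeof *ret already covers one length
+ slot and only (n_ext - 1) additional slots are allocated above. */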
+
+ ut_ad(ut_is_2pow(zip_size));
+ ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+ ret->n_ext = n_ext;
+ ret->ext = ext;
+ ret->buf = mem_heap_alloc(heap, n_ext * REC_MAX_INDEX_COL_LEN);
+#ifdef UNIV_DEBUG
+ memset(ret->buf, 0xaa, n_ext * REC_MAX_INDEX_COL_LEN);
+ UNIV_MEM_ALLOC(ret->buf, n_ext * REC_MAX_INDEX_COL_LEN);
+#endif /* UNIV_DEBUG */
+
+ /* Fetch the BLOB prefixes */
+ for (i = 0; i < n_ext; i++) {
+ const dfield_t* dfield;
+
+ dfield = dtuple_get_nth_field(tuple, ext[i]);
+ row_ext_cache_fill(ret, i, zip_size, dfield);
+ }
+
+ return(ret);
+}
diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.c
new file mode 100644
index 00000000000..d4925e46f97
--- /dev/null
+++ b/storage/xtradb/row/row0ins.c
@@ -0,0 +1,2533 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.c
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+#include "buf0lru.h"
+
+#define ROW_INS_PREV 1
+#define ROW_INS_NEXT 2
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before performing that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no code path is
+introduced where a call to log_free_check() is bypassed. */
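+
+/* A minimal sketch of the intended calling pattern (illustrative only,
+not a quote from this module): call log_free_check() while holding no
+latches, and only then start the mini-transaction that generates redo:
+
+ log_free_check();
+ mtr_start(&mtr);
+ ... operation that generates redo ...
+ mtr_commit(&mtr);
+*/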
+
+/*********************************************************************//**
+Creates an insert node struct.
+@return own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+ ulint ins_type, /*!< in: INS_VALUES, ... */
+ dict_table_t* table, /*!< in: table where to insert */
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ ins_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+ node->common.type = QUE_NODE_INSERT;
+
+ node->ins_type = ins_type;
+
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->table = table;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->select = NULL;
+
+ node->trx_id = ut_dulint_zero;
+
+ node->entry_sys_heap = mem_heap_create(128);
+
+ node->magic_n = INS_NODE_MAGIC_N;
+
+ return(node);
+}
+
+/***********************************************************//**
+Creates an entry template for each index of a table. */
+UNIV_INTERN
+void
+ins_node_create_entry_list(
+/*=======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ dict_index_t* index;
+ dtuple_t* entry;
+
+ ut_ad(node->entry_sys_heap);
+
+ UT_LIST_INIT(node->entry_list);
+
+ index = dict_table_get_first_index(node->table);
+
+ while (index != NULL) {
+ entry = row_build_index_entry(node->row, NULL, index,
+ node->entry_sys_heap);
+ UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+
+ index = dict_table_get_next_index(index);
+ }
+}
+
+/*****************************************************************//**
+Adds system field buffers to a row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+ ins_node_t* node) /*!< in: insert node */
+{
+ dtuple_t* row;
+ dict_table_t* table;
+ mem_heap_t* heap;
+ const dict_col_t* col;
+ dfield_t* dfield;
+ byte* ptr;
+
+ row = node->row;
+ table = node->table;
+ heap = node->entry_sys_heap;
+
+ ut_ad(row && table && heap);
+ ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+ /* 1. Allocate buffer for row id */
+
+ col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+ ptr = mem_heap_zalloc(heap, DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+ node->row_id_buf = ptr;
+
+ /* 2. Allocate buffer for trx id */
+
+ col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_zalloc(heap, DATA_TRX_ID_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+ node->trx_id_buf = ptr;
+
+ /* 3. Allocate buffer for roll ptr */
+
+ col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+ dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+ ptr = mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN);
+
+ dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+ ins_node_t* node, /*!< in: insert node */
+ dtuple_t* row) /*!< in: new row (or first row) for the node */
+{
+ node->state = INS_NODE_SET_IX_LOCK;
+ node->index = NULL;
+ node->entry = NULL;
+
+ node->row = row;
+
+ mem_heap_empty(node->entry_sys_heap);
+
+ /* Create templates for index entries */
+
+ ins_node_create_entry_list(node);
+
+ /* Allocate buffers for the system fields from entry_sys_heap */
+
+ row_ins_alloc_sys_fields(node);
+
+ /* As we allocated a new trx id buf, the trx id should be written
+ there again: */
+
+ node->trx_id = ut_dulint_zero;
+}
+
+/*******************************************************************//**
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ big_rec_t* dummy_big_rec;
+ mem_heap_t* heap;
+ upd_t* update;
+ rec_t* rec;
+ ulint err;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(!dict_index_is_clust(cursor->index));
+ ut_ad(rec_get_deleted_flag(rec,
+ dict_table_is_comp(cursor->index->table)));
+
+ /* We know that entry and rec are identical in the alphabetical
+ ordering. But in their binary form there may be differences if
+ there are char fields in them. Therefore we have to calculate the
+ difference. */
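+ /* For example (illustrative): under a case-insensitive collation
+ the entry value 'abc' compares equal to a stored value 'ABC', yet
+ the stored bytes differ and must be brought up to date. */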
+
+ heap = mem_heap_create(1024);
+
+ update = row_upd_build_sec_rec_difference_binary(
+ cursor->index, entry, rec, thr_get_trx(thr), heap);
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+ update, 0, thr, mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+
+ goto func_exit;
+ }
+
+ err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+ &heap, &dummy_big_rec, update,
+ 0, thr, mtr);
+ ut_ad(!dummy_big_rec);
+ }
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*******************************************************************//**
+Does an insert operation by delete-unmarking and updating a delete-marked
+existing record in the index. This situation can occur if the delete-marked
+record is kept in the index for consistent reads.
+@return DB_SUCCESS, DB_FAIL, or error code */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether mtr holds just a leaf
+ latch or also a tree latch */
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
+ big_rec_t** big_rec,/*!< out: possible big rec vector of fields
+ which have to be stored externally by the
+ caller */
+ const dtuple_t* entry, /*!< in: index entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; must be committed before
+ latching any further pages */
+{
+ rec_t* rec;
+ upd_t* update;
+ ulint err;
+
+ ut_ad(dict_index_is_clust(cursor->index));
+
+ *big_rec = NULL;
+
+ rec = btr_cur_get_rec(cursor);
+
+ ut_ad(rec_get_deleted_flag(rec,
+ dict_table_is_comp(cursor->index->table)));
+
+ if (!*heap) {
+ *heap = mem_heap_create(1024);
+ }
+
+ /* Build an update vector containing all the fields to be modified;
+ NOTE that this vector may NOT contain system columns trx_id or
+ roll_ptr */
+
+ update = row_upd_build_difference_binary(cursor->index, entry, rec,
+ thr_get_trx(thr), *heap);
+ if (mode == BTR_MODIFY_LEAF) {
+ /* Try optimistic updating of the record, keeping changes
+ within the page */
+
+ err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+ mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+
+ }
+ err = btr_cur_pessimistic_update(0, cursor,
+ heap, big_rec, update,
+ 0, thr, mtr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Returns TRUE if in a cascaded update/delete an ancestor node of node
+updates (not DELETE, but UPDATE) table.
+@return TRUE if an ancestor updates table */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+ que_node_t* node, /*!< in: node in a query graph */
+ dict_table_t* table) /*!< in: table */
+{
+ que_node_t* parent;
+ upd_node_t* upd_node;
+
+ parent = que_node_get_parent(node);
+
+ while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+
+ upd_node = parent;
+
+ if (upd_node->table == table && upd_node->is_delete == FALSE) {
+
+ return(TRUE);
+ }
+
+ parent = que_node_get_parent(parent);
+
+ ut_a(parent);
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node.
+@return number of ancestors */
+static
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+ que_node_t* node) /*!< in: node in a query graph */
+{
+ que_node_t* parent;
+ ulint n_ancestors = 0;
+
+ parent = que_node_get_parent(node);
+
+ while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+ n_ancestors++;
+
+ parent = que_node_get_parent(parent);
+
+ ut_a(parent);
+ }
+
+ return(n_ancestors);
+}
+
+/******************************************************************//**
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update.
+@return number of fields in the calculated update vector; the value
+can also be 0 if no foreign key fields changed; the returned value is
+ULINT_UNDEFINED if the column type in the child table is too short to
+fit the new value from the parent table: that means the update fails */
+static
+ulint
+row_ins_cascade_calc_update_vec(
+/*============================*/
+ upd_node_t* node, /*!< in: update node of the parent
+ table */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ mem_heap_t* heap) /*!< in: memory heap to use as
+ temporary storage */
+{
+ upd_node_t* cascade = node->cascade_node;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index = foreign->foreign_index;
+ upd_t* update;
+ upd_field_t* ufield;
+ dict_table_t* parent_table;
+ dict_index_t* parent_index;
+ upd_t* parent_update;
+ upd_field_t* parent_ufield;
+ ulint n_fields_updated;
+ ulint parent_field_no;
+ ulint i;
+ ulint j;
+
+ ut_a(node);
+ ut_a(foreign);
+ ut_a(cascade);
+ ut_a(table);
+ ut_a(index);
+
+ /* Calculate the appropriate update vector which will set the fields
+ in the child index record to the same value (possibly padded with
+ spaces if the column is a fixed length CHAR or FIXBINARY column) as
+ the referenced index record will get in the update. */
+
+ parent_table = node->table;
+ ut_a(parent_table == foreign->referenced_table);
+ parent_index = foreign->referenced_index;
+ parent_update = node->update;
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ n_fields_updated = 0;
+
+ for (i = 0; i < foreign->n_fields; i++) {
+
+ parent_field_no = dict_table_get_nth_col_pos(
+ parent_table,
+ dict_index_get_nth_col_no(parent_index, i));
+
+ for (j = 0; j < parent_update->n_fields; j++) {
+ parent_ufield = parent_update->fields + j;
+
+ if (parent_ufield->field_no == parent_field_no) {
+
+ ulint min_size;
+ const dict_col_t* col;
+ ulint ufield_len;
+
+ col = dict_index_get_nth_col(index, i);
+
+ /* A field in the parent index record is
+ updated. Let us make the update vector
+ field for the child table. */
+
+ ufield = update->fields + n_fields_updated;
+
+ ufield->field_no
+ = dict_table_get_nth_col_pos(
+ table, dict_col_get_no(col));
+ ufield->exp = NULL;
+
+ ufield->new_val = parent_ufield->new_val;
+ ufield_len = dfield_get_len(&ufield->new_val);
+
+ /* Clear the "external storage" flag */
+ dfield_set_len(&ufield->new_val, ufield_len);
+
+ /* Do not allow a NOT NULL column to be
+ updated as NULL */
+
+ if (dfield_is_null(&ufield->new_val)
+ && (col->prtype & DATA_NOT_NULL)) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the new value would not fit in the
+ column, do not allow the update */
+
+ if (!dfield_is_null(&ufield->new_val)
+ && dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ col->len,
+ ufield_len,
+ dfield_get_data(&ufield->new_val))
+ < ufield_len) {
+
+ return(ULINT_UNDEFINED);
+ }
+
+ /* If the parent column type has a different
+ length than the child column type, we may
+ need to pad with spaces the new value of the
+ child column */
+
+ min_size = dict_col_get_min_size(col);
+
+ /* Because UNIV_SQL_NULL (the marker
+ of SQL NULL values) exceeds all possible
+ values of min_size, the test below will
+ not hold for SQL NULL columns. */
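+ /* Illustrative example: if the child column is CHAR(10) in a
+ single-byte character set (min_size = 10) and the cascaded
+ value is 'abc' (ufield_len = 3), the value is copied into a
+ 10-byte buffer and the remaining 7 bytes are set to 0x20. */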
+
+ if (min_size > ufield_len) {
+
+ char* pad_start;
+ const char* pad_end;
+ char* padded_data
+ = mem_heap_alloc(
+ heap, min_size);
+ pad_start = padded_data + ufield_len;
+ pad_end = padded_data + min_size;
+
+ memcpy(padded_data,
+ dfield_get_data(&ufield
+ ->new_val),
+ dfield_get_len(&ufield
+ ->new_val));
+
+ switch (UNIV_EXPECT(col->mbminlen,1)) {
+ default:
+ ut_error;
+ return(ULINT_UNDEFINED);
+ case 1:
+ if (UNIV_UNLIKELY
+ (dtype_get_charset_coll(
+ col->prtype)
+ == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+ /* Do not pad BINARY
+ columns. */
+ return(ULINT_UNDEFINED);
+ }
+
+ /* space=0x20 */
+ memset(pad_start, 0x20,
+ pad_end - pad_start);
+ break;
+ case 2:
+ /* space=0x0020 */
+ ut_a(!(ufield_len % 2));
+ ut_a(!(min_size % 2));
+ do {
+ *pad_start++ = 0x00;
+ *pad_start++ = 0x20;
+ } while (pad_start < pad_end);
+ break;
+ }
+
+ dfield_set_data(&ufield->new_val,
+ padded_data, min_size);
+ }
+
+ n_fields_updated++;
+ }
+ }
+ }
+
+ update->n_fields = n_fields_updated;
+
+ return(n_fields_updated);
+}
+
+/*********************************************************************//**
+Set detailed error message associated with foreign key errors for
+the given transaction. */
+static
+void
+row_ins_set_detailed(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign) /*!< in: foreign key constraint */
+{
+ mutex_enter(&srv_misc_tmpfile_mutex);
+ rewind(srv_misc_tmpfile);
+
+ if (os_file_set_eof(srv_misc_tmpfile)) {
+ ut_print_name(srv_misc_tmpfile, trx, TRUE,
+ foreign->foreign_table_name);
+ dict_print_info_on_foreign_key_in_create_format(
+ srv_misc_tmpfile, trx, foreign, FALSE);
+ trx_set_detailed_error_from_file(trx, srv_misc_tmpfile);
+ } else {
+ trx_set_detailed_error(trx, "temp file operation failed");
+ }
+
+ mutex_exit(&srv_misc_tmpfile_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+ const char* errstr, /*!< in: error string from the viewpoint
+ of the parent table */
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a matching index record in the
+ child table */
+ const dtuple_t* entry) /*!< in: index entry in the parent
+ table */
+{
+ FILE* ef = dict_foreign_err_file;
+ trx_t* trx = thr_get_trx(thr);
+
+ row_ins_set_detailed(trx, foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx, 600);
+
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+ TRUE);
+ putc('\n', ef);
+ fputs(errstr, ef);
+ fputs(" in parent table, in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in child table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+ if (rec) {
+ fputs(", there is a record:\n", ef);
+ rec_print(ef, rec, foreign->foreign_index);
+ } else {
+ fputs(", the record is not available\n", ef);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint */
+ const rec_t* rec, /*!< in: a record in the parent table:
+ it does not match entry because we
+ have an error! */
+ const dtuple_t* entry) /*!< in: index entry to insert in the
+ child table */
+{
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx, 600);
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+ TRUE);
+ fputs("\nTrying to add in child table, in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+ if (entry) {
+ fputs(" tuple:\n", ef);
+ /* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized.
+ It would be better to only display the user columns. */
+ dtuple_print(ef, entry);
+ }
+ fputs("\nBut in parent table ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->referenced_table_name);
+ fputs(", in index ", ef);
+ ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+ fputs(",\nthe closest match we can find is record:\n", ef);
+ if (rec && page_rec_is_supremum(rec)) {
+ /* If the cursor ended on a supremum record, it is better
+ to report the previous record in the error message, so that
+ the user gets a more descriptive error message. */
+ rec = page_rec_get_prev_const(rec);
+ }
+
+ if (rec) {
+ rec_print(ef, rec, foreign->referenced_index);
+ }
+ putc('\n', ef);
+
+ mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Invalidate the query cache for the given table. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ const char* name) /*!< in: table name prefixed with
+ database name and a '/' character */
+{
+ char* buf;
+ char* ptr;
+ ulint len = strlen(name) + 1;
+
+ buf = mem_strdupl(name, len);
+
+ ptr = strchr(buf, '/');
+ ut_a(ptr);
+ *ptr = '\0';
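+ /* For example, a (hypothetical) name "test/child" leaves the
+ database name "test" in buf. */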
+
+ innobase_invalidate_query_cache(thr_get_trx(thr), buf, len);
+ mem_free(buf);
+}
+
+/*********************************************************************//**
+Performs referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT.
+@return DB_SUCCESS, DB_LOCK_WAIT, or error code */
+static
+ulint
+row_ins_foreign_check_on_constraint(
+/*================================*/
+ que_thr_t* thr, /*!< in: query thread whose run_node
+ is an update node */
+ dict_foreign_t* foreign, /*!< in: foreign key constraint whose
+ type is != 0 */
+ btr_pcur_t* pcur, /*!< in: cursor placed on a matching
+ index record in the child table */
+ dtuple_t* entry, /*!< in: index entry in the parent
+ table */
+ mtr_t* mtr) /*!< in: mtr holding the latch of pcur
+ page */
+{
+ upd_node_t* node;
+ upd_node_t* cascade;
+ dict_table_t* table = foreign->foreign_table;
+ dict_index_t* index;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ mem_heap_t* upd_vec_heap = NULL;
+ const rec_t* rec;
+ const rec_t* clust_rec;
+ const buf_block_t* clust_block;
+ upd_t* update;
+ ulint n_to_update;
+ ulint err;
+ ulint i;
+ trx_t* trx;
+ mem_heap_t* tmp_heap = NULL;
+
+ ut_a(thr);
+ ut_a(foreign);
+ ut_a(pcur);
+ ut_a(mtr);
+
+ trx = thr_get_trx(thr);
+
+ /* Since we are going to delete or update a row, we have to invalidate
+ the MySQL query cache for the table. A deadlock of threads is not possible
+ here because the caller of this function does not hold any latches with
+ the sync0sync.h rank above the kernel mutex. The query cache mutex has
+ a rank just above the kernel mutex. */
+
+ row_ins_invalidate_query_cache(thr, table->name);
+
+ node = thr->run_node;
+
+ if (node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_DELETE_CASCADE
+ | DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+ row_ins_foreign_report_err("Trying to delete",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (!node->is_delete && 0 == (foreign->type
+ & (DICT_FOREIGN_ON_UPDATE_CASCADE
+ | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* This is an UPDATE */
+
+ row_ins_foreign_report_err("Trying to update",
+ thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ return(DB_ROW_IS_REFERENCED);
+ }
+
+ if (node->cascade_node == NULL) {
+ /* Extend our query graph by creating a child to current
+ update node. The child is used in the cascade or set null
+ operation. */
+
+ node->cascade_heap = mem_heap_create(128);
+ node->cascade_node = row_create_update_node_for_mysql(
+ table, node->cascade_heap);
+ que_node_set_parent(node->cascade_node, node);
+ }
+
+ /* Initialize cascade_node to do the operation we want. Note that we
+ use the SAME cascade node to do all foreign key operations of the
+ SQL DELETE: the table of the cascade node may change if there are
+ several child tables to the table where the delete is done! */
+
+ cascade = node->cascade_node;
+
+ cascade->table = table;
+
+ cascade->foreign = foreign;
+
+ if (node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+ cascade->is_delete = TRUE;
+ } else {
+ cascade->is_delete = FALSE;
+
+ if (foreign->n_fields > cascade->update_n_fields) {
+ /* We have to make the update vector longer */
+
+ cascade->update = upd_create(foreign->n_fields,
+ node->cascade_heap);
+ cascade->update_n_fields = foreign->n_fields;
+ }
+ }
+
+ /* We do not allow cyclic cascaded updating (DELETE is allowed,
+ but not UPDATE) of the same table, as this can lead to an infinite
+ cycle. Check that we are not updating the same table which is
+ already being modified in this cascade chain. We have to check
+ this also because the modification of the indexes of a 'parent'
+ table may still be incomplete, and we must avoid seeing the indexes
+ of the parent table in an inconsistent state! */
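+ /* Illustrative example (hypothetical tables): if t1 references t2
+ ON UPDATE CASCADE and t2 references t1 ON UPDATE CASCADE, an
+ UPDATE of t1 would cascade to t2 and then back to t1; the check
+ below rejects such an update. */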
+
+ if (!cascade->is_delete
+ && row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+ /* We do not know if this would break foreign key
+ constraints, but play safe and return an error */
+
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying an update, possibly causing a cyclic"
+ " cascaded update\n"
+ "in the child table,", thr, foreign,
+ btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (row_ins_cascade_n_ancestors(cascade) >= 15) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a too deep cascaded delete or update\n",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ ut_a(index == foreign->foreign_index);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (dict_index_is_clust(index)) {
+ /* pcur is already positioned in the clustered index of
+ the child table */
+
+ clust_index = index;
+ clust_rec = rec;
+ clust_block = btr_pcur_get_block(pcur);
+ } else {
+ /* We have to look for the record in the clustered index
+ in the child table */
+
+ clust_index = dict_table_get_first_index(table);
+
+ tmp_heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+ tmp_heap);
+ btr_pcur_open_with_no_init(clust_index, ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ cascade->pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(cascade->pcur);
+ clust_block = btr_pcur_get_block(cascade->pcur);
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(cascade->pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ fputs("InnoDB: error in cascade of a foreign key op\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ fputs("\n"
+ "InnoDB: clustered record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com\n", stderr);
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Set an X-lock on the row to delete or update in the child table */
+
+ err = lock_table(0, table, LOCK_IX, thr);
+
+ if (err == DB_SUCCESS) {
+ /* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+ we already have a normal shared lock on the appropriate
+ gap if the search criterion was not unique */
+
+ err = lock_clust_rec_read_check_and_lock_alt(
+ 0, clust_block, clust_rec, clust_index,
+ LOCK_X, LOCK_REC_NOT_GAP, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto nonstandard_exit_func;
+ }
+
+ if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) {
+ /* This can happen if there is a circular reference of
+ rows such that a cascading delete comes to delete a row
+ that is already in the process of being delete-marked */
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+
+ if ((node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL))
+ || (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+ /* Build the appropriate update vector which sets
+ foreign->n_fields first fields in rec to SQL NULL */
+
+ update = cascade->update;
+
+ update->info_bits = 0;
+ update->n_fields = foreign->n_fields;
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ upd_field_t* ufield = &update->fields[i];
+
+ ufield->field_no = dict_table_get_nth_col_pos(
+ table,
+ dict_index_get_nth_col_no(index, i));
+ ufield->orig_len = 0;
+ ufield->exp = NULL;
+ dfield_set_null(&ufield->new_val);
+ }
+ }
+
+ if (!node->is_delete
+ && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+ /* Build the appropriate update vector which sets changing
+ foreign->n_fields first fields in rec to new values */
+
+ upd_vec_heap = mem_heap_create(256);
+
+ n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
+ upd_vec_heap);
+ if (n_to_update == ULINT_UNDEFINED) {
+ err = DB_ROW_IS_REFERENCED;
+
+ row_ins_foreign_report_err(
+ "Trying a cascaded update where the"
+ " updated value in the child\n"
+ "table would not fit in the length"
+ " of the column, or the value would\n"
+ "be NULL and the column is"
+ " declared as not NULL in the child table,",
+ thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+ goto nonstandard_exit_func;
+ }
+
+ if (cascade->update->n_fields == 0) {
+
+ /* The update does not change any columns referred
+ to in this foreign key constraint: no need to do
+ anything */
+
+ err = DB_SUCCESS;
+
+ goto nonstandard_exit_func;
+ }
+ }
+
+ /* Store pcur position and initialize or store the cascade node
+ pcur stored position */
+
+ btr_pcur_store_position(pcur, mtr);
+
+ if (index == clust_index) {
+ btr_pcur_copy_stored_position(cascade->pcur, pcur);
+ } else {
+ btr_pcur_store_position(cascade->pcur, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+ cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ err = row_update_cascade_for_mysql(thr, cascade,
+ foreign->foreign_table);
+
+ if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
+ fprintf(stderr,
+ "InnoDB: error: table %s has the counter 0"
+ " though there is\n"
+ "InnoDB: a FOREIGN KEY check running on it.\n",
+ foreign->foreign_table->name);
+ }
+
+ /* Release the data dictionary latch for a while, so that we do not
+ starve other threads from doing CREATE TABLE etc. if we have a huge
+ cascaded operation running. The counter n_foreign_key_checks_running
+ will prevent other users from dropping or ALTERing the table when we
+ release the latch. */
+
+ row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+ row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+ mtr_start(mtr);
+
+ /* Restore pcur position */
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ return(err);
+
+nonstandard_exit_func:
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ if (upd_vec_heap) {
+ mem_heap_free(upd_vec_heap);
+ }
+
+ btr_pcur_store_position(pcur, mtr);
+
+ mtr_commit(mtr);
+ mtr_start(mtr);
+
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a shared lock on a record. Used in locking possible duplicate key
+records and also in checking foreign key constraints.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+enum db_err
+row_ins_set_shared_rec_lock(
+/*========================*/
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ enum db_err err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_S, type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a record. Used in locking possible duplicate key
+records.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+enum db_err
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+ LOCK_REC_NOT_GAP type lock */
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ enum db_err err;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, LOCK_X, type, thr);
+ }
+
+ return(err);
+}
+
+/***************************************************************//**
+Checks if a foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock.
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, or DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+ ibool check_ref,/*!< in: TRUE if we want to check that
+ the referenced table is ok, FALSE if we
+ want to check the foreign key table */
+ dict_foreign_t* foreign,/*!< in: foreign constraint; NOTE that the
+ tables mentioned in it must be in the
+ dictionary cache if they exist at all */
+ dict_table_t* table, /*!< in: if check_ref is TRUE, then the foreign
+ table, else the referenced table */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* upd_node;
+ dict_table_t* check_table;
+ dict_index_t* check_index;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ int cmp;
+ ulint err;
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ err = DB_SUCCESS;
+
+ if (trx->check_foreigns == FALSE) {
+ /* The user has suppressed foreign key checks currently for
+ this session */
+ goto exit_func;
+ }
+
+ /* If any of the foreign key fields in entry is SQL NULL, we
+ suppress the foreign key check: this is compatible with Oracle,
+ for example */
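+ /* Illustrative example: for a (hypothetical) constraint
+ FOREIGN KEY (a, b) REFERENCES parent(a, b), a child row with
+ a = NULL is accepted without looking for a matching parent row. */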
+
+ for (i = 0; i < foreign->n_fields; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ goto exit_func;
+ }
+ }
+
+ if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+ upd_node = thr->run_node;
+
+ if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+ /* If a cascaded update is done as defined by a
+ foreign key constraint, do not check that
+ constraint for the child row. In ON UPDATE CASCADE
+ the update of the parent row is only half done when
+ we come here: if we would check the constraint here
+ for the child row it would fail.
+
+ A QUESTION remains: if the child table has several
+ constraints which refer to the same parent table,
+ should we merge all updates to the child into one
+ update? The individual updates could even be
+ contradictory. Currently we just perform the update
+ associated with each foreign key constraint, one
+ after another, and the user cannot predict in which
+ order they are performed. */
+
+ goto exit_func;
+ }
+ }
+
+ if (check_ref) {
+ check_table = foreign->referenced_table;
+ check_index = foreign->referenced_index;
+ } else {
+ check_table = foreign->foreign_table;
+ check_index = foreign->foreign_index;
+ }
+
+ if (check_table == NULL || check_table->ibd_file_missing) {
+ if (check_ref) {
+ FILE* ef = dict_foreign_err_file;
+
+ row_ins_set_detailed(trx, foreign);
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+ fputs(" Transaction:\n", ef);
+ trx_print(ef, trx, 600);
+ fputs("Foreign key constraint fails for table ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->foreign_table_name);
+ fputs(":\n", ef);
+ dict_print_info_on_foreign_key_in_create_format(
+ ef, trx, foreign, TRUE);
+ fputs("\nTrying to add to index ", ef);
+ ut_print_name(ef, trx, FALSE,
+ foreign->foreign_index->name);
+ fputs(" tuple:\n", ef);
+ dtuple_print(ef, entry);
+ fputs("\nBut the parent table ", ef);
+ ut_print_name(ef, trx, TRUE,
+ foreign->referenced_table_name);
+ fputs("\nor its .ibd file does"
+ " not currently exist!\n", ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_NO_REFERENCED_ROW;
+ }
+
+ goto exit_func;
+ }
+
+ ut_a(check_table);
+ ut_a(check_index);
+
+ if (check_table != table) {
+ /* We already have a LOCK_IX on table, but not necessarily
+ on check_table */
+
+ err = lock_table(0, check_table, LOCK_IS, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto do_possible_lock_wait;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store old value on n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+
+ btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ /* Scan index records and check if there is a matching record */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
+ if (srv_pass_corrupt_table && !block) {
+ err = DB_CORRUPTION;
+ break;
+ }
+ ut_a(block);
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, check_index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ if (page_rec_is_supremum(rec)) {
+
+ err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block,
+ rec, check_index,
+ offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ continue;
+ default:
+ goto end_scan;
+ }
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (rec_get_deleted_flag(rec,
+ rec_offs_comp(offsets))) {
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, check_index, offsets, thr);
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+ } else {
+ /* Found a matching record. Lock only
+ a record because we can allow inserts
+ into gaps */
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (check_ref) {
+ err = DB_SUCCESS;
+
+ goto end_scan;
+ } else if (foreign->type != 0) {
+ /* There is an ON UPDATE or ON DELETE
+ condition: check them in a separate
+ function */
+
+ err = row_ins_foreign_check_on_constraint(
+ thr, foreign, &pcur, entry,
+ &mtr);
+ if (err != DB_SUCCESS) {
+ /* Since reporting a plain
+ "duplicate key" error
+ message to the user in
+ cases where a long CASCADE
+ operation would lead to a
+ duplicate key in some
+ other table is very
+ confusing, map duplicate
+ key errors resulting from
+ FK constraints to a
+ separate error code. */
+
+ if (err == DB_DUPLICATE_KEY) {
+ err = DB_FOREIGN_DUPLICATE_KEY;
+ }
+
+ goto end_scan;
+ }
+
+ /* row_ins_foreign_check_on_constraint
+ may have repositioned pcur on a
+ different block */
+ block = btr_pcur_get_block(&pcur);
+ } else {
+ row_ins_foreign_report_err(
+ "Trying to delete or update",
+ thr, foreign, rec, entry);
+
+ err = DB_ROW_IS_REFERENCED;
+ goto end_scan;
+ }
+ }
+ } else {
+ ut_a(cmp < 0);
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_GAP, block,
+ rec, check_index, offsets, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ if (check_ref) {
+ err = DB_NO_REFERENCED_ROW;
+ row_ins_foreign_report_add_err(
+ trx, foreign, rec, entry);
+ } else {
+ err = DB_SUCCESS;
+ }
+ }
+
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, &mtr));
+
+ if (check_ref) {
+ row_ins_foreign_report_add_err(
+ trx, foreign, btr_pcur_get_rec(&pcur), entry);
+ err = DB_NO_REFERENCED_ROW;
+ } else {
+ err = DB_SUCCESS;
+ }
+
+end_scan:
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+ if (err == DB_LOCK_WAIT) {
+ trx->error_state = err;
+
+ que_thr_stop_for_mysql(thr);
+
+ srv_suspend_mysql_thread(thr);
+
+ if (trx->error_state == DB_SUCCESS) {
+
+ goto run_again;
+ }
+
+ err = trx->error_state;
+ }
+
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Checks if foreign key constraints fail for an index entry. If the index
+is not mentioned in any constraint, this function does nothing.
+Otherwise it searches the indexes of the referenced tables and
+sets shared locks which lock either the success or the failure of
+a constraint.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_ins_check_foreign_constraints(
+/*==============================*/
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry for index */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_foreign_t* foreign;
+ ulint err;
+ trx_t* trx;
+ ibool got_s_lock = FALSE;
+
+ trx = thr_get_trx(thr);
+
+ foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+ while (foreign) {
+ if (foreign->foreign_index == index) {
+
+ if (foreign->referenced_table == NULL) {
+ dict_table_get(foreign->referenced_table_name,
+ FALSE);
+ }
+
+ if (0 == trx->dict_operation_lock_mode) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (foreign->referenced_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ (foreign->referenced_table
+ ->n_foreign_key_checks_running)++;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects the referenced
+ table from being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ TRUE, foreign, table, entry, thr);
+
+ if (foreign->referenced_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ ut_a(foreign->referenced_table
+ ->n_foreign_key_checks_running > 0);
+ (foreign->referenced_table
+ ->n_foreign_key_checks_running)--;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Checks if inserting the index entry would cause a unique key violation
+with respect to rec.
+@return TRUE if error */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+ const rec_t* rec, /*!< in: user record; NOTE that we assume
+ that the caller already has a record lock on
+ the record! */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint matched_fields;
+ ulint matched_bytes;
+ ulint n_unique;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ n_unique = dict_index_get_n_unique(index);
+
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp_dtuple_rec_with_match(entry, rec, offsets,
+ &matched_fields, &matched_bytes);
+
+ if (matched_fields < n_unique) {
+
+ return(FALSE);
+ }
+
+ /* In a unique secondary index we allow equal key values if they
+ contain SQL NULLs */
+
+ if (!dict_index_is_clust(index)) {
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+}
+
+/***************************************************************//**
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Sets shared locks on possible duplicate records.
+@return DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+ dict_index_t* index, /*!< in: non-clustered unique index */
+ dtuple_t* entry, /*!< in: index entry */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint n_unique;
+ ulint i;
+ int cmp;
+ ulint n_fields_cmp;
+ btr_pcur_t pcur;
+ ulint err = DB_SUCCESS;
+ unsigned allow_duplicates;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ n_unique = dict_index_get_n_unique(index);
+
+ /* If the secondary index is unique, but one of the fields in the
+ n_unique first fields is NULL, a unique key violation cannot occur,
+ since we define NULL != NULL in this case */
+
+ for (i = 0; i < n_unique; i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(entry, i))) {
+
+ return(DB_SUCCESS);
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /* Store the old value of n_fields_cmp */
+
+ n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+ dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+ allow_duplicates = thr_get_trx(thr)->duplicates & TRX_DUP_IGNORE;
+
+ /* Scan index records and check if there is a duplicate */
+
+ do {
+ const rec_t* rec = btr_pcur_get_rec(&pcur);
+ const buf_block_t* block = btr_pcur_get_block(&pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ continue;
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (allow_duplicates) {
+
+ /* If the SQL query will update or replace
+ a duplicate key we will take an X-lock on the
+ duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_ORDINARY, block,
+ rec, index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ /* fall through */
+ case DB_SUCCESS:
+ break;
+ default:
+ goto end_scan;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ continue;
+ }
+
+ cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+ if (cmp == 0) {
+ if (row_ins_dupl_error_with_rec(rec, entry,
+ index, offsets)) {
+ err = DB_DUPLICATE_KEY;
+
+ thr_get_trx(thr)->error_info = index;
+
+ goto end_scan;
+ }
+ } else {
+ ut_a(cmp < 0);
+ goto end_scan;
+ }
+ } while (btr_pcur_move_to_next(&pcur, &mtr));
+
+end_scan:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ mtr_commit(&mtr);
+
+ /* Restore old value */
+ dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+ return(err);
+}
+
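+/* An illustrative note (a sketch of the lock modes selected above, using a
+hypothetical table t with a unique secondary index on column a): a plain
+INSERT INTO t scans for duplicates under shared next-key locks
+(LOCK_ORDINARY, S), while REPLACE INTO t and INSERT ... ON DUPLICATE KEY
+UPDATE, for which TRX_DUP_IGNORE is set, take exclusive next-key locks on
+the scanned records instead. */
+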
+/***************************************************************//**
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index!
+@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error,
+DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record */
+static
+ulint
+row_ins_duplicate_error_in_clust(
+/*=============================*/
+ btr_cur_t* cursor, /*!< in: B-tree cursor */
+ dtuple_t* entry, /*!< in: entry to insert */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+ rec_t* rec;
+ ulint n_unique;
+ trx_t* trx = thr_get_trx(thr);
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ UT_NOT_USED(mtr);
+
+ ut_a(dict_index_is_clust(cursor->index));
+ ut_ad(dict_index_is_unique(cursor->index));
+
+ /* NOTE: For unique non-clustered indexes there may be any number
+ of delete marked records with the same value for the non-clustered
+ index key (remember multiversioning), and which differ only in
+ the row reference part of the index record, containing the
+ clustered index key fields. For such a secondary index record,
+ to avoid a race condition, we must FIRST do the insertion and after
+ that check that the uniqueness condition is not breached! */
+
+ /* NOTE: A problem is that in the B-tree node pointers on an
+ upper level may match more to the entry than the actual existing
+ user records on the leaf level. So, even if low_match would suggest
+ that a duplicate key violation may occur, this may not be the case. */
+
+ n_unique = dict_index_get_n_unique(cursor->index);
+
+ if (cursor->low_match >= n_unique) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* We set a lock on the possible duplicate: this
+ is needed in logical logging of MySQL to make
+ sure that in roll-forward we get the same duplicate
+ errors as in original execution */
+
+ if (trx->duplicates & TRX_DUP_IGNORE) {
+
+ /* If the SQL query will update or replace
+ a duplicate key we will take an X-lock on the
+ duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor), rec,
+ cursor->index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+ }
+
+ if (cursor->up_match >= n_unique) {
+
+ rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+ if (!page_rec_is_supremum(rec)) {
+ offsets = rec_get_offsets(rec, cursor->index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (trx->duplicates & TRX_DUP_IGNORE) {
+
+ /* If the SQL query will update or replace
+ a duplicate key we will take an X-lock on the
+ duplicates (REPLACE, LOAD DATA INFILE REPLACE,
+ INSERT ON DUPLICATE KEY UPDATE). */
+
+ err = row_ins_set_exclusive_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ } else {
+
+ err = row_ins_set_shared_rec_lock(
+ LOCK_REC_NOT_GAP,
+ btr_cur_get_block(cursor),
+ rec, cursor->index, offsets, thr);
+ }
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto func_exit;
+ }
+
+ if (row_ins_dupl_error_with_rec(
+ rec, entry, cursor->index, offsets)) {
+ trx->error_info = cursor->index;
+ err = DB_DUPLICATE_KEY;
+ goto func_exit;
+ }
+ }
+
+ ut_a(!dict_index_is_clust(cursor->index));
+ /* This should never happen */
+ }
+
+ err = DB_SUCCESS;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Checks if an index entry has long enough common prefix with an existing
+record so that the intended insert of the entry must be changed to a modify of
+the existing record. In the case of a clustered index, the prefix must be
+n_unique fields long, and in the case of a secondary index, all fields must be
+equal.
+@return 0 if no update, ROW_INS_PREV if previous should be updated;
+currently we do the search so that only the low_match record can match
+enough to the search tuple, not the next record */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+ btr_cur_t* cursor) /*!< in: B-tree cursor */
+{
+ ulint enough_match;
+ rec_t* rec;
+
+ /* NOTE: (compare to the note in row_ins_duplicate_error) Because node
+ pointers on upper levels of the B-tree may match more to entry than
+ to actual user records on the leaf level, we have to check if the
+ candidate record is actually a user record. In a clustered index
+ node pointers contain index->n_unique first fields, and in the case
+ of a secondary index, all fields of the index. */
+
+ enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+
+ if (cursor->low_match >= enough_match) {
+
+ rec = btr_cur_get_rec(cursor);
+
+ if (!page_rec_is_infimum(rec)) {
+
+ return(ROW_INS_PREV);
+ }
+ }
+
+ return(0);
+}
+
+/***************************************************************//**
+Tries to insert an index entry to an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily marked deleted by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to an
+existing record, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed,
+or error code */
+static
+ulint
+row_ins_index_entry_low(
+/*====================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_cur_t cursor;
+ ulint ignore_sec_unique = 0;
+ ulint modify = 0; /* remove warning */
+ rec_t* insert_rec;
+ rec_t* rec;
+ ulint err;
+ ulint n_unique;
+ big_rec_t* big_rec = NULL;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+
+ log_free_check();
+
+ mtr_start(&mtr);
+
+ cursor.thr = thr;
+
+ /* Note that we use PAGE_CUR_LE as the search mode, because then
+ the function will return sensible values in both low_match and
+ up_match of the cursor */
+
+ if (!(thr_get_trx(thr)->check_unique_secondary)) {
+ ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE;
+ }
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ mode | BTR_INSERT | ignore_sec_unique,
+ &cursor, 0, __FILE__, __LINE__, &mtr);
+
+ if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+ /* The insertion was made to the insert buffer already during
+ the search: we are done */
+
+ err = DB_SUCCESS;
+
+ goto function_exit;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ page_t* page = btr_cur_get_page(&cursor);
+ rec_t* first_rec = page_rec_get_next(
+ page_get_infimum_rec(page));
+
+ ut_ad(page_rec_is_supremum(first_rec)
+ || rec_get_n_fields(first_rec, index)
+ == dtuple_get_n_fields(entry));
+ }
+#endif /* UNIV_DEBUG */
+
+ n_unique = dict_index_get_n_unique(index);
+
+ if (dict_index_is_unique(index) && (cursor.up_match >= n_unique
+ || cursor.low_match >= n_unique)) {
+
+ if (dict_index_is_clust(index)) {
+ /* Note that the following may also return
+ DB_LOCK_WAIT */
+
+ err = row_ins_duplicate_error_in_clust(
+ &cursor, entry, thr, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ } else {
+ mtr_commit(&mtr);
+ err = row_ins_scan_sec_index_for_duplicate(
+ index, entry, thr);
+ mtr_start(&mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ /* We did not find a duplicate and we have now
+ locked with s-locks the necessary records to
+ prevent any insertion of a duplicate by another
+ transaction. Let us now reposition the cursor and
+ continue the insertion. */
+
+ btr_cur_search_to_nth_level(index, 0, entry,
+ PAGE_CUR_LE,
+ mode | BTR_INSERT,
+ &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ }
+ }
+
+ modify = row_ins_must_modify(&cursor);
+
+ if (modify != 0) {
+ /* There is already an index entry with a long enough common
+ prefix: we must convert the insert into a modify of an
+ existing record */
+
+ if (modify == ROW_INS_NEXT) {
+ rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+ btr_cur_position(index, rec,
+ btr_cur_get_block(&cursor),&cursor);
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = row_ins_clust_index_entry_by_modify(
+ mode, &cursor, &heap, &big_rec, entry,
+ thr, &mtr);
+ } else {
+ ut_ad(!n_ext);
+ err = row_ins_sec_index_entry_by_modify(
+ mode, &cursor, entry, thr, &mtr);
+ }
+ } else {
+ if (mode == BTR_MODIFY_LEAF) {
+ err = btr_cur_optimistic_insert(
+ 0, &cursor, entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ if (buf_LRU_buf_pool_running_out()) {
+
+ err = DB_LOCK_TABLE_FULL;
+
+ goto function_exit;
+ }
+ err = btr_cur_pessimistic_insert(
+ 0, &cursor, entry, &insert_rec, &big_rec,
+ n_ext, thr, &mtr);
+ }
+ }
+
+function_exit:
+ mtr_commit(&mtr);
+
+ if (UNIV_LIKELY_NULL(big_rec)) {
+ rec_t* rec;
+ ulint* offsets;
+ mtr_start(&mtr);
+
+ btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+ BTR_MODIFY_TREE, &cursor, 0,
+ __FILE__, __LINE__, &mtr);
+ rec = btr_cur_get_rec(&cursor);
+ offsets = rec_get_offsets(rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(&cursor),
+ rec, offsets, big_rec, &mtr);
+
+ if (modify) {
+ dtuple_big_rec_free(big_rec);
+ } else {
+ dtuple_convert_back_big_rec(index, entry, big_rec);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/***************************************************************//**
+Inserts an index entry into an index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+ulint
+row_ins_index_entry(
+/*================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to insert */
+ ulint n_ext, /*!< in: number of externally stored columns */
+ ibool foreign,/*!< in: TRUE=check foreign key constraints */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) {
+ err = row_ins_check_foreign_constraints(index->table, index,
+ entry, thr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
+ n_ext, thr);
+ if (err != DB_FAIL) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+
+ err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
+ n_ext, thr);
+ return(err);
+}
+
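+/* Note on the retry above (a sketch of the contract, inferred from the
+return codes of row_ins_index_entry_low()): DB_FAIL means that the
+optimistic BTR_MODIFY_LEAF descent could not complete the insert,
+typically because the entry would not fit on the leaf page without a
+split, so the operation is repeated with BTR_MODIFY_TREE, which is
+allowed to reorganize the tree. */
+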
+/***********************************************************//**
+Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row. */
+static
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry to make */
+ const dtuple_t* row) /*!< in: row */
+{
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(entry && row);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ for (i = 0; i < n_fields; i++) {
+ dict_field_t* ind_field;
+ dfield_t* field;
+ const dfield_t* row_field;
+ ulint len;
+
+ field = dtuple_get_nth_field(entry, i);
+ ind_field = dict_index_get_nth_field(index, i);
+ row_field = dtuple_get_nth_field(row, ind_field->col->ind);
+ len = dfield_get_len(row_field);
+
+ /* Check column prefix indexes */
+ if (ind_field->prefix_len > 0
+ && dfield_get_len(row_field) != UNIV_SQL_NULL) {
+
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len,
+ len, dfield_get_data(row_field));
+
+ ut_ad(!dfield_is_ext(row_field));
+ }
+
+ dfield_set_data(field, dfield_get_data(row_field), len);
+ if (dfield_is_ext(row_field)) {
+ ut_ad(dict_index_is_clust(index));
+ dfield_set_ext(field);
+ }
+ }
+}
+
+/***********************************************************//**
+Inserts a single index entry to the table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(dtuple_check_typed(node->row));
+
+ row_ins_index_entry_set_vals(node->index, node->entry, node->row);
+
+ ut_ad(dtuple_check_typed(node->entry));
+
+ err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Allocates a row id for row and inits the node->index field. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ dulint row_id;
+
+ ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+ if (dict_index_is_unique(dict_table_get_first_index(node->table))) {
+
+ /* No row id is stored if the clustered index is unique */
+
+ return;
+ }
+
+ /* Fill in row id value to row */
+
+ row_id = dict_sys_get_new_row_id();
+
+ dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***********************************************************//**
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied into the value buffers of the
+ expressions in the values list when they are evaluated, and it is
+ safe to use them until the expressions are evaluated again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->values_list;
+
+ while (list_node) {
+ eval_exp(list_node);
+
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+ ins_node_t* node) /*!< in: row insert node */
+{
+ que_node_t* list_node;
+ dfield_t* dfield;
+ dtuple_t* row;
+ ulint i;
+
+ /* The field values are copied in the buffers of the select node and
+ it is safe to use them until we fetch from select again: therefore
+ we can just copy the pointers */
+
+ row = node->row;
+
+ i = 0;
+ list_node = node->select->select_list;
+
+ while (list_node) {
+ dfield = dtuple_get_nth_field(row, i);
+ dfield_copy_data(dfield, que_node_get_val(list_node));
+
+ i++;
+ list_node = que_node_get_next(list_node);
+ }
+}
+
+/***********************************************************//**
+Inserts a row to a table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_ins(
+/*====*/
+ ins_node_t* node, /*!< in: row insert node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+
+ if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+ row_ins_alloc_row_id_step(node);
+
+ node->index = dict_table_get_first_index(node->table);
+ node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+ if (node->ins_type == INS_SEARCHED) {
+
+ row_ins_get_row_from_select(node);
+
+ } else if (node->ins_type == INS_VALUES) {
+
+ row_ins_get_row_from_values(node);
+ }
+
+ node->state = INS_NODE_INSERT_ENTRIES;
+ }
+
+ ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+ while (node->index != NULL) {
+ err = row_ins_index_entry_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+ }
+
+ ut_ad(node->entry == NULL);
+
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ return(DB_SUCCESS);
+}
+
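+/* A sketch of the insert node state machine, assembled from the state
+assignments in row_ins() above and row_ins_step() below:
+
+ INS_NODE_SET_IX_LOCK IX-lock the table (skipped if this trx
+ already holds it for this node), then
+ INS_NODE_ALLOC_ROW_ID assign a row id if the clustered index is
+ not unique, build the row, then
+ INS_NODE_INSERT_ENTRIES insert one entry per index; after the last
+ index, return to INS_NODE_ALLOC_ROW_ID
+ for the next row. */
+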
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ins_node_t* node;
+ que_node_t* parent;
+ sel_node_t* sel_node;
+ trx_t* trx;
+ ulint err;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started(trx);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+ parent = que_node_get_parent(node);
+ sel_node = node->select;
+
+ if (thr->prev_node == parent) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ }
+
+ /* If this is the first time this node is executed (or when
+ execution resumes after wait for the table IX lock), set an
+ IX lock on the table and reset the possible select node. MySQL's
+ partitioned table code may also call an insert within the same
+ SQL statement AFTER it has used this table handle to do a search.
+ This happens, for example, when a row update moves it to another
+ partition. In that case, we have already set the IX lock on the
+ table during the search operation, and there is no need to set
+ it again here. But we must write trx->id to node->trx_id_buf. */
+
+ trx_write_trx_id(node->trx_id_buf, trx->id);
+
+ if (node->state == INS_NODE_SET_IX_LOCK) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+ /* No need to do IX-locking */
+
+ goto same_trx;
+ }
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+
+ node->trx_id = trx->id;
+same_trx:
+ node->state = INS_NODE_ALLOC_ROW_ID;
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ if ((node->ins_type == INS_SEARCHED)
+ && (sel_node->state != SEL_NODE_FETCH)) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to insert */
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_ins(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* err == DB_LOCK_WAIT or SQL error detected */
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->ins_type == INS_SEARCHED) {
+ /* Fetch a row to insert */
+
+ thr->run_node = sel_node;
+ } else {
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ return(thr);
+}
diff --git a/storage/xtradb/row/row0merge.c b/storage/xtradb/row/row0merge.c
new file mode 100644
index 00000000000..65102851bdf
--- /dev/null
+++ b/storage/xtradb/row/row0merge.c
@@ -0,0 +1,2644 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.c
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "os0file.h"
+#include "lock0lock.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "log0log.h"
+#include "ut0sort.h"
+#include "handler0alter.h"
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout. */
+/* @{ */
+/** Log the outcome of each row_merge_cmp() call, comparing records. */
+static ibool row_merge_print_cmp;
+/** Log each record read from temporary file. */
+static ibool row_merge_print_read;
+/** Log each record write to temporary file. */
+static ibool row_merge_print_write;
+/** Log each row_merge_blocks() call, merging two blocks of records to
+a bigger one. */
+static ibool row_merge_print_block;
+/** Log each block read from temporary file. */
+static ibool row_merge_print_block_read;
+/** Log each block write to temporary file. */
+static ibool row_merge_print_block_write;
+/* @} */
+#endif /* UNIV_DEBUG */
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2. See the comment above
+ut_ad(data_size < sizeof(row_merge_block_t)). */
+typedef byte row_merge_block_t[1048576];
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t. Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte mrec_t;
+
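+/* A sketch of the byte format used inside row_merge_block_t, as produced
+by row_merge_buf_write() and row_merge_write_rec() and parsed by
+row_merge_read_rec() below:
+
+ - the value extra_size + 1, encoded in one byte if it is < 0x80,
+ otherwise in two bytes with the high bit of the first byte set;
+ - extra_size bytes of ROW_FORMAT=COMPACT record header
+ (the REC_N_NEW_EXTRA_BYTES are omitted);
+ - the data bytes of the record.
+
+A single 0 byte terminates the list of records in a block; a record that
+does not fit at the end of a block continues in the next block. */
+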
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_struct {
+ mem_heap_t* heap; /*!< memory heap where allocated */
+ dict_index_t* index; /*!< the index the tuples belong to */
+ ulint total_size; /*!< total amount of data bytes */
+ ulint n_tuples; /*!< number of data tuples */
+ ulint max_tuples; /*!< maximum number of data tuples */
+ const dfield_t**tuples; /*!< array of pointers to
+ arrays of fields that form
+ the data tuples */
+ const dfield_t**tmp_tuples; /*!< temporary copy of tuples,
+ for sorting */
+};
+
+/** Buffer for sorting in main memory. */
+typedef struct row_merge_buf_struct row_merge_buf_t;
+
+/** Information about temporary files used in merge sort */
+struct merge_file_struct {
+ int fd; /*!< file descriptor */
+ ulint offset; /*!< file offset (end of file) */
+ ib_uint64_t n_rec; /*!< number of records in the file */
+};
+
+/** Information about temporary files used in merge sort */
+typedef struct merge_file_struct merge_file_t;
+
+#ifdef UNIV_DEBUG
+/******************************************************//**
+Display a merge tuple. */
+static
+void
+row_merge_tuple_print(
+/*==================*/
+ FILE* f, /*!< in: output stream */
+ const dfield_t* entry, /*!< in: tuple to print */
+ ulint n_fields)/*!< in: number of fields in the tuple */
+{
+ ulint j;
+
+ for (j = 0; j < n_fields; j++) {
+ const dfield_t* field = &entry[j];
+
+ if (dfield_is_null(field)) {
+ fputs("\n NULL;", f);
+ } else {
+ ulint field_len = dfield_get_len(field);
+ ulint len = ut_min(field_len, 20);
+ if (dfield_is_ext(field)) {
+ fputs("\nE", f);
+ } else {
+ fputs("\n ", f);
+ }
+ ut_print_buf(f, dfield_get_data(field), len);
+ if (len != field_len) {
+ fprintf(f, " (total %lu bytes)", field_len);
+ }
+ }
+ }
+ putc('\n', f);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_create_low(
+/*=====================*/
+ mem_heap_t* heap, /*!< in: heap where allocated */
+ dict_index_t* index, /*!< in: secondary index */
+ ulint max_tuples, /*!< in: maximum number of data tuples */
+ ulint buf_size) /*!< in: size of the buffer, in bytes */
+{
+ row_merge_buf_t* buf;
+
+ ut_ad(max_tuples > 0);
+ ut_ad(max_tuples <= sizeof(row_merge_block_t));
+ ut_ad(max_tuples < buf_size);
+
+ buf = mem_heap_zalloc(heap, buf_size);
+ buf->heap = heap;
+ buf->index = index;
+ buf->max_tuples = max_tuples;
+ buf->tuples = mem_heap_alloc(heap,
+ 2 * max_tuples * sizeof *buf->tuples);
+ buf->tmp_tuples = buf->tuples + max_tuples;
+
+ return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return own: sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+ dict_index_t* index) /*!< in: secondary index */
+{
+ row_merge_buf_t* buf;
+ ulint max_tuples;
+ ulint buf_size;
+ mem_heap_t* heap;
+
+ max_tuples = sizeof(row_merge_block_t)
+ / ut_max(1, dict_index_get_min_size(index));
+
+ buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+
+ heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
+
+ buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
+
+ return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer */
+{
+ ulint buf_size;
+ ulint max_tuples = buf->max_tuples;
+ mem_heap_t* heap = buf->heap;
+ dict_index_t* index = buf->index;
+
+ buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+
+ mem_heap_empty(heap);
+
+ return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+static
+void
+row_merge_buf_free(
+/*===============*/
+ row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */
+{
+ mem_heap_free(buf->heap);
+}
+
+/******************************************************//**
+Insert a data tuple into a sort buffer.
+@return TRUE if added, FALSE if out of space */
+static
+ibool
+row_merge_buf_add(
+/*==============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ const dtuple_t* row, /*!< in: row in clustered index */
+ const row_ext_t* ext) /*!< in: cache of externally stored
+ column prefixes, or NULL */
+{
+ ulint i;
+ ulint n_fields;
+ ulint data_size;
+ ulint extra_size;
+ const dict_index_t* index;
+ dfield_t* entry;
+ dfield_t* field;
+
+ if (buf->n_tuples >= buf->max_tuples) {
+ return(FALSE);
+ }
+
+ UNIV_PREFETCH_R(row->fields);
+
+ index = buf->index;
+
+ n_fields = dict_index_get_n_fields(index);
+
+ entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
+ buf->tuples[buf->n_tuples] = entry;
+ field = entry;
+
+ data_size = 0;
+ extra_size = UT_BITS_IN_BYTES(index->n_nullable);
+
+ for (i = 0; i < n_fields; i++, field++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint col_no;
+ const dfield_t* row_field;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(index, i);
+ col = ifield->col;
+ col_no = dict_col_get_no(col);
+ row_field = dtuple_get_nth_field(row, col_no);
+ dfield_copy(field, row_field);
+ len = dfield_get_len(field);
+
+ if (dfield_is_null(field)) {
+ ut_ad(!(col->prtype & DATA_NOT_NULL));
+ continue;
+ } else if (UNIV_LIKELY(!ext)) {
+ } else if (dict_index_is_clust(index)) {
+ /* Flag externally stored fields. */
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ if (i < dict_index_get_n_unique(index)) {
+ dfield_set_data(field, buf, len);
+ } else {
+ dfield_set_ext(field);
+ len = dfield_get_len(field);
+ }
+ }
+ } else {
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ ut_a(buf != field_ref_zero);
+ dfield_set_data(field, buf, len);
+ }
+ }
+
+ /* If a column prefix index, take only the prefix */
+
+ if (ifield->prefix_len) {
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ ifield->prefix_len,
+ len, dfield_get_data(field));
+ dfield_set_len(field, len);
+ }
+
+ ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+ if (ifield->fixed_len) {
+ ut_ad(len == ifield->fixed_len);
+ ut_ad(!dfield_is_ext(field));
+ } else if (dfield_is_ext(field)) {
+ extra_size += 2;
+ } else if (len < 128
+ || (col->len < 256 && col->mtype != DATA_BLOB)) {
+ extra_size++;
+ } else {
+ /* For variable-length columns, we look up the
+ maximum length from the column itself. If this
+ is a prefix index column shorter than 256 bytes,
+ this will waste one byte. */
+ extra_size += 2;
+ }
+ data_size += len;
+ }
+
+#ifdef UNIV_DEBUG
+ {
+ ulint size;
+ ulint extra;
+
+ size = rec_get_converted_size_comp(index,
+ REC_STATUS_ORDINARY,
+ entry, n_fields, &extra);
+
+ ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
+ ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
+ }
+#endif /* UNIV_DEBUG */
+
+ /* Add to the total size of the record in row_merge_block_t
+ the encoded length of extra_size and the extra bytes (extra_size).
+ See row_merge_buf_write() for the variable-length encoding
+ of extra_size. */
+ data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+ /* The following assertion may fail if row_merge_block_t is
+ declared very small and a PRIMARY KEY is being created with
+ many prefix columns. In that case, the record may exceed the
+ page_zip_rec_needs_ext() limit. However, no further columns
+ will be moved to external storage until the record is inserted
+ to the clustered index B-tree. */
+ ut_ad(data_size < sizeof(row_merge_block_t));
+
+ /* Reserve one byte for the end marker of row_merge_block_t. */
+ if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
+ return(FALSE);
+ }
+
+ buf->total_size += data_size;
+ buf->n_tuples++;
+
+ field = entry;
+
+ /* Copy the data fields. */
+
+ do {
+ dfield_dup(field++, buf->heap);
+ } while (--n_fields);
+
+ return(TRUE);
+}
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_struct {
+ const dict_index_t* index; /*!< index being sorted */
+ TABLE* table; /*!< MySQL table object */
+ ulint n_dup; /*!< number of duplicates */
+};
+
+/** Structure for reporting duplicate records. */
+typedef struct row_merge_dup_struct row_merge_dup_t;
+
+/*************************************************************//**
+Report a duplicate key. */
+static
+void
+row_merge_dup_report(
+/*=================*/
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t* entry) /*!< in: duplicate index entry */
+{
+ mrec_buf_t* buf;
+ const dtuple_t* tuple;
+ dtuple_t tuple_store;
+ const rec_t* rec;
+ const dict_index_t* index = dup->index;
+ ulint n_fields= dict_index_get_n_fields(index);
+ mem_heap_t* heap;
+ ulint* offsets;
+ ulint n_ext;
+
+ if (dup->n_dup++) {
+ /* Only report the first duplicate record,
+ but count all duplicate records. */
+ return;
+ }
+
+ /* Convert the tuple to a record and then to MySQL format. */
+ heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+ * sizeof *offsets
+ + sizeof *buf);
+
+ buf = mem_heap_alloc(heap, sizeof *buf);
+
+ tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
+ n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
+
+ rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+ offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+ innobase_rec_to_mysql(dup->table, rec, index, offsets);
+
+ mem_heap_free(heap);
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+static
+int
+row_merge_tuple_cmp(
+/*================*/
+ ulint n_field,/*!< in: number of fields */
+ const dfield_t* a, /*!< in: first tuple to be compared */
+ const dfield_t* b, /*!< in: second tuple to be compared */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
+{
+ int cmp;
+ const dfield_t* field = a;
+
+ /* Compare the fields of the tuples until a difference is
+ found or we run out of fields to compare. If !cmp at the
+ end, the tuples are equal. */
+ do {
+ cmp = cmp_dfield_dfield(a++, b++);
+ } while (!cmp && --n_field);
+
+ if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
+ /* Report a duplicate value error if the tuples are
+ logically equal. NULL columns are logically unequal,
+ although they are equal in the sorting order. Find
+ out if any of the fields are NULL. */
+ for (b = field; b != a; b++) {
+ if (dfield_is_null(b)) {
+
+ goto func_exit;
+ }
+ }
+
+ row_merge_dup_report(dup, field);
+ }
+
+func_exit:
+ return(cmp);
+}
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a array of tuples being sorted
+@param b aux (work area), same size as tuples[]
+@param c lower bound of the sorting area, inclusive
+@param d upper bound of the sorting area, exclusive */
+#define row_merge_tuple_sort_ctx(a,b,c,d) \
+ row_merge_tuple_sort(n_field, dup, a, b, c, d)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().
+@param a first tuple to be compared
+@param b second tuple to be compared
+@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
+#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory. */
+static
+void
+row_merge_tuple_sort(
+/*=================*/
+ ulint n_field,/*!< in: number of fields */
+ row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
+ const dfield_t** tuples, /*!< in/out: tuples */
+ const dfield_t** aux, /*!< in/out: work area */
+ ulint low, /*!< in: lower bound of the
+ sorting area, inclusive */
+ ulint high) /*!< in: upper bound of the
+ sorting area, exclusive */
+{
+ UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+ tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort a buffer. */
+static
+void
+row_merge_buf_sort(
+/*===============*/
+ row_merge_buf_t* buf, /*!< in/out: sort buffer */
+ row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
+{
+ row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
+ buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
+}
+
+/******************************************************//**
+Write a buffer to a block. */
+static
+void
+row_merge_buf_write(
+/*================*/
+ const row_merge_buf_t* buf, /*!< in: sorted buffer */
+#ifdef UNIV_DEBUG
+ const merge_file_t* of, /*!< in: output file */
+#endif /* UNIV_DEBUG */
+ row_merge_block_t* block) /*!< out: buffer for writing to file */
+#ifndef UNIV_DEBUG
+# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
+#endif /* !UNIV_DEBUG */
+{
+ const dict_index_t* index = buf->index;
+ ulint n_fields= dict_index_get_n_fields(index);
+ byte* b = &(*block)[0];
+
+ ulint i;
+
+ for (i = 0; i < buf->n_tuples; i++) {
+ ulint size;
+ ulint extra_size;
+ const dfield_t* entry = buf->tuples[i];
+
+ size = rec_get_converted_size_comp(index,
+ REC_STATUS_ORDINARY,
+ entry, n_fields,
+ &extra_size);
+ ut_ad(size > extra_size);
+ ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
+ extra_size -= REC_N_NEW_EXTRA_BYTES;
+ size -= REC_N_NEW_EXTRA_BYTES;
+
+ /* Encode extra_size + 1 */
+ if (extra_size + 1 < 0x80) {
+ *b++ = (byte) (extra_size + 1);
+ } else {
+ ut_ad((extra_size + 1) < 0x8000);
+ *b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+ *b++ = (byte) (extra_size + 1);
+ }
+
+ ut_ad(b + size < block[1]);
+
+ rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
+ REC_STATUS_ORDINARY,
+ entry, n_fields);
+
+ b += size;
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
+ (void*) b, of->fd, (ulong) of->offset,
+ (ulong) i);
+ row_merge_tuple_print(stderr, entry, n_fields);
+ }
+#endif /* UNIV_DEBUG */
+ }
+
+ /* Write an "end-of-chunk" marker. */
+ ut_a(b < block[1]);
+ ut_a(b == block[0] + buf->total_size);
+ *b++ = 0;
+#ifdef UNIV_DEBUG_VALGRIND
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
+ (void*) b, of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Create a memory heap and allocate space for row_merge_rec_offsets()
+and mrec_buf_t[3].
+@return memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+ const dict_index_t* index, /*!< in: record descriptor */
+ mrec_buf_t** buf, /*!< out: 3 buffers */
+ ulint** offsets1, /*!< out: offsets */
+ ulint** offsets2) /*!< out: offsets */
+{
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
+ + 3 * sizeof **buf);
+
+ *buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+ *offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
+ *offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
+
+ (*offsets1)[0] = (*offsets2)[0] = i;
+ (*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
+
+ return(heap);
+}
+
+/**********************************************************************//**
+Search an index object by name and column names. If several indexes match,
+return the index with the max id.
+@return matching index, NULL if not found */
+static
+dict_index_t*
+row_merge_dict_table_get_index(
+/*===========================*/
+ dict_table_t* table, /*!< in: table */
+ const merge_index_def_t*index_def) /*!< in: index definition */
+{
+ ulint i;
+ dict_index_t* index;
+ const char** column_names;
+
+ column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
+
+ for (i = 0; i < index_def->n_fields; ++i) {
+ column_names[i] = index_def->fields[i].field_name;
+ }
+
+ index = dict_table_get_index_by_max_id(
+ table, index_def->name, column_names, index_def->n_fields);
+
+ mem_free((void*) column_names);
+
+ return(index);
+}
+
+/********************************************************************//**
+Read a merge block from the file system.
+@return TRUE if the request was successful, FALSE on failure */
+static
+ibool
+row_merge_read(
+/*===========*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to read */
+ row_merge_block_t* buf) /*!< out: data */
+{
+ ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
+ ibool success;
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block_read) {
+ fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+ fd, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+ (ulint) (ofs & 0xFFFFFFFF),
+ (ulint) (ofs >> 32),
+ sizeof *buf);
+ if (UNIV_UNLIKELY(!success)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: failed to read merge block at %llu\n", ofs);
+ }
+
+ return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Write a merge block to the file system.
+@return TRUE if the request was successful, FALSE on failure */
+static
+ibool
+row_merge_write(
+/*============*/
+ int fd, /*!< in: file descriptor */
+ ulint offset, /*!< in: offset where to write,
+ in number of row_merge_block_t
+ elements */
+ const void* buf) /*!< in: data */
+{
+ ib_uint64_t ofs = ((ib_uint64_t) offset)
+ * sizeof(row_merge_block_t);
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block_write) {
+ fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
+ fd, (ulong) offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+ (ulint) (ofs & 0xFFFFFFFF),
+ (ulint) (ofs >> 32),
+ sizeof(row_merge_block_t))));
+}
+
+/********************************************************************//**
+Read a merge record.
+@return pointer to next record, or NULL on I/O error or end of list */
+static __attribute__((nonnull))
+const byte*
+row_merge_read_rec(
+/*===============*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ const byte* b, /*!< in: pointer to record */
+ const dict_index_t* index, /*!< in: index of the record */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t** mrec, /*!< out: pointer to merge record,
+ or NULL on end of list
+ (non-NULL on I/O error) */
+ ulint* offsets)/*!< out: offsets of mrec */
+{
+ ulint extra_size;
+ ulint data_size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= block[0]);
+ ut_ad(b < block[1]);
+ ut_ad(index);
+ ut_ad(foffs);
+ ut_ad(mrec);
+ ut_ad(offsets);
+
+ ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index));
+
+ extra_size = *b++;
+
+ if (UNIV_UNLIKELY(!extra_size)) {
+ /* End of list */
+ *mrec = NULL;
+#ifdef UNIV_DEBUG
+ if (row_merge_print_read) {
+ fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
+ (const void*) b, (const void*) block,
+ fd, (ulong) *foffs);
+ }
+#endif /* UNIV_DEBUG */
+ return(NULL);
+ }
+
+ if (extra_size >= 0x80) {
+ /* Read another byte of extra_size. */
+
+ if (UNIV_UNLIKELY(b >= block[1])) {
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+err_exit:
+ /* Signal I/O error. */
+ *mrec = b;
+ return(NULL);
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = block[0];
+ }
+
+ extra_size = (extra_size & 0x7f) << 8;
+ extra_size |= *b++;
+ }
+
+ /* Normalize extra_size. Above, value 0 signals "end of list". */
+ extra_size--;
+
+ /* Read the extra bytes. */
+
+ if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
+ /* The record spans two blocks. Copy the entire record
+ to the auxiliary buffer and handle this as a special
+ case. */
+
+ avail_size = block[1] - b;
+
+ memcpy(*buf, b, avail_size);
+
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = block[0];
+
+ /* Copy the record. */
+ memcpy(*buf + avail_size, b, extra_size - avail_size);
+ b += extra_size - avail_size;
+
+ *mrec = *buf + extra_size;
+
+ rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+
+ /* These overflows should be impossible given that
+ records are much smaller than either buffer, and
+ the record starts near the beginning of each buffer. */
+ ut_a(extra_size + data_size < sizeof *buf);
+ ut_a(b + data_size < block[1]);
+
+ /* Copy the data bytes. */
+ memcpy(*buf + extra_size, b, data_size);
+ b += data_size;
+
+ goto func_exit;
+ }
+
+ *mrec = b + extra_size;
+
+ rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
+
+ data_size = rec_offs_data_size(offsets);
+ ut_ad(extra_size + data_size < sizeof *buf);
+
+ b += extra_size + data_size;
+
+ if (UNIV_LIKELY(b < block[1])) {
+ /* The record fits entirely in the block.
+ This is the normal case. */
+ goto func_exit;
+ }
+
+ /* The record spans two blocks. Copy it to buf. */
+
+ b -= extra_size + data_size;
+ avail_size = block[1] - b;
+ memcpy(*buf, b, avail_size);
+ *mrec = *buf + extra_size;
+#ifdef UNIV_DEBUG
+ /* We cannot invoke rec_offs_make_valid() here, because there
+ are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
+ Similarly, rec_offs_validate() would fail, because it invokes
+ rec_get_status(). */
+ offsets[2] = (ulint) *mrec;
+ offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+ if (!row_merge_read(fd, ++(*foffs), block)) {
+
+ goto err_exit;
+ }
+
+ /* Wrap around to the beginning of the buffer. */
+ b = block[0];
+
+ /* Copy the rest of the record. */
+ memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+ b += extra_size + data_size - avail_size;
+
+func_exit:
+#ifdef UNIV_DEBUG
+ if (row_merge_print_read) {
+ fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
+ (const void*) b, (const void*) block,
+ fd, (ulong) *foffs);
+ rec_print_comp(stderr, *mrec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(b);
+}
+
+/********************************************************************//**
+Write a merge record. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+ byte* b, /*!< out: buffer */
+ ulint e, /*!< in: encoded extra_size */
+#ifdef UNIV_DEBUG
+ ulint size, /*!< in: total size to write */
+ int fd, /*!< in: file descriptor */
+ ulint foffs, /*!< in: file offset */
+#endif /* UNIV_DEBUG */
+ const mrec_t* mrec, /*!< in: record to write */
+ const ulint* offsets)/*!< in: offsets of mrec */
+#ifndef UNIV_DEBUG
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
+ row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* !UNIV_DEBUG */
+{
+#ifdef UNIV_DEBUG
+ const byte* const end = b + size;
+ ut_ad(e == rec_offs_extra_size(offsets) + 1);
+
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_write %p,%d,%lu ",
+ (void*) b, fd, (ulong) foffs);
+ rec_print_comp(stderr, mrec, offsets);
+ putc('\n', stderr);
+ }
+#endif /* UNIV_DEBUG */
+
+ if (e < 0x80) {
+ *b++ = (byte) e;
+ } else {
+ *b++ = (byte) (0x80 | (e >> 8));
+ *b++ = (byte) e;
+ }
+
+ memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+ ut_ad(b + rec_offs_size(offsets) == end);
+}
+
+/********************************************************************//**
+Write a merge record.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ mrec_buf_t* buf, /*!< in/out: secondary buffer */
+ byte* b, /*!< in: pointer to end of block */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs, /*!< in/out: file offset */
+ const mrec_t* mrec, /*!< in: record to write */
+ const ulint* offsets)/*!< in: offsets of mrec */
+{
+ ulint extra_size;
+ ulint size;
+ ulint avail_size;
+
+ ut_ad(block);
+ ut_ad(buf);
+ ut_ad(b >= block[0]);
+ ut_ad(b < block[1]);
+ ut_ad(mrec);
+ ut_ad(foffs);
+ ut_ad(mrec < block[0] || mrec > block[1]);
+ ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+ /* Normalize extra_size. Value 0 signals "end of list". */
+ extra_size = rec_offs_extra_size(offsets) + 1;
+
+ size = extra_size + (extra_size >= 0x80)
+ + rec_offs_data_size(offsets);
+
+ if (UNIV_UNLIKELY(b + size >= block[1])) {
+ /* The record spans two blocks.
+ Copy it to the temporary buffer first. */
+ avail_size = block[1] - b;
+
+ row_merge_write_rec_low(buf[0],
+ extra_size, size, fd, *foffs,
+ mrec, offsets);
+
+ /* Copy the head of the temporary buffer, write
+ the completed block, and copy the tail of the
+ record to the head of the new block. */
+ memcpy(b, buf[0], avail_size);
+
+ if (!row_merge_write(fd, (*foffs)++, block)) {
+ return(NULL);
+ }
+
+ UNIV_MEM_INVALID(block[0], sizeof block[0]);
+
+ /* Copy the rest. */
+ b = block[0];
+ memcpy(b, buf[0] + avail_size, size - avail_size);
+ b += size - avail_size;
+ } else {
+ row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
+ mrec, offsets);
+ b += size;
+ }
+
+ return(b);
+}
+
+/********************************************************************//**
+Write an end-of-list marker.
+@return pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+ row_merge_block_t* block, /*!< in/out: file buffer */
+ byte* b, /*!< in: pointer to end of block */
+ int fd, /*!< in: file descriptor */
+ ulint* foffs) /*!< in/out: file offset */
+{
+ ut_ad(block);
+ ut_ad(b >= block[0]);
+ ut_ad(b < block[1]);
+ ut_ad(foffs);
+#ifdef UNIV_DEBUG
+ if (row_merge_print_write) {
+ fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
+ (void*) b, (void*) block, fd, (ulong) *foffs);
+ }
+#endif /* UNIV_DEBUG */
+
+ *b++ = 0;
+ UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
+ UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
+#ifdef UNIV_DEBUG_VALGRIND
+ /* The rest of the block is uninitialized. Initialize it
+ to avoid bogus warnings. */
+ memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+ if (!row_merge_write(fd, (*foffs)++, block)) {
+ return(NULL);
+ }
+
+ UNIV_MEM_INVALID(block[0], sizeof block[0]);
+ return(block[0]);
+}
+
+/*************************************************************//**
+Compare two merge records.
+@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
+static
+int
+row_merge_cmp(
+/*==========*/
+ const mrec_t* mrec1, /*!< in: first merge
+ record to be compared */
+ const mrec_t* mrec2, /*!< in: second merge
+ record to be compared */
+ const ulint* offsets1, /*!< in: first record offsets */
+ const ulint* offsets2, /*!< in: second record offsets */
+ const dict_index_t* index, /*!< in: index */
+ ibool* null_eq) /*!< out: set to TRUE if
+ found matching null values */
+{
+ int cmp;
+
+ cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index,
+ null_eq);
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_cmp) {
+ fputs("row_merge_cmp1 ", stderr);
+ rec_print_comp(stderr, mrec1, offsets1);
+ fputs("\nrow_merge_cmp2 ", stderr);
+ rec_print_comp(stderr, mrec2, offsets2);
+ fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
+ }
+#endif /* UNIV_DEBUG */
+
+ return(cmp);
+}
+
+/********************************************************************//**
+Reads the clustered index of the table and creates temporary files
+containing the index entries for the indexes to be built.
+@return DB_SUCCESS or error */
+static __attribute__((nonnull))
+ulint
+row_merge_read_clustered_index(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction */
+ TABLE* table, /*!< in/out: MySQL table object,
+ for reporting erroneous records */
+ const dict_table_t* old_table,/*!< in: table where rows are
+ read from */
+ const dict_table_t* new_table,/*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ dict_index_t** index, /*!< in: indexes to be created */
+ merge_file_t* files, /*!< in: temporary files */
+ ulint n_index,/*!< in: number of indexes to create */
+ row_merge_block_t* block) /*!< in/out: file buffer */
+{
+ dict_index_t* clust_index; /* Clustered index */
+ mem_heap_t* row_heap; /* Heap memory to create
+ clustered index records */
+ row_merge_buf_t** merge_buf; /* Temporary list for records*/
+ btr_pcur_t pcur; /* Persistent cursor on the
+ clustered index */
+ mtr_t mtr; /* Mini transaction */
+ ulint err = DB_SUCCESS;/* Return code */
+ ulint i;
+ ulint n_nonnull = 0; /* number of columns
+ changed to NOT NULL */
+ ulint* nonnull = NULL; /* NOT NULL columns */
+
+ trx->op_info = "reading clustered index";
+
+ ut_ad(trx);
+ ut_ad(old_table);
+ ut_ad(new_table);
+ ut_ad(index);
+ ut_ad(files);
+
+ /* Create and initialize memory for record buffers */
+
+ merge_buf = mem_alloc(n_index * sizeof *merge_buf);
+
+ for (i = 0; i < n_index; i++) {
+ merge_buf[i] = row_merge_buf_create(index[i]);
+ }
+
+ mtr_start(&mtr);
+
+ /* Find the clustered index and create a persistent cursor
+ based on that. */
+
+ clust_index = dict_table_get_first_index(old_table);
+
+ btr_pcur_open_at_index_side(
+ TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ if (UNIV_UNLIKELY(old_table != new_table)) {
+ ulint n_cols = dict_table_get_n_cols(old_table);
+
+ /* A primary key will be created. Identify the
+ columns that were flagged NOT NULL in the new table,
+ so that we can quickly check that the records in the
+ (old) clustered index do not violate the added NOT
+ NULL constraints. */
+
+ ut_a(n_cols == dict_table_get_n_cols(new_table));
+
+ nonnull = mem_alloc(n_cols * sizeof *nonnull);
+
+ for (i = 0; i < n_cols; i++) {
+ if (dict_table_get_nth_col(old_table, i)->prtype
+ & DATA_NOT_NULL) {
+
+ continue;
+ }
+
+ if (dict_table_get_nth_col(new_table, i)->prtype
+ & DATA_NOT_NULL) {
+
+ nonnull[n_nonnull++] = i;
+ }
+ }
+
+ if (!n_nonnull) {
+ mem_free(nonnull);
+ nonnull = NULL;
+ }
+ }
+
+ row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+ /* Scan the clustered index. */
+ for (;;) {
+ const rec_t* rec;
+ ulint* offsets;
+ dtuple_t* row = NULL;
+ row_ext_t* ext;
+ ibool has_next = TRUE;
+
+ btr_pcur_move_to_next_on_page(&pcur);
+
+ /* When switching pages, commit the mini-transaction
+ in order to release the latch on the old page. */
+
+ if (btr_pcur_is_after_last_on_page(&pcur)) {
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ i = 0;
+ err = DB_INTERRUPTED;
+ goto err_exit;
+ }
+
+ btr_pcur_store_position(&pcur, &mtr);
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ if (UNIV_LIKELY(has_next)) {
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (srv_pass_corrupt_table && !rec) {
+ err = DB_CORRUPTION;
+ goto err_exit;
+ }
+ ut_a(rec);
+
+ offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &row_heap);
+
+ /* Skip delete marked records. */
+ if (rec_get_deleted_flag(
+ rec, dict_table_is_comp(old_table))) {
+ continue;
+ }
+
+ srv_n_rows_inserted++;
+
+ /* Build a row based on the clustered index. */
+
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, offsets,
+ new_table, &ext, row_heap);
+
+ if (UNIV_LIKELY_NULL(nonnull)) {
+ for (i = 0; i < n_nonnull; i++) {
+ dfield_t* field
+ = &row->fields[nonnull[i]];
+ dtype_t* field_type
+ = dfield_get_type(field);
+
+ ut_a(!(field_type->prtype
+ & DATA_NOT_NULL));
+
+ if (dfield_is_null(field)) {
+ err = DB_PRIMARY_KEY_IS_NULL;
+ i = 0;
+ goto err_exit;
+ }
+
+ field_type->prtype |= DATA_NOT_NULL;
+ }
+ }
+ }
+
+ /* Build all entries for all the indexes to be created
+ in a single scan of the clustered index. */
+
+ for (i = 0; i < n_index; i++) {
+ row_merge_buf_t* buf = merge_buf[i];
+ merge_file_t* file = &files[i];
+ const dict_index_t* index = buf->index;
+
+ if (UNIV_LIKELY
+ (row && row_merge_buf_add(buf, row, ext))) {
+ file->n_rec++;
+ continue;
+ }
+
+ /* The buffer must be sufficiently large
+ to hold at least one record. */
+ ut_ad(buf->n_tuples || !has_next);
+
+ /* We have enough data tuples to form a block.
+ Sort them and write to disk. */
+
+ if (buf->n_tuples) {
+ if (dict_index_is_unique(index)) {
+ row_merge_dup_t dup;
+ dup.index = buf->index;
+ dup.table = table;
+ dup.n_dup = 0;
+
+ row_merge_buf_sort(buf, &dup);
+
+ if (dup.n_dup) {
+ err = DB_DUPLICATE_KEY;
+err_exit:
+ trx->error_key_num = i;
+ goto func_exit;
+ }
+ } else {
+ row_merge_buf_sort(buf, NULL);
+ }
+ }
+
+ row_merge_buf_write(buf, file, block);
+
+ if (!row_merge_write(file->fd, file->offset++,
+ block)) {
+ err = DB_OUT_OF_FILE_SPACE;
+ goto err_exit;
+ }
+
+ UNIV_MEM_INVALID(block[0], sizeof block[0]);
+ merge_buf[i] = row_merge_buf_empty(buf);
+
+ if (UNIV_LIKELY(row != NULL)) {
+ /* Try writing the record again, now
+ that the buffer has been written out
+ and emptied. */
+
+ if (UNIV_UNLIKELY
+ (!row_merge_buf_add(buf, row, ext))) {
+ /* An empty buffer should have enough
+ room for at least one record. */
+ ut_error;
+ }
+
+ file->n_rec++;
+ }
+ }
+
+ mem_heap_empty(row_heap);
+
+ if (UNIV_UNLIKELY(!has_next)) {
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(row_heap);
+
+ if (UNIV_LIKELY_NULL(nonnull)) {
+ mem_free(nonnull);
+ }
+
+ for (i = 0; i < n_index; i++) {
+ row_merge_buf_free(merge_buf[i]);
+ }
+
+ mem_free(merge_buf);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+@param N number of the buffer (0 or 1)
+@param AT_END statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \
+ do { \
+ b2 = row_merge_write_rec(&block[2], &buf[2], b2, \
+ of->fd, &of->offset, \
+ mrec##N, offsets##N); \
+ if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) { \
+ goto corrupt; \
+ } \
+ b##N = row_merge_read_rec(&block[N], &buf[N], \
+ b##N, index, \
+ file->fd, foffs##N, \
+ &mrec##N, offsets##N); \
+ if (UNIV_UNLIKELY(!b##N)) { \
+ if (mrec##N) { \
+ goto corrupt; \
+ } \
+ AT_END; \
+ } \
+ } while (0)
+
+/*************************************************************//**
+Merge two blocks of records on disk and write a bigger block.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge_blocks(
+/*=============*/
+ const dict_index_t* index, /*!< in: index being created */
+ const merge_file_t* file, /*!< in: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ ulint* foffs0, /*!< in/out: offset of first
+ source list in the file */
+ ulint* foffs1, /*!< in/out: offset of second
+ source list in the file */
+ merge_file_t* of, /*!< in/out: output file */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ const byte* b1; /*!< pointer to block[1] */
+ byte* b2; /*!< pointer to block[2] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
+ const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */
+ ulint* offsets0;/* offsets of mrec0 */
+ ulint* offsets1;/* offsets of mrec1 */
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block) {
+ fprintf(stderr,
+ "row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
+ " = fd=%d ofs=%lu\n",
+ file->fd, (ulong) *foffs0,
+ file->fd, (ulong) *foffs1,
+ of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+ /* Write a record and read the next record. Split the output
+ file in two halves, which can be merged on the following pass. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0])
+ || !row_merge_read(file->fd, *foffs1, &block[1])) {
+corrupt:
+ mem_heap_free(heap);
+ return(DB_CORRUPTION);
+ }
+
+ b0 = block[0];
+ b1 = block[1];
+ b2 = block[2];
+
+ b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
+ foffs0, &mrec0, offsets0);
+ b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
+ foffs1, &mrec1, offsets1);
+ if (UNIV_UNLIKELY(!b0 && mrec0)
+ || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+ goto corrupt;
+ }
+
+ while (mrec0 && mrec1) {
+ ibool null_eq = FALSE;
+ switch (row_merge_cmp(mrec0, mrec1,
+ offsets0, offsets1, index,
+ &null_eq)) {
+ case 0:
+ if (UNIV_UNLIKELY
+ (dict_index_is_unique(index) && !null_eq)) {
+ innobase_rec_to_mysql(table, mrec0,
+ index, offsets0);
+ mem_heap_free(heap);
+ return(DB_DUPLICATE_KEY);
+ }
+ /* fall through */
+ case -1:
+ ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
+ break;
+ case 1:
+ ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
+ break;
+ default:
+ ut_error;
+ }
+
+ }
+
+merged:
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
+ }
+ }
+done0:
+ if (mrec1) {
+ /* append all mrec1 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
+ }
+ }
+done1:
+
+ mem_heap_free(heap);
+ b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
+ return(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
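+
+/* Illustrative sketch of the merge above: assume a unique index and two
+sorted input runs starting at *foffs0 and *foffs1 with the keys
+
+	run 0: 1, 4, 7		run 1: 2, 4, 9
+
+The loop compares the current records mrec0 and mrec1: 1 < 2, so 1 is
+written and run 0 advances; 4 > 2, so 2 is written and run 1 advances;
+then 4 == 4 and, because the index is unique and the equal keys contain
+no NULL fields, the duplicate is reported to MySQL and DB_DUPLICATE_KEY
+is returned. On a non-unique index case 0 falls through to case -1, so
+equal keys are simply written in order; when one run is exhausted, the
+remainder of the other run is appended after the "merged" label. */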
+
+/*************************************************************//**
+Copy a block of index entries.
+@return TRUE on success, FALSE on failure */
+static __attribute__((nonnull))
+ibool
+row_merge_blocks_copy(
+/*==================*/
+ const dict_index_t* index, /*!< in: index being created */
+ const merge_file_t* file, /*!< in: input file */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ ulint* foffs0, /*!< in/out: input file offset */
+ merge_file_t* of) /*!< in/out: output file */
+{
+ mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
+
+ mrec_buf_t* buf; /*!< buffer for handling
+ split mrec in block[] */
+ const byte* b0; /*!< pointer to block[0] */
+ byte* b2; /*!< pointer to block[2] */
+ const mrec_t* mrec0; /*!< merge rec, points to block[0] */
+ ulint* offsets0;/* offsets of mrec0 */
+ ulint* offsets1;/* dummy offsets */
+
+#ifdef UNIV_DEBUG
+ if (row_merge_print_block) {
+ fprintf(stderr,
+ "row_merge_blocks_copy fd=%d ofs=%lu"
+ " = fd=%d ofs=%lu\n",
+ file->fd, (ulong) *foffs0,
+ of->fd, (ulong) of->offset);
+ }
+#endif /* UNIV_DEBUG */
+
+ heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+ /* Copy the records of the input run to the output file. */
+
+ if (!row_merge_read(file->fd, *foffs0, &block[0])) {
+corrupt:
+ mem_heap_free(heap);
+ return(FALSE);
+ }
+
+ b0 = block[0];
+ b2 = block[2];
+
+ b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
+ foffs0, &mrec0, offsets0);
+ if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+ goto corrupt;
+ }
+
+ if (mrec0) {
+ /* append all mrec0 to output */
+ for (;;) {
+ ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
+ }
+ }
+done0:
+
+ /* The file offset points to the beginning of the last page
+ that has been read. Update it to point to the next block. */
+ (*foffs0)++;
+
+ mem_heap_free(heap);
+ return(row_merge_write_eof(&block[2], b2, of->fd, &of->offset)
+ != NULL);
+}
+
+/*************************************************************//**
+Merge disk files.
+@return DB_SUCCESS or error code */
+static __attribute__((nonnull))
+ulint
+row_merge(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index, /*!< in: index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd, /*!< in/out: temporary file handle */
+ TABLE* table, /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+ ulint* num_run,/*!< in/out: Number of runs remain
+ to be merged */
+ ulint* run_offset) /*!< in/out: Array contains the
+ first offset number for each merge
+ run */
+{
+ ulint foffs0; /*!< first input offset */
+ ulint foffs1; /*!< second input offset */
+ ulint error; /*!< error code */
+ merge_file_t of; /*!< output file */
+ const ulint ihalf = run_offset[*num_run / 2];
+ /*!< half the input file */
+ ulint n_run = 0;
+ /*!< num of runs generated from this merge */
+
+ UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
+
+ ut_ad(ihalf < file->offset);
+
+ of.fd = *tmpfd;
+ of.offset = 0;
+ of.n_rec = 0;
+
+ /* Merge blocks to the output file. */
+ foffs0 = 0;
+ foffs1 = ihalf;
+
+ UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset);
+
+ for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
+
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ error = row_merge_blocks(index, file, block,
+ &foffs0, &foffs1, &of, table);
+
+ if (error != DB_SUCCESS) {
+ return(error);
+ }
+
+ }
+
+ /* Copy the last blocks, if there are any. */
+
+ while (foffs0 < ihalf) {
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(index, file, block, &foffs0, &of)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs0 == ihalf);
+
+ while (foffs1 < file->offset) {
+ if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+ return(DB_INTERRUPTED);
+ }
+
+ /* Remember the offset number for this run */
+ run_offset[n_run++] = of.offset;
+
+ if (!row_merge_blocks_copy(index, file, block, &foffs1, &of)) {
+ return(DB_CORRUPTION);
+ }
+ }
+
+ ut_ad(foffs1 == file->offset);
+
+ if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
+ return(DB_CORRUPTION);
+ }
+
+ ut_ad(n_run <= *num_run);
+
+ *num_run = n_run;
+
+ /* Each run can span one or more block offsets. As the merge
+ goes on, the number of runs still to be merged decreases until
+ only a single run remains. Therefore the number of runs can
+ never exceed the number of block offsets in the file. */
+ ut_ad((*num_run) <= file->offset);
+
+ /* The number of block offsets in the output file is always
+ less than or equal to that in the input file. */
+ ut_ad(of.offset <= file->offset);
+
+ /* Swap file descriptors for the next pass. */
+ *tmpfd = file->fd;
+ *file = of;
+
+ UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
+
+ return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Merge disk files.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge_sort(
+/*===========*/
+ trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index, /*!< in: index being created */
+ merge_file_t* file, /*!< in/out: file containing
+ index entries */
+ row_merge_block_t* block, /*!< in/out: 3 buffers */
+ int* tmpfd, /*!< in/out: temporary file handle */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ ulint half = file->offset / 2;
+ ulint num_runs;
+ ulint* run_offset;
+ ulint error = DB_SUCCESS;
+
+ /* Record the number of merge runs we need to perform */
+ num_runs = file->offset;
+
+ /* If there is at most one run, there is nothing to merge. */
+ if (num_runs <= 1) {
+ return(error);
+ }
+
+ /* "run_offset" records each run's first offset number */
+ run_offset = (ulint*) mem_alloc(file->offset * sizeof(ulint));
+
+ /* This tells row_merge() where to start for the first round
+ of merge. */
+ run_offset[half] = half;
+
+ /* The file should always contain at least one byte (the end
+ of file marker). Thus, it must be at least one block. */
+ ut_ad(file->offset > 0);
+
+ /* Merge the runs until we have one big run */
+ do {
+ error = row_merge(trx, index, file, block, tmpfd,
+ table, &num_runs, run_offset);
+
+ UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset);
+
+ if (error != DB_SUCCESS) {
+ break;
+ }
+ } while (num_runs > 1);
+
+ mem_free(run_offset);
+
+ return(error);
+}
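+
+/* Worked example of the merge passes (illustrative only): with a file
+of four sorted one-block runs, row_merge_sort() sets half = 2 and
+run_offset[2] = 2. The first call to row_merge() therefore merges
+block 0 with block 2 and block 1 with block 3, producing two runs of
+roughly two blocks each and recording their starting offsets in
+run_offset[]. The second call merges those two runs into a single
+sorted run, and the loop terminates with num_runs == 1. When the
+number of runs is odd, the leftover run of either half is copied to
+the output by row_merge_blocks_copy() and still counts as a run in
+the next pass. */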
+
+/*************************************************************//**
+Copy externally stored columns to the data tuple. */
+static
+void
+row_merge_copy_blobs(
+/*=================*/
+ const mrec_t* mrec, /*!< in: merge record */
+ const ulint* offsets,/*!< in: offsets of mrec */
+ ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
+ dtuple_t* tuple, /*!< in/out: data tuple */
+ mem_heap_t* heap) /*!< in/out: memory heap */
+{
+ ulint i;
+ ulint n_fields = dtuple_get_n_fields(tuple);
+
+ for (i = 0; i < n_fields; i++) {
+ ulint len;
+ const void* data;
+ dfield_t* field = dtuple_get_nth_field(tuple, i);
+
+ if (!dfield_is_ext(field)) {
+ continue;
+ }
+
+ ut_ad(!dfield_is_null(field));
+
+ /* The table is locked during index creation.
+ Therefore, externally stored columns cannot possibly
+ be freed between the time the BLOB pointers are read
+ (row_merge_read_clustered_index()) and dereferenced
+ (below). */
+ data = btr_rec_copy_externally_stored_field(
+ mrec, offsets, zip_size, i, &len, heap);
+ /* Because we have locked the table, any records
+ written by incomplete transactions must have been
+ rolled back already. There must not be any incomplete
+ BLOB columns. */
+ ut_a(data);
+
+ dfield_set_data(field, data, len);
+ }
+}
+
+/********************************************************************//**
+Read a sorted file containing index data tuples and insert these data
+tuples into the index.
+@return DB_SUCCESS or error number */
+static
+ulint
+row_merge_insert_index_tuples(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: index */
+ dict_table_t* table, /*!< in: new table */
+ ulint zip_size,/*!< in: compressed page size of
+ the old table, or 0 if uncompressed */
+ int fd, /*!< in: file descriptor */
+ row_merge_block_t* block) /*!< in/out: file buffer */
+{
+ const byte* b;
+ que_thr_t* thr;
+ ins_node_t* node;
+ mem_heap_t* tuple_heap;
+ mem_heap_t* graph_heap;
+ ulint error = DB_SUCCESS;
+ ulint foffs = 0;
+ ulint* offsets;
+
+ ut_ad(trx);
+ ut_ad(index);
+ ut_ad(table);
+
+ /* We use the insert query graph as the dummy graph
+ needed in the row module call */
+
+ trx->op_info = "inserting index entries";
+
+ graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+ node = ins_node_create(INS_DIRECT, table, graph_heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, graph_heap);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ tuple_heap = mem_heap_create(1000);
+
+ {
+ ulint i = 1 + REC_OFFS_HEADER_SIZE
+ + dict_index_get_n_fields(index);
+ offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
+ offsets[0] = i;
+ offsets[1] = dict_index_get_n_fields(index);
+ }
+
+ b = *block;
+
+ if (!row_merge_read(fd, foffs, block)) {
+ error = DB_CORRUPTION;
+ } else {
+ mrec_buf_t* buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
+ for (;;) {
+ const mrec_t* mrec;
+ dtuple_t* dtuple;
+ ulint n_ext;
+
+ b = row_merge_read_rec(block, buf, b, index,
+ fd, &foffs, &mrec, offsets);
+ if (UNIV_UNLIKELY(!b)) {
+ /* End of list, or I/O error */
+ if (mrec) {
+ error = DB_CORRUPTION;
+ }
+ break;
+ }
+
+ dtuple = row_rec_to_index_entry_low(
+ mrec, index, offsets, &n_ext, tuple_heap);
+
+ if (UNIV_UNLIKELY(n_ext)) {
+ row_merge_copy_blobs(mrec, offsets, zip_size,
+ dtuple, tuple_heap);
+ }
+
+ node->row = dtuple;
+ node->table = table;
+ node->trx_id = trx->id;
+
+ ut_ad(dtuple_validate(dtuple));
+
+ do {
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ error = row_ins_index_entry(index, dtuple,
+ 0, FALSE, thr);
+
+ if (UNIV_LIKELY(error == DB_SUCCESS)) {
+
+ goto next_rec;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+ trx->error_state = error;
+ que_thr_stop_for_mysql(thr);
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ } while (row_mysql_handle_errors(&error, trx,
+ thr, NULL));
+
+ goto err_exit;
+next_rec:
+ mem_heap_empty(tuple_heap);
+ }
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+err_exit:
+ que_graph_free(thr->graph);
+
+ trx->op_info = "";
+
+ mem_heap_free(tuple_heap);
+
+ return(error);
+}
+
+/*********************************************************************//**
+Sets an exclusive lock on a table, for the duration of creating indexes.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_lock_table(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table, /*!< in: table to lock */
+ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+ sel_node_t* node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+ heap = mem_heap_create(512);
+
+ trx->op_info = "setting table lock for creating or dropping index";
+
+ node = sel_node_create(heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+ thr->graph->state = QUE_FORK_ACTIVE;
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(que_node_get_parent(thr));
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ err = lock_table(0, table, mode, thr);
+
+ trx->error_state = err;
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+ que_thr_stop_for_mysql_no_error(thr, trx);
+ } else {
+ que_thr_stop_for_mysql(thr);
+
+ if (err != DB_QUE_THR_SUSPENDED) {
+ ibool was_lock_wait;
+
+ was_lock_wait = row_mysql_handle_errors(
+ &err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+ } else {
+ que_thr_t* run_thr;
+ que_node_t* parent;
+
+ parent = que_node_get_parent(thr);
+ run_thr = que_fork_start_command(parent);
+
+ ut_a(run_thr == thr);
+
+ /* There was a lock wait but the thread was not
+ in a ready to run or running state. */
+ trx->error_state = DB_LOCK_WAIT;
+
+ goto run_again;
+ }
+ }
+
+ que_graph_free(thr->graph);
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drop an index from the InnoDB system tables. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_index(
+/*=================*/
+ dict_index_t* index, /*!< in: index to be removed */
+ dict_table_t* table, /*!< in: table */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ulint err;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ static const char str1[] =
+ "PROCEDURE DROP_INDEX_PROC () IS\n"
+ "BEGIN\n"
+ /* Rename the index, so that it will be dropped by
+ row_merge_drop_temp_indexes() at crash recovery
+ if the server crashes before this trx is committed. */
+ "UPDATE SYS_INDEXES SET NAME=CONCAT('"
+ TEMP_INDEX_PREFIX_STR "', NAME) WHERE ID = :indexid;\n"
+ "COMMIT WORK;\n"
+ /* Drop the statistics of the index. */
+ "DELETE FROM SYS_STATS WHERE INDEX_ID = :indexid;\n"
+ /* Drop the field definitions of the index. */
+ "DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
+ /* Drop the index definition and the B-tree. */
+ "DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
+ "END;\n";
+
+ ut_ad(index && table && trx);
+
+ pars_info_add_dulint_literal(info, "indexid", index->id);
+
+ trx_start_if_not_started(trx);
+ trx->op_info = "dropping index";
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ err = que_eval_sql(info, str1, FALSE, trx);
+
+ ut_a(err == DB_SUCCESS);
+
+ /* Replace this index with another equivalent index for all
+ foreign key constraints on this table where this index is used */
+
+ dict_table_replace_index_in_foreign_list(table, index);
+ dict_index_remove_from_cache(table, index);
+
+ trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred when
+building an index. The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table, /*!< in: table containing the indexes */
+ dict_index_t** index, /*!< in: indexes to drop */
+ ulint num_created) /*!< in: number of elements in index[] */
+{
+ ulint key_num;
+
+ for (key_num = 0; key_num < num_created; key_num++) {
+ row_merge_drop_index(index[key_num], table, trx);
+ }
+}
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+ trx_t* trx;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+
+ /* Load the table definitions that contain partially defined
+ indexes, so that the data dictionary information can be checked
+ when accessing the tablename.ibd files. */
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping partially created indexes";
+ row_mysql_lock_data_dictionary(trx);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_indexes),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ dulint table_id;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+ field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+ &len);
+ if (len == UNIV_SQL_NULL || len == 0
+ || (char) *field != TEMP_INDEX_PREFIX) {
+ continue;
+ }
+
+ /* This is a temporary index. */
+
+ field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+ if (len != 8) {
+ /* Corrupted TABLE_ID */
+ continue;
+ }
+
+ table_id = mach_read_from_8(field);
+
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_table_get_on_id_low(table_id);
+
+ if (table) {
+ dict_index_t* index;
+ dict_index_t* next_index;
+
+ for (index = dict_table_get_first_index(table);
+ index; index = next_index) {
+
+ next_index = dict_table_get_next_index(index);
+
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ row_merge_drop_index(index, table, trx);
+ trx_commit_for_mysql(trx);
+ }
+ }
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
+/*********************************************************************//**
+Create a merge file. */
+static
+void
+row_merge_file_create(
+/*==================*/
+ merge_file_t* merge_file) /*!< out: merge file structure */
+{
+ merge_file->fd = innobase_mysql_tmpfile();
+ merge_file->offset = 0;
+ merge_file->n_rec = 0;
+}
+
+/*********************************************************************//**
+Destroy a merge file. */
+static
+void
+row_merge_file_destroy(
+/*===================*/
+ merge_file_t* merge_file) /*!< out: merge file structure */
+{
+ if (merge_file->fd != -1) {
+ close(merge_file->fd);
+ merge_file->fd = -1;
+ }
+}
+
+/*********************************************************************//**
+Determine the precise type of a column that is added to a temporary
+table when creating a PRIMARY KEY: decide whether the column must be
+constrained NOT NULL.
+@return col->prtype, possibly ORed with DATA_NOT_NULL */
+UNIV_INLINE
+ulint
+row_merge_col_prtype(
+/*=================*/
+ const dict_col_t* col, /*!< in: column */
+ const char* col_name, /*!< in: name of the column */
+ const merge_index_def_t*index_def) /*!< in: the index definition
+ of the primary key */
+{
+ ulint prtype = col->prtype;
+ ulint i;
+
+ ut_ad(index_def->ind_type & DICT_CLUSTERED);
+
+ if (prtype & DATA_NOT_NULL) {
+
+ return(prtype);
+ }
+
+ /* All columns that are included
+ in the PRIMARY KEY must be NOT NULL. */
+
+ for (i = 0; i < index_def->n_fields; i++) {
+ if (!strcmp(col_name, index_def->fields[i].field_name)) {
+ return(prtype | DATA_NOT_NULL);
+ }
+ }
+
+ return(prtype);
+}
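+
+/* For illustration: if the new PRIMARY KEY is defined on columns (a, b)
+and the old table has columns (a, b, c), then row_merge_col_prtype()
+returns the prtype of a and b with DATA_NOT_NULL set, while c keeps its
+original prtype. Any old record in which a or b is NULL is later
+rejected by row_merge_read_clustered_index() with
+DB_PRIMARY_KEY_IS_NULL. */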
+
+/*********************************************************************//**
+Create a temporary table for creating a primary key, using the definition
+of an existing table.
+@return table, or NULL on error */
+UNIV_INTERN
+dict_table_t*
+row_merge_create_temporary_table(
+/*=============================*/
+ const char* table_name, /*!< in: new table name */
+ const merge_index_def_t*index_def, /*!< in: the index definition
+ of the primary key */
+ const dict_table_t* table, /*!< in: old table definition */
+ trx_t* trx) /*!< in/out: transaction
+ (sets error_state) */
+{
+ ulint i;
+ dict_table_t* new_table = NULL;
+ ulint n_cols = dict_table_get_n_user_cols(table);
+ ulint error;
+ mem_heap_t* heap = mem_heap_create(1000);
+
+ ut_ad(table_name);
+ ut_ad(index_def);
+ ut_ad(table);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
+
+ for (i = 0; i < n_cols; i++) {
+ const dict_col_t* col;
+ const char* col_name;
+
+ col = dict_table_get_nth_col(table, i);
+ col_name = dict_table_get_col_name(table, i);
+
+ dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
+ row_merge_col_prtype(col, col_name,
+ index_def),
+ col->len);
+ }
+
+ error = row_create_table_for_mysql(new_table, trx);
+ mem_heap_free(heap);
+
+ if (error != DB_SUCCESS) {
+ trx->error_state = error;
+ new_table = NULL;
+ }
+
+ return(new_table);
+}
+
+/*********************************************************************//**
+Rename the temporary indexes in the dictionary to permanent ones. The
+data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+row_merge_rename_indexes(
+/*=====================*/
+ trx_t* trx, /*!< in/out: transaction */
+ dict_table_t* table) /*!< in/out: table with new indexes */
+{
+ ulint err = DB_SUCCESS;
+ pars_info_t* info = pars_info_create();
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in renaming indexes. */
+
+ static const char rename_indexes[] =
+ "PROCEDURE RENAME_INDEXES_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+ "WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
+ TEMP_INDEX_PREFIX_STR "';\n"
+ "END;\n";
+
+ ut_ad(table);
+ ut_ad(trx);
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ trx->op_info = "renaming indexes";
+
+ pars_info_add_dulint_literal(info, "tableid", table->id);
+
+ err = que_eval_sql(info, rename_indexes, FALSE, trx);
+
+ if (err == DB_SUCCESS) {
+ dict_index_t* index = dict_table_get_first_index(table);
+ do {
+ if (*index->name == TEMP_INDEX_PREFIX) {
+ index->name++;
+ }
+ index = dict_table_get_next_index(index);
+ } while (index);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Rename the tables in the data dictionary. The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_rename_tables(
+/*====================*/
+ dict_table_t* old_table, /*!< in/out: old table, renamed to
+ tmp_name */
+ dict_table_t* new_table, /*!< in/out: new table, renamed to
+ old_table->name */
+ const char* tmp_name, /*!< in: new name for old_table */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ulint err = DB_ERROR;
+ pars_info_t* info;
+ char old_name[MAX_TABLE_NAME_LEN + 1];
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(old_table != new_table);
+ ut_ad(mutex_own(&dict_sys->mutex));
+
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* store the old/current name in an automatic variable */
+ if (strlen(old_table->name) + 1 <= sizeof(old_name)) {
+ memcpy(old_name, old_table->name, strlen(old_table->name) + 1);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, "InnoDB: too long table name: '%s', "
+ "max length is %d\n", old_table->name,
+ MAX_TABLE_NAME_LEN);
+ ut_error;
+ }
+
+ trx->op_info = "renaming tables";
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data in system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_name", new_table->name);
+ pars_info_add_str_literal(info, "old_name", old_name);
+ pars_info_add_str_literal(info, "tmp_name", tmp_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLES () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
+ " WHERE NAME = :old_name;\n"
+ "UPDATE SYS_TABLES SET NAME = :old_name\n"
+ " WHERE NAME = :new_name;\n"
+ "END;\n", FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ /* The following calls will also rename the .ibd data files if
+ the tables are stored in a single-table tablespace */
+
+ if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
+ || !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
+
+ err = DB_ERROR;
+ goto err_exit;
+ }
+
+ err = dict_load_foreigns(old_name, FALSE, TRUE);
+
+ if (err != DB_SUCCESS) {
+err_exit:
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create and execute a query graph for creating an index.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_merge_create_index_graph(
+/*=========================*/
+ trx_t* trx, /*!< in: trx */
+ dict_table_t* table, /*!< in: table */
+ dict_index_t* index) /*!< in: index */
+{
+ ind_node_t* node; /*!< Index creation node */
+ mem_heap_t* heap; /*!< Memory heap */
+ que_thr_t* thr; /*!< Query thread */
+ ulint err;
+
+ ut_ad(trx);
+ ut_ad(table);
+ ut_ad(index);
+
+ heap = mem_heap_create(512);
+
+ index->table = table;
+ node = ind_create_graph_create(index, heap);
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ return(err);
+}
+
+/*********************************************************************//**
+Create the index and load it into the dictionary.
+@return index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+ trx_t* trx, /*!< in/out: trx (sets error_state) */
+ dict_table_t* table, /*!< in: the index is on this table */
+ const merge_index_def_t*index_def)
+ /*!< in: the index definition */
+{
+ dict_index_t* index;
+ ulint err;
+ ulint n_fields = index_def->n_fields;
+ ulint i;
+
+ /* Create the index prototype, using the passed-in definition;
+ this is not a persistent operation. We pass 0 as the space id, and
+ determine at a lower level the space id where to store the table. */
+
+ index = dict_mem_index_create(table->name, index_def->name,
+ 0, index_def->ind_type, n_fields);
+
+ ut_a(index);
+
+ for (i = 0; i < n_fields; i++) {
+ merge_index_field_t* ifield = &index_def->fields[i];
+
+ dict_mem_index_add_field(index, ifield->field_name,
+ ifield->prefix_len);
+ }
+
+ /* Add the index to SYS_INDEXES, using the index prototype. */
+ err = row_merge_create_index_graph(trx, table, index);
+
+ if (err == DB_SUCCESS) {
+
+ index = row_merge_dict_table_get_index(
+ table, index_def);
+
+ ut_a(index);
+
+ /* Note the id of the transaction that created this
+ index; we use it to restrict readers from accessing
+ the index, to ensure read consistency. */
+ index->trx_id = (ib_uint64_t)
+ ut_conv_dulint_to_longlong(trx->id);
+ } else {
+ index = NULL;
+ }
+
+ return(index);
+}
+
+/*********************************************************************//**
+Check if a transaction can use an index. */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+ const trx_t* trx, /*!< in: transaction */
+ const dict_index_t* index) /*!< in: index to check */
+{
+ return(!trx->read_view || read_view_sees_trx_id(
+ trx->read_view,
+ ut_dulint_create((ulint) (index->trx_id >> 32),
+ (ulint) index->trx_id & 0xFFFFFFFF)));
+}
+
+/*********************************************************************//**
+Drop the old table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* table) /*!< in: table to drop */
+{
+ /* There must be no open transactions on the table. */
+ ut_a(table->n_mysql_handles_opened == 0);
+
+ return(row_drop_table_for_mysql(table->name, trx, FALSE));
+}
+
+/*********************************************************************//**
+Build indexes on a table by reading its clustered index,
+creating temporary files containing the index entries, merge sorting
+these index entries and inserting the sorted index entries into the indexes.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_build_indexes(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ dict_table_t* old_table, /*!< in: table where rows are
+ read from */
+ dict_table_t* new_table, /*!< in: table where indexes are
+ created; identical to old_table
+ unless creating a PRIMARY KEY */
+ dict_index_t** indexes, /*!< in: indexes to be created */
+ ulint n_indexes, /*!< in: size of indexes[] */
+ TABLE* table) /*!< in/out: MySQL table, for
+ reporting erroneous key value
+ if applicable */
+{
+ merge_file_t* merge_files;
+ row_merge_block_t* block;
+ ulint block_size;
+ ulint i;
+ ulint error;
+ int tmpfd;
+
+ ut_ad(trx);
+ ut_ad(old_table);
+ ut_ad(new_table);
+ ut_ad(indexes);
+ ut_ad(n_indexes);
+
+ trx_start_if_not_started(trx);
+
+ /* Allocate memory for merge file data structure and initialize
+ fields */
+
+ merge_files = mem_alloc(n_indexes * sizeof *merge_files);
+ block_size = 3 * sizeof *block;
+ block = os_mem_alloc_large(&block_size);
+
+ for (i = 0; i < n_indexes; i++) {
+
+ row_merge_file_create(&merge_files[i]);
+ }
+
+ tmpfd = innobase_mysql_tmpfile();
+
+ /* Reset the MySQL row buffer that is used when reporting
+ duplicate keys. */
+ innobase_rec_reset(table);
+
+ /* Read clustered index of the table and create files for
+ secondary index entries for merge sort */
+
+ error = row_merge_read_clustered_index(
+ trx, table, old_table, new_table, indexes,
+ merge_files, n_indexes, block);
+
+ if (error != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Now we have files containing index entries ready for
+ sorting and inserting. */
+
+ for (i = 0; i < n_indexes; i++) {
+ error = row_merge_sort(trx, indexes[i], &merge_files[i],
+ block, &tmpfd, table);
+
+ if (error == DB_SUCCESS) {
+ error = row_merge_insert_index_tuples(
+ trx, indexes[i], new_table,
+ dict_table_zip_size(old_table),
+ merge_files[i].fd, block);
+ }
+
+ /* Close the temporary file to free up space. */
+ row_merge_file_destroy(&merge_files[i]);
+
+ if (error != DB_SUCCESS) {
+ trx->error_key_num = i;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ close(tmpfd);
+
+ for (i = 0; i < n_indexes; i++) {
+ row_merge_file_destroy(&merge_files[i]);
+ }
+
+ mem_free(merge_files);
+ os_mem_free_large(block, block_size);
+
+ return(error);
+}
diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.c
new file mode 100644
index 00000000000..56754404b65
--- /dev/null
+++ b/storage/xtradb/row/row0mysql.c
@@ -0,0 +1,4234 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0mysql.c
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "row0ins.h"
+#include "row0merge.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "log0log.h"
+#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
+
+/** Provide optional 4.x backwards compatibility for 5.0 and above */
+UNIV_INTERN ibool row_rollback_on_timeout = FALSE;
+
+/** Chain node of the list of tables to drop in the background. */
+typedef struct row_mysql_drop_struct row_mysql_drop_t;
+
+/** Chain node of the list of tables to drop in the background. */
+struct row_mysql_drop_struct{
+ char* table_name; /*!< table name */
+ UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
+ /*!< list chain node */
+};
+
+/** @brief List of tables we should drop in background.
+
+ALTER TABLE in MySQL requires that the table handler can drop the
+table in the background when there are no longer any queries on it.
+Protected by kernel_mutex. */
+static UT_LIST_BASE_NODE_T(row_mysql_drop_t) row_mysql_drop_list;
+/** Flag: has row_mysql_drop_list been initialized? */
+static ibool row_mysql_drop_list_inited = FALSE;
+
+/** Magic table names for invoking various monitor threads */
+/* @{ */
+static const char S_innodb_monitor[] = "innodb_monitor";
+static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
+static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
+static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+/* @} */
+
+/** Evaluates to true if str1 equals str2_onstack, used for comparing
+the magic table names.
+@param str1 in: string to compare
+@param str1_len in: length of str1, in bytes, including terminating NUL
+@param str2_onstack in: char[] array containing a NUL terminated string
+@return TRUE if str1 equals str2_onstack */
+#define STR_EQ(str1, str1_len, str2_onstack) \
+ ((str1_len) == sizeof(str2_onstack) \
+ && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0)
+
+/*******************************************************************//**
+Determine if the given name is a name reserved for MySQL system tables.
+@return TRUE if name is a MySQL system table name */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+ const char* name)
+{
+ if (strncmp(name, "mysql/", 6) != 0) {
+
+ return(FALSE);
+ }
+
+ return(0 == strcmp(name + 6, "host")
+ || 0 == strcmp(name + 6, "user")
+ || 0 == strcmp(name + 6, "db"));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ const char* name); /*!< in: table name */
+
+/*******************************************************************//**
+Delays an INSERT, DELETE or UPDATE operation if the purge is lagging. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+ if (srv_dml_needed_delay) {
+ os_thread_sleep(srv_dml_needed_delay);
+ }
+}
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct of a
+ ha_innobase:: table handle */
+{
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+}
+
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+ byte* dest, /*!< in: where to store */
+ ulint len, /*!< in: length, must fit in two bytes */
+ ulint lenlen) /*!< in: storage length of len: either 1 or 2 bytes */
+{
+ if (lenlen == 2) {
+ ut_a(len < 256 * 256);
+
+ mach_write_to_2_little_endian(dest, len);
+
+ return(dest + 2);
+ }
+
+ ut_a(lenlen == 1);
+ ut_a(len < 256);
+
+ mach_write_to_1(dest, len);
+
+ return(dest + 1);
+}
+
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+ ulint* len, /*!< out: variable-length field length */
+ const byte* field, /*!< in: field in the MySQL format */
+ ulint lenlen) /*!< in: storage length of len: either 1
+ or 2 bytes */
+{
+ if (lenlen == 2) {
+ *len = mach_read_from_2_little_endian(field);
+
+ return(field + 2);
+ }
+
+ ut_a(lenlen == 1);
+
+ *len = mach_read_from_1(field);
+
+ return(field + 1);
+}
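+
+/* Encoding example (illustrative): a true VARCHAR value of length 300
+in a column that needs two length bytes (lenlen == 2) stores the length
+little-endian as the bytes 0x2C 0x01 in front of the data, whereas a
+column whose maximum length fits in one byte (lenlen == 1) stores e.g.
+length 7 as the single byte 0x07. */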
+
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+ byte* dest, /*!< in: where to store */
+ ulint col_len,/*!< in: dest buffer size: determines into
+ how many bytes the BLOB length is stored,
+ the space for the length may vary from 1
+ to 4 bytes */
+ const void* data, /*!< in: BLOB data; if the value to store
+ is SQL NULL this should be NULL pointer */
+ ulint len) /*!< in: BLOB length; if the value to store
+ is SQL NULL this should be 0; remember
+ also to set the NULL bit in the MySQL record
+ header! */
+{
+ /* MySQL might assume the field is set to zero except the length and
+ the pointer fields */
+
+ memset(dest, '\0', col_len);
+
+ /* In dest there are 1 - 4 bytes reserved for the BLOB length,
+ and after that 8 bytes reserved for the pointer to the data.
+ In 32-bit architectures we only use the first 4 bytes of the pointer
+ slot. */
+
+ ut_a(col_len - 8 > 1 || len < 256);
+ ut_a(col_len - 8 > 2 || len < 256 * 256);
+ ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+ mach_write_to_n_little_endian(dest, col_len - 8, len);
+
+ memcpy(dest + col_len - 8, &data, sizeof data);
+}
+
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+ ulint* len, /*!< out: BLOB length */
+ const byte* ref, /*!< in: BLOB reference in the
+ MySQL format */
+ ulint col_len) /*!< in: BLOB reference length
+ (not BLOB length) */
+{
+ byte* data;
+
+ *len = mach_read_from_n_little_endian(ref, col_len - 8);
+
+ memcpy(&data, ref + col_len - 8, sizeof data);
+
+ return(data);
+}
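+
+/* Layout example (illustrative): for a MySQL BLOB reference with
+col_len == 12, row_mysql_store_blob_ref() zero-fills the 12 bytes,
+stores len little-endian in the first 4 bytes and copies the data
+pointer into the remaining 8 bytes (only the first 4 of which are used
+on 32-bit builds); row_mysql_read_blob_ref() reads the same layout
+back, returning the stored pointer and length. */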
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c.
+@return up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+ dfield_t* dfield, /*!< in/out: dfield where dtype
+ information must be already set when
+ this function is called! */
+ byte* buf, /*!< in/out: buffer for a converted
+ integer value; this must be at least
+ col_len long then! */
+ ibool row_format_col, /*!< TRUE if the mysql_data is from
+ a MySQL row, FALSE if from a MySQL
+ key value;
+ in MySQL, a true VARCHAR storage
+ format differs in a row and in a
+ key value: in a key value the length
+ is always stored in 2 bytes! */
+ const byte* mysql_data, /*!< in: MySQL column value, not
+ SQL NULL; NOTE that dfield may also
+ get a pointer to mysql_data,
+ therefore do not discard this as long
+ as dfield is used! */
+ ulint col_len, /*!< in: MySQL column length; NOTE that
+ this is the storage length of the
+ column in the MySQL format row, not
+ necessarily the length of the actual
+ payload data; if the column is a true
+ VARCHAR then this is irrelevant */
+ ulint comp) /*!< in: nonzero=compact format */
+{
+ const byte* ptr = mysql_data;
+ const dtype_t* dtype;
+ ulint type;
+ ulint lenlen;
+
+ dtype = dfield_get_type(dfield);
+
+ type = dtype->mtype;
+
+ if (type == DATA_INT) {
+ /* Store integer data in Innobase in a big-endian format,
+ sign bit negated if the data is a signed integer. In MySQL,
+ integers are stored in a little-endian format. */
+
+ byte* p = buf + col_len;
+
+ for (;;) {
+ p--;
+ *p = *mysql_data;
+ if (p == buf) {
+ break;
+ }
+ mysql_data++;
+ }
+
+ if (!(dtype->prtype & DATA_UNSIGNED)) {
+
+ *buf ^= 128;
+ }
+
+ ptr = buf;
+ buf += col_len;
+ } else if ((type == DATA_VARCHAR
+ || type == DATA_VARMYSQL
+ || type == DATA_BINARY)) {
+
+ if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+ /* The length of the actual data is stored to 1 or 2
+ bytes at the start of the field */
+
+ if (row_format_col) {
+ if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+ lenlen = 2;
+ } else {
+ lenlen = 1;
+ }
+ } else {
+ /* In a MySQL key value, lenlen is always 2 */
+ lenlen = 2;
+ }
+
+ ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+ lenlen);
+ } else {
+ /* Remove trailing spaces from old style VARCHAR
+ columns. */
+
+ /* Handle UCS2 strings differently. */
+ ulint mbminlen = dtype_get_mbminlen(dtype);
+
+ ptr = mysql_data;
+
+ if (mbminlen == 2) {
+ /* space=0x0020 */
+ /* Trim "half-chars", just in case. */
+ col_len &= ~1;
+
+ while (col_len >= 2 && ptr[col_len - 2] == 0x00
+ && ptr[col_len - 1] == 0x20) {
+ col_len -= 2;
+ }
+ } else {
+ ut_a(mbminlen == 1);
+ /* space=0x20 */
+ while (col_len > 0
+ && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ }
+ }
+ } else if (comp && type == DATA_MYSQL
+ && dtype_get_mbminlen(dtype) == 1
+ && dtype_get_mbmaxlen(dtype) > 1) {
+ /* In some cases we strip trailing spaces from UTF-8 and other
+ multibyte charsets, from FIXED-length CHAR columns, to save
+ space. UTF-8 would otherwise normally use 3 * the string length
+ bytes to store an ASCII string! */
+
+ /* We assume that this CHAR field is encoded in a
+ variable-length character set where spaces have
+ 1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+ Consider a CHAR(n) field, a field of n characters.
+ It will contain between n * mbminlen and n * mbmaxlen bytes.
+ We will try to truncate it to n bytes by stripping
+ space padding. If the field contains single-byte
+ characters only, it will be truncated to n characters.
+ Consider a CHAR(5) field containing the string ".a "
+ where "." denotes a 3-byte character represented by
+ the bytes "$%&". After our stripping, the string will
+ be stored as "$%&a " (5 bytes). The string ".abc "
+ will be stored as "$%&abc" (6 bytes).
+
+ The space padding will be restored in row0sel.c, function
+ row_sel_field_store_in_mysql_format(). */
+
+ ulint n_chars;
+
+ ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+ n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+ /* Strip space padding. */
+ while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+ col_len--;
+ }
+ } else if (type == DATA_BLOB && row_format_col) {
+
+ ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+ }
+
+ dfield_set_data(dfield, ptr, col_len);
+
+ return(buf);
+}
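+
+/* Worked example for the DATA_INT branch above (illustrative): a signed
+4-byte value 5 arrives from MySQL little-endian as 05 00 00 00; the
+byte-reversal loop stores the big-endian form 00 00 00 05 in buf and
+flipping the sign bit yields 80 00 00 05. The value -1 (FF FF FF FF)
+becomes 7F FF FF FF, so an unsigned bytewise comparison of the stored
+format orders -1 before 5, as the B-tree requires. */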
+
+/**************************************************************//**
+Convert a row in the MySQL format to a row in the Innobase format. Note that
+the function to convert a MySQL format key value to an InnoDB dtuple is
+row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+ dtuple_t* row, /*!< in/out: Innobase row where the
+ field type information is already
+ copied there! */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct where template
+ must be of type ROW_MYSQL_WHOLE_ROW */
+ byte* mysql_rec) /*!< in: row in the MySQL format;
+ NOTE: do not discard as long as
+ row is used, as row may contain
+ pointers to this record! */
+{
+ mysql_row_templ_t* templ;
+ dfield_t* dfield;
+ ulint i;
+
+ ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+ ut_ad(prebuilt->mysql_template);
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+
+ templ = prebuilt->mysql_template + i;
+ dfield = dtuple_get_nth_field(row, i);
+
+ if (templ->mysql_null_bit_mask != 0) {
+ /* Column may be SQL NULL */
+
+ if (mysql_rec[templ->mysql_null_byte_offset]
+ & (byte) (templ->mysql_null_bit_mask)) {
+
+ /* It is SQL NULL */
+
+ dfield_set_null(dfield);
+
+ goto next_column;
+ }
+ }
+
+ row_mysql_store_col_in_innobase_format(
+ dfield,
+ prebuilt->ins_upd_rec_buff + templ->mysql_col_offset,
+ TRUE, /* MySQL row format data */
+ mysql_rec + templ->mysql_col_offset,
+ templ->mysql_col_len,
+ dict_table_is_comp(prebuilt->table));
+next_column:
+ ;
+ }
+}
+
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running the
+query thread and in that case the thr is ALREADY in the running state. */
+UNIV_INTERN
+ibool
+row_mysql_handle_errors(
+/*====================*/
+ ulint* new_err,/*!< out: possible new error encountered in
+ lock wait, or if no new error, the value
+ of trx->error_state at the entry of this
+ function */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* thr, /*!< in: query thread */
+ trx_savept_t* savept) /*!< in: savepoint or NULL */
+{
+ ulint err;
+
+handle_new_error:
+ err = trx->error_state;
+
+ ut_a(err != DB_SUCCESS);
+
+ trx->error_state = DB_SUCCESS;
+
+ switch (err) {
+ case DB_LOCK_WAIT_TIMEOUT:
+ if (row_rollback_on_timeout) {
+ trx_general_rollback_for_mysql(trx, NULL);
+ break;
+ }
+ /* fall through */
+ case DB_DUPLICATE_KEY:
+ case DB_FOREIGN_DUPLICATE_KEY:
+ case DB_TOO_BIG_RECORD:
+ case DB_ROW_IS_REFERENCED:
+ case DB_NO_REFERENCED_ROW:
+ case DB_CANNOT_ADD_CONSTRAINT:
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ case DB_OUT_OF_FILE_SPACE:
+ case DB_INTERRUPTED:
+ if (savept) {
+ /* Roll back the latest, possibly incomplete
+ insertion or update */
+
+ trx_general_rollback_for_mysql(trx, savept);
+ }
+ /* MySQL will roll back the latest SQL statement */
+ break;
+ case DB_LOCK_WAIT:
+ srv_suspend_mysql_thread(thr);
+
+ if (trx->error_state != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ goto handle_new_error;
+ }
+
+ *new_err = err;
+
+ return(TRUE);
+
+ case DB_DEADLOCK:
+ case DB_LOCK_TABLE_FULL:
+ /* Roll back the whole transaction; this resolution was added
+ to version 3.23.43 */
+
+ trx_general_rollback_for_mysql(trx, NULL);
+ break;
+
+ case DB_MUST_GET_MORE_FILE_SPACE:
+ fputs("InnoDB: The database cannot continue"
+ " operation because of\n"
+ "InnoDB: lack of space. You must add"
+ " a new data file to\n"
+ "InnoDB: my.cnf and restart the database.\n", stderr);
+
+ exit(1);
+
+ case DB_CORRUPTION:
+ fputs("InnoDB: We detected index corruption"
+ " in an InnoDB type table.\n"
+ "InnoDB: You have to dump + drop + reimport"
+ " the table or, in\n"
+ "InnoDB: a case of widespread corruption,"
+ " dump all InnoDB\n"
+ "InnoDB: tables and recreate the"
+ " whole InnoDB tablespace.\n"
+ "InnoDB: If the mysqld server crashes"
+ " after the startup or when\n"
+ "InnoDB: you dump the tables, look at\n"
+ "InnoDB: " REFMAN "forcing-recovery.html"
+ " for help.\n", stderr);
+ break;
+ case DB_FOREIGN_EXCEED_MAX_CASCADE:
+ fprintf(stderr, "InnoDB: Cannot delete/update rows with"
+ " cascading foreign key constraints that exceed max"
+ " depth of %lu\n"
+ "Please drop excessive foreign constraints"
+ " and try again\n", (ulong) DICT_FK_MAX_RECURSIVE_LOAD);
+ break;
+ default:
+ fprintf(stderr, "InnoDB: unknown error code %lu\n",
+ (ulong) err);
+ ut_error;
+ }
+
+ if (trx->error_state != DB_SUCCESS) {
+ *new_err = trx->error_state;
+ } else {
+ *new_err = err;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ return(FALSE);
+}
+
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+ dict_table_t* table) /*!< in: Innobase table handle */
+{
+ row_prebuilt_t* prebuilt;
+ mem_heap_t* heap;
+ dict_index_t* clust_index;
+ dtuple_t* ref;
+ ulint ref_len;
+
+ heap = mem_heap_create(sizeof *prebuilt + 128);
+
+ prebuilt = mem_heap_zalloc(heap, sizeof *prebuilt);
+
+ prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+ prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+ prebuilt->table = table;
+
+ prebuilt->sql_stat_start = TRUE;
+ prebuilt->heap = heap;
+
+ prebuilt->pcur = btr_pcur_create_for_mysql();
+ prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+
+ prebuilt->select_lock_type = LOCK_NONE;
+ prebuilt->stored_select_lock_type = 99999999;
+ UNIV_MEM_INVALID(&prebuilt->stored_select_lock_type,
+ sizeof prebuilt->stored_select_lock_type);
+
+ prebuilt->search_tuple = dtuple_create(
+ heap, 2 * dict_table_get_n_cols(table));
+
+ clust_index = dict_table_get_first_index(table);
+
+ /* Make sure that search_tuple is long enough for clustered index */
+ ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ prebuilt->clust_ref = ref;
+
+ prebuilt->autoinc_error = 0;
+ prebuilt->autoinc_offset = 0;
+
+ /* Default to 1, we will set the actual value later in
+ ha_innobase::get_auto_increment(). */
+ prebuilt->autoinc_increment = 1;
+
+ prebuilt->autoinc_last_value = 0;
+
+ return(prebuilt);
+}
+
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+ row_prebuilt_t* prebuilt, /*!< in, own: prebuilt struct */
+ ibool dict_locked) /*!< in: TRUE=data dictionary locked */
+{
+ ulint i;
+
+ if (UNIV_UNLIKELY
+ (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED
+ || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to free a corrupt\n"
+ "InnoDB: table handle. Magic n %lu,"
+ " magic n2 %lu, table name ",
+ (ulong) prebuilt->magic_n,
+ (ulong) prebuilt->magic_n2);
+ ut_print_name(stderr, NULL, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->magic_n = ROW_PREBUILT_FREED;
+ prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+ btr_pcur_free_for_mysql(prebuilt->pcur);
+ btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+ if (prebuilt->mysql_template) {
+ mem_free(prebuilt->mysql_template);
+ }
+
+ if (prebuilt->ins_graph) {
+ que_graph_free_recursive(prebuilt->ins_graph);
+ }
+
+ if (prebuilt->sel_graph) {
+ que_graph_free_recursive(prebuilt->sel_graph);
+ }
+
+ if (prebuilt->upd_graph) {
+ que_graph_free_recursive(prebuilt->upd_graph);
+ }
+
+ if (prebuilt->blob_heap) {
+ mem_heap_free(prebuilt->blob_heap);
+ }
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_free(prebuilt->old_vers_heap);
+ }
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+ if (prebuilt->fetch_cache[i] != NULL) {
+
+ if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+ (prebuilt->fetch_cache[i]) - 4))
+ || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+ (prebuilt->fetch_cache[i])
+ + prebuilt->mysql_row_len))) {
+ fputs("InnoDB: Error: trying to free"
+ " a corrupt fetch buffer.\n", stderr);
+
+ mem_analyze_corruption(
+ prebuilt->fetch_cache[i]);
+
+ ut_error;
+ }
+
+ mem_free((prebuilt->fetch_cache[i]) - 4);
+ }
+ }
+
+ dict_table_decrement_handle_count(prebuilt->table, dict_locked);
+
+ mem_heap_free(prebuilt->heap);
+}
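+
+/* row_create_prebuilt() and row_prebuilt_free() are strictly paired: every
+prebuilt struct handed out above must eventually be released here. The
+following is only an illustrative sketch of that lifecycle as seen from a
+caller such as the MySQL handler layer; ib_table and trx are placeholder
+variables and not part of this file:
+
+	row_prebuilt_t*	prebuilt = row_create_prebuilt(ib_table);
+
+	row_update_prebuilt_trx(prebuilt, trx);
+	... row operations through prebuilt ...
+	row_prebuilt_free(prebuilt, FALSE);
+*/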
+
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct
+ in MySQL handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ if (trx->magic_n != TRX_MAGIC_N) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: trx handle. Magic n %lu\n",
+ (ulong) trx->magic_n);
+
+ mem_analyze_corruption(trx);
+
+ ut_error;
+ }
+
+ if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ prebuilt->trx = trx;
+
+ if (prebuilt->ins_graph) {
+ prebuilt->ins_graph->trx = trx;
+ }
+
+ if (prebuilt->upd_graph) {
+ prebuilt->upd_graph->trx = trx;
+ }
+
+ if (prebuilt->sel_graph) {
+ prebuilt->sel_graph->trx = trx;
+ }
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt dtuple used in insertions. If the insert graph
+has not yet been built in the prebuilt struct, then this function first
+builds it.
+@return prebuilt dtuple; the column type information is also set in it */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ ins_node_t* node;
+ dtuple_t* row;
+ dict_table_t* table = prebuilt->table;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->ins_node == NULL) {
+
+ /* Not called before for this handle: create an insert node
+ and query graph to the prebuilt struct */
+
+ node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+
+ prebuilt->ins_node = node;
+
+ if (prebuilt->ins_upd_rec_buff == NULL) {
+ prebuilt->ins_upd_rec_buff = mem_heap_alloc(
+ prebuilt->heap, prebuilt->mysql_row_len);
+ }
+
+ row = dtuple_create(prebuilt->heap,
+ dict_table_get_n_cols(table));
+
+ dict_table_copy_types(row, table);
+
+ ins_node_set_new_row(node, row);
+
+ prebuilt->ins_graph = que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->ins_node->row);
+}
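+
+/* The insert graph, like the select and update graphs built later in this
+file, is constructed lazily: the first call for a given handle allocates the
+node and query graph from prebuilt->heap, and subsequent calls simply return
+the cached node's row. The graphs are therefore freed only together with the
+prebuilt struct, in row_prebuilt_free(). */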
+
+/*********************************************************************//**
+Updates the table modification counter and calculates new estimates
+for table and index statistics if necessary. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+ dict_table_t* table) /*!< in: table */
+{
+ ulint counter;
+
+ counter = table->stat_modified_counter;
+
+ table->stat_modified_counter = counter + 1;
+
+	if (!srv_stats_auto_update) {
+		return;
+	}

+
+ /* Calculate new statistics if 1 / 16 of table has been modified
+ since the last time a statistics batch was run, or if
+ stat_modified_counter > 2 000 000 000 (to avoid wrap-around).
+ We calculate statistics at most every 16th round, since we may have
+ a counter table which is very small and updated very often. */
+
+ if (counter > 2000000000
+ || ((ib_int64_t)counter > 16 + table->stat_n_rows / 16)) {
+
+ dict_update_statistics(table, TRUE);
+ }
+}
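+
+/* A worked example of the condition above: for a table with
+stat_n_rows = 1 000 000, statistics are recalculated once
+counter > 16 + 1000000 / 16 = 62516, i.e. after roughly 1/16 of the table
+has been modified since the last batch; the counter > 2000000000 branch
+only guards against wrap-around of the modification counter. */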
+
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ if (lock_trx_holds_autoinc_locks(trx)) {
+ mutex_enter(&kernel_mutex);
+
+ lock_release_autoinc_locks(trx);
+
+ mutex_exit(&kernel_mutex);
+ }
+}
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in the MySQL
+ table handle */
+{
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+ const dict_table_t* table = prebuilt->table;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ /* If we already hold an AUTOINC lock on the table then do nothing.
+ Note: We peek at the value of the current owner without acquiring
+	the kernel mutex. */
+ if (trx == table->autoinc_trx) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "setting auto-inc lock";
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ /* We use the insert query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct in the MySQL
+ table handle */
+ dict_table_t* table, /*!< in: table to lock, or NULL
+ if prebuilt->table should be
+ locked as
+ prebuilt->select_lock_type */
+ ulint mode) /*!< in: lock mode of table
+ (ignored if table==NULL) */
+{
+ trx_t* trx = prebuilt->trx;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "setting table lock";
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+
+ /* We use the select query graph as the dummy graph needed
+ in the lock module call */
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = thr;
+ thr->prev_node = thr->common.parent;
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(trx);
+
+ if (table) {
+ err = lock_table(0, table, mode, thr);
+ } else {
+ err = lock_table(0, prebuilt->table,
+ prebuilt->select_lock_type, thr);
+ }
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
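+
+/* Both locking functions above use the retry pattern that recurs throughout
+this file: run the request once, and on failure let row_mysql_handle_errors()
+decide whether the error was a lock wait that has since been resolved (it
+returns TRUE in that case, so the request is retried; any other error is
+returned to the caller). A condensed, illustrative form of the loop, not
+literal code from this file and omitting the que_thr state transitions, is:
+
+	for (;;) {
+		err = lock_table(0, table, mode, thr);
+		trx->error_state = err;
+
+		if (err == DB_SUCCESS
+		    || !row_mysql_handle_errors(&err, trx, thr, NULL)) {
+			break;
+		}
+	}
+*/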
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: row in the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ que_thr_t* thr;
+ ulint err;
+ ibool was_lock_wait;
+ trx_t* trx = prebuilt->trx;
+ ins_node_t* node = prebuilt->ins_node;
+
+ ut_ad(trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that"
+ " newraw is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "inserting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started(trx);
+
+ if (node == NULL) {
+ row_get_prebuilt_insert_row(prebuilt);
+ node = prebuilt->ins_node;
+ }
+
+ row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+ if (!prebuilt->mysql_has_locked) {
+ fprintf(stderr, "InnoDB: Error: row_insert_for_mysql is called without ha_innobase::external_lock()\n");
+ if (trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
+ }
+ }
+
+ if (prebuilt->sql_stat_start) {
+ node->state = INS_NODE_SET_IX_LOCK;
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ node->state = INS_NODE_ALLOC_ROW_ID;
+ }
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_ins_step(thr);
+
+ err = trx->error_state;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ /* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW;
+
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ prebuilt->table->stat_n_rows++;
+
+ srv_n_rows_inserted++;
+
+ if (prebuilt->table->stat_n_rows == 0) {
+ /* Avoid wrap-over */
+ prebuilt->table->stat_n_rows--;
+ }
+
+ row_update_statistics_if_needed(prebuilt->table);
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ sel_node_t* node;
+
+ ut_ad(prebuilt && prebuilt->trx);
+
+ if (prebuilt->sel_graph == NULL) {
+
+ node = sel_node_create(prebuilt->heap);
+
+ prebuilt->sel_graph = que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+
+ prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+ }
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+ dict_table_t* table, /*!< in: table to update */
+ mem_heap_t* heap) /*!< in: mem heap from which allocated */
+{
+ upd_node_t* node;
+
+ node = upd_node_create(heap);
+
+ node->in_mysql_interface = TRUE;
+ node->is_delete = FALSE;
+ node->searched_update = FALSE;
+ node->select = NULL;
+ node->pcur = btr_pcur_create_for_mysql();
+ node->table = table;
+
+ node->update = upd_create(dict_table_get_n_cols(table), heap);
+
+ node->update_n_fields = dict_table_get_n_cols(table);
+
+ UT_LIST_INIT(node->columns);
+ node->has_clust_rec_x_lock = TRUE;
+ node->cmpl_info = 0;
+
+ node->table_sym = NULL;
+ node->col_assign_list = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ dict_table_t* table = prebuilt->table;
+ upd_node_t* node;
+
+ ut_ad(prebuilt && table && prebuilt->trx);
+
+ if (prebuilt->upd_node == NULL) {
+
+ /* Not called before for this handle: create an update node
+ and query graph to the prebuilt struct */
+
+ node = row_create_update_node_for_mysql(table, prebuilt->heap);
+
+ prebuilt->upd_node = node;
+
+ prebuilt->upd_graph = que_node_get_parent(
+ pars_complete_graph_for_exec(node,
+ prebuilt->trx,
+ prebuilt->heap));
+ prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+ }
+
+ return(prebuilt->upd_node->update);
+}
+
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_update_for_mysql(
+/*=================*/
+ byte* mysql_rec, /*!< in: the row to be updated, in
+ the MySQL format */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct in MySQL
+ handle */
+{
+ trx_savept_t savept;
+ ulint err;
+ que_thr_t* thr;
+ ibool was_lock_wait;
+ dict_index_t* clust_index;
+ /* ulint ref_len; */
+ upd_node_t* node;
+ dict_table_t* table = prebuilt->table;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ UT_NOT_USED(mysql_rec);
+
+ if (prebuilt->table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you"
+ " used DISCARD TABLESPACE?\n"
+ "InnoDB: Look from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: how you can resolve the problem.\n",
+ prebuilt->table->name);
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+ "InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+ if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw"
+ " is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "updating or deleting";
+
+ row_mysql_delay_if_needed();
+
+ trx_start_if_not_started(trx);
+
+ node = prebuilt->upd_node;
+
+ clust_index = dict_table_get_first_index(table);
+
+ if (prebuilt->pcur->btr_cur.index == clust_index) {
+ btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur);
+ } else {
+ btr_pcur_copy_stored_position(node->pcur,
+ prebuilt->clust_pcur);
+ }
+
+ ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+ /* MySQL seems to call rnd_pos before updating each row it
+ has cached: we can get the correct cursor position from
+ prebuilt->pcur; NOTE that we cannot build the row reference
+ from mysql_rec if the clustered index was automatically
+ generated for the table: MySQL does not know anything about
+ the row id used as the clustered index key */
+
+ savept = trx_savept_take(trx);
+
+ thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ ut_ad(!prebuilt->sql_stat_start);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+ thr->fk_cascade_depth = 0;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ /* Reset fk_cascade_depth back to 0 */
+ thr->fk_cascade_depth = 0;
+
+ if (err != DB_SUCCESS) {
+ que_thr_stop_for_mysql(thr);
+
+ if (err == DB_RECORD_NOT_FOUND) {
+ trx->error_state = DB_SUCCESS;
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ thr->lock_state= QUE_THR_LOCK_ROW;
+ was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+ &savept);
+ thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+ if (was_lock_wait) {
+ goto run_again;
+ }
+
+ trx->op_info = "";
+
+ return((int) err);
+ }
+
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ if (node->is_delete) {
+ if (prebuilt->table->stat_n_rows > 0) {
+ prebuilt->table->stat_n_rows--;
+ }
+
+ srv_n_rows_deleted++;
+ } else {
+ srv_n_rows_updated++;
+ }
+
+ row_update_statistics_if_needed(prebuilt->table);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
+session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur. Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_unlock_for_mysql(
+/*=================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt struct in MySQL
+ handle */
+ ibool has_latches_on_recs)/*!< in: TRUE if called so
+ that we have the latches on
+ the records under pcur and
+ clust_pcur, and we do not need
+ to reposition the cursors. */
+{
+ btr_pcur_t* pcur = prebuilt->pcur;
+ btr_pcur_t* clust_pcur = prebuilt->clust_pcur;
+ trx_t* trx = prebuilt->trx;
+
+ ut_ad(prebuilt && trx);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (UNIV_UNLIKELY
+ (!srv_locks_unsafe_for_binlog
+ && trx->isolation_level > TRX_ISO_READ_COMMITTED)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: calling row_unlock_for_mysql though\n"
+ "InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n"
+ "InnoDB: this session is not using"
+ " READ COMMITTED isolation level.\n");
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "unlock_row";
+
+ if (prebuilt->new_rec_locks >= 1) {
+
+ const rec_t* rec;
+ dict_index_t* index;
+ trx_id_t rec_trx_id;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ /* Restore the cursor position and find the record */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ if (prebuilt->new_rec_locks >= 2) {
+ /* Restore the cursor position and find the record
+ in the clustered index. */
+
+ if (!has_latches_on_recs) {
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ clust_pcur, &mtr);
+ }
+
+ rec = btr_pcur_get_rec(clust_pcur);
+ index = btr_pcur_get_btr_cur(clust_pcur)->index;
+ }
+
+ if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+ /* This is not a clustered index record. We
+ do not know how to unlock the record. */
+ goto no_unlock;
+ }
+
+ /* If the record has been modified by this
+ transaction, do not unlock it. */
+
+ if (index->trx_id_offset) {
+ rec_trx_id = trx_read_trx_id(rec
+ + index->trx_id_offset);
+ } else {
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ if (ut_dulint_cmp(rec_trx_id, trx->id) != 0) {
+ /* We did not update the record: unlock it */
+
+ rec = btr_pcur_get_rec(pcur);
+ index = btr_pcur_get_btr_cur(pcur)->index;
+
+ lock_rec_unlock(trx, btr_pcur_get_block(pcur),
+ rec, prebuilt->select_lock_type);
+
+ if (prebuilt->new_rec_locks >= 2) {
+ rec = btr_pcur_get_rec(clust_pcur);
+ index = btr_pcur_get_btr_cur(clust_pcur)->index;
+
+ lock_rec_unlock(trx,
+ btr_pcur_get_block(clust_pcur),
+ rec,
+ prebuilt->select_lock_type);
+ }
+ }
+no_unlock:
+ mtr_commit(&mtr);
+ }
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+ que_thr_t* thr, /*!< in: query thread */
+ upd_node_t* node, /*!< in: update node used in the cascade
+ or set null operation */
+ dict_table_t* table) /*!< in: table where we do the operation */
+{
+ ulint err;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ thr->fk_cascade_depth++;
+
+ if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) {
+ return (DB_FOREIGN_EXCEED_MAX_CASCADE);
+ }
+run_again:
+ thr->run_node = node;
+ thr->prev_node = node;
+
+ row_upd_step(thr);
+
+ err = trx->error_state;
+
+ /* Note that the cascade node is a subnode of another InnoDB
+ query graph node. We do a normal lock wait in this node, but
+ all errors are handled by the parent node. */
+
+ if (err == DB_LOCK_WAIT) {
+ /* Handle lock wait here */
+
+ que_thr_stop_for_mysql(thr);
+
+ srv_suspend_mysql_thread(thr);
+
+ /* Note that a lock wait may also end in a lock wait timeout,
+ or this transaction is picked as a victim in selective
+ deadlock resolution */
+
+ if (trx->error_state != DB_SUCCESS) {
+
+ return(trx->error_state);
+ }
+
+ /* Retry operation after a normal lock wait */
+
+ goto run_again;
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ if (node->is_delete) {
+ if (table->stat_n_rows > 0) {
+ table->stat_n_rows--;
+ }
+
+ srv_n_rows_deleted++;
+ } else {
+ srv_n_rows_updated++;
+ }
+
+ row_update_statistics_if_needed(table);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+{
+ const dict_index_t* clust_index;
+
+ clust_index = dict_table_get_first_index(table);
+
+ return(dict_index_get_nth_col(clust_index, 0)->mtype == DATA_SYS);
+}
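+
+/* The check above relies on the fact that when MySQL defines no primary key
+(and no non-NULL unique key usable as one), InnoDB generates a clustered
+index whose first field is the internal DB_ROW_ID system column, and system
+columns are the only ones with mtype == DATA_SYS. For a table with a
+user-defined primary key the first field of the clustered index is a user
+column, so the function returns FALSE. */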
+
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+a foreign key check, rollback, or another operation invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ rw_lock_s_lock_func(&dict_operation_lock, 0, file, line);
+
+ trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+ rw_lock_s_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
+
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+ trx_t* trx, /*!< in/out: transaction */
+ const char* file, /*!< in: file name */
+ ulint line) /*!< in: line number */
+{
+ ut_a(trx->dict_operation_lock_mode == 0
+ || trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks or lock waits can occur then in these operations */
+
+ rw_lock_x_lock_func(&dict_operation_lock, 0, file, line);
+ trx->dict_operation_lock_mode = RW_X_LATCH;
+
+ mutex_enter(&(dict_sys->mutex));
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ mutex_exit(&(dict_sys->mutex));
+ rw_lock_x_unlock(&dict_operation_lock);
+
+ trx->dict_operation_lock_mode = 0;
+}
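+
+/* The four dictionary latching functions above come in strict pairs. A DDL
+operation in this file typically brackets its work as sketched below (this is
+only an illustration; see e.g. row_discard_tablespace_for_mysql() further down
+for a real caller):
+
+	row_mysql_lock_data_dictionary(trx);
+
+	... create, drop or modify data dictionary objects ...
+
+	row_mysql_unlock_data_dictionary(trx);
+
+Read-only users of the dictionary use the freeze/unfreeze pair instead, which
+takes only the S-latch and so allows concurrent readers. */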
+
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate().
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_table_for_mysql(
+/*=======================*/
+ dict_table_t* table, /*!< in, own: table definition
+ (will be freed) */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ tab_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ const char* table_name;
+ ulint table_name_len;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+err_exit:
+ dict_mem_table_free(table);
+ trx_commit_for_mysql(trx);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "creating table";
+
+ if (row_mysql_is_system_table(table->name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL system"
+ " table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be"
+ " of the MyISAM type!\n",
+ table->name);
+ goto err_exit;
+ }
+
+ trx_start_if_not_started(trx);
+
+ /* The table name is prefixed with the database name and a '/'.
+ Certain table names starting with 'innodb_' have their special
+	Certain table names starting with 'innodb_' have a special
+ ignore the database name prefix in the comparisons. */
+ table_name = strchr(table->name, '/');
+ ut_a(table_name);
+ table_name++;
+ table_name_len = strlen(table_name) + 1;
+
+ if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) {
+
+ /* Table equals "innodb_monitor":
+ start monitor prints */
+
+ srv_print_innodb_monitor = TRUE;
+
+ /* The lock timeout monitor thread also takes care
+ of InnoDB monitor prints */
+
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_lock_monitor)) {
+
+ srv_print_innodb_monitor = TRUE;
+ srv_print_innodb_lock_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+ } else if (STR_EQ(table_name, table_name_len,
+ S_innodb_mem_validate)) {
+ /* We define here a debugging feature intended for
+ developers */
+
+ fputs("Validating InnoDB memory:\n"
+ "to use this feature you must compile InnoDB with\n"
+ "UNIV_MEM_DEBUG defined in univ.i and"
+ " the server must be\n"
+ "quiet because allocation from a mem heap"
+ " is not protected\n"
+ "by any semaphore.\n", stderr);
+#ifdef UNIV_MEM_DEBUG
+ ut_a(mem_validate());
+ fputs("Memory validated\n", stderr);
+#else /* UNIV_MEM_DEBUG */
+ fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
+ stderr);
+#endif /* UNIV_MEM_DEBUG */
+ }
+
+ heap = mem_heap_create(512);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ node = tab_create_graph_create(table, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ /* TO DO: free table? The code below will dereference
+ table->name, though. */
+ }
+
+ switch (err) {
+ case DB_OUT_OF_FILE_SPACE:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: cannot create table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" because tablespace full\n", stderr);
+
+ if (dict_table_get_low(table->name)) {
+
+ row_drop_table_for_mysql(table->name, trx, FALSE);
+ trx_commit_for_mysql(trx);
+ }
+ break;
+
+ case DB_DUPLICATE_KEY:
+ /* We may also get err == DB_ERROR if the .ibd file for the
+ table already exists */
+
+ break;
+ }
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return((int) err);
+}
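+
+/* row_create_table_for_mysql() expects the DDL environment that its leading
+assertions check: the caller holds the dictionary X-latch and dict_sys->mutex.
+A hypothetical caller would therefore look roughly like the sketch below
+(my_table and trx are placeholders; ownership of the dict_table_t object
+passes to the function, as noted in the "in, own" comment above):
+
+	row_mysql_lock_data_dictionary(trx);
+
+	err = row_create_table_for_mysql(my_table, trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+*/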
+
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+@return error number or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_index_for_mysql(
+/*=======================*/
+ dict_index_t* index, /*!< in, own: index definition
+ (will be freed) */
+ trx_t* trx, /*!< in: transaction handle */
+ const ulint* field_lengths) /*!< in: if not NULL, must contain
+ dict_index_get_n_fields(index)
+ actual field lengths for the
+ index columns, which are
+ then checked for not being too
+ large. */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+ ulint i;
+ ulint len;
+ char* table_name;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "creating index";
+
+ /* Copy the table name because we may want to drop the
+ table later, after the index object is freed (inside
+ que_run_threads()) and thus index->table_name is not available. */
+ table_name = mem_strdup(index->table_name);
+
+ trx_start_if_not_started(trx);
+
+ /* Check that the same column does not appear twice in the index.
+ Starting from 4.0.14, InnoDB should be able to cope with that, but
+	it is safer not to allow it. */
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ ulint j;
+
+ for (j = 0; j < i; j++) {
+ if (0 == ut_strcmp(
+ dict_index_get_nth_field(index, j)->name,
+ dict_index_get_nth_field(index, i)->name)) {
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: column ", stderr);
+ ut_print_name(stderr, trx, FALSE,
+ dict_index_get_nth_field(
+ index, i)->name);
+ fputs(" appears twice in ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: This is not allowed"
+ " in InnoDB.\n", stderr);
+
+ err = DB_COL_APPEARS_TWICE_IN_INDEX;
+
+ goto error_handling;
+ }
+ }
+
+ /* Check also that prefix_len and actual length
+ < DICT_MAX_INDEX_COL_LEN */
+
+ len = dict_index_get_nth_field(index, i)->prefix_len;
+
+ if (field_lengths) {
+ len = ut_max(len, field_lengths[i]);
+ }
+
+ if (len >= DICT_MAX_INDEX_COL_LEN) {
+ err = DB_TOO_BIG_RECORD;
+
+ goto error_handling;
+ }
+ }
+
+ heap = mem_heap_create(512);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ /* Note that the space id where we store the index is inherited from
+ the table in dict_build_index_def_step() in dict0crea.c. */
+
+ node = ind_create_graph_create(index, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+error_handling:
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, NULL);
+
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ trx->op_info = "";
+
+ mem_free(table_name);
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Inserts rows for the given index into the SYS_STATS persistent statistics
+table.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_stats_for_mysql(
+/*=======================*/
+	dict_index_t*	index,	/*!< in: index whose statistics to insert */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+ ind_node_t* node;
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ ulint err;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "try to insert rows to SYS_STATS";
+
+ trx_start_if_not_started(trx);
+ trx->error_state = DB_SUCCESS;
+
+ heap = mem_heap_create(512);
+
+ node = ind_insert_stats_graph_create(index, heap);
+
+ thr = pars_complete_graph_for_exec(node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ err = trx->error_state;
+
+ que_graph_free((que_t*) que_node_get_parent(thr));
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied by indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. Check also that foreign key
+constraints which reference this table are ok.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+ trx_t* trx, /*!< in: transaction */
+ const char* sql_string, /*!< in: table create statement where
+ foreign keys are declared like:
+ FOREIGN KEY (a, b) REFERENCES table2(c, d),
+ table2 can be written also with the
+ database name before it: test.table2 */
+ size_t sql_length, /*!< in: length of sql_string */
+ const char* name, /*!< in: table full name in the
+ normalized form
+ database_name/table_name */
+ ibool reject_fks) /*!< in: if TRUE, fail with error
+ code DB_CANNOT_ADD_CONSTRAINT if
+ any foreign keys are found. */
+{
+ ulint err;
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_a(sql_string);
+
+ trx->op_info = "adding foreign keys";
+
+ trx_start_if_not_started(trx);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+ err = dict_create_foreign_constraints(trx, sql_string, sql_length,
+ name, reject_fks);
+ if (err == DB_SUCCESS) {
+ /* Check that also referencing constraints are ok */
+ err = dict_load_foreigns(name, FALSE, TRUE);
+ }
+
+ if (err != DB_SUCCESS) {
+ /* We have special error handling here */
+
+ trx->error_state = DB_SUCCESS;
+
+ trx_general_rollback_for_mysql(trx, NULL);
+
+ row_drop_table_for_mysql(name, trx, FALSE);
+
+ trx_commit_for_mysql(trx);
+
+ trx->error_state = DB_SUCCESS;
+ }
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. In ALTER TABLE on Unix,
+MySQL relies on the fact that the table handler does not remove the
+table before all handles to it have been removed. Furthermore, the MySQL
+call to drop a table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.c.
+@return error code or DB_SUCCESS */
+static
+int
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+ const char* name) /*!< in: table name */
+{
+ ulint error;
+ trx_t* trx;
+
+ trx = trx_allocate_for_background();
+
+ /* If the original transaction was dropping a table referenced by
+ foreign keys, we must set the following to be able to drop the
+ table: */
+
+ trx->check_foreigns = FALSE;
+
+ /* fputs("InnoDB: Error: Dropping table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" in background drop list\n", stderr); */
+
+ /* Try to drop the table in InnoDB */
+
+ error = row_drop_table_for_mysql(name, trx, FALSE);
+
+ /* Flush the log to reduce probability that the .frm files and
+ the InnoDB data dictionary get out-of-sync if the user runs
+ with innodb_flush_log_at_trx_commit = 0 */
+
+ log_buffer_flush_to_disk();
+
+ trx_commit_for_mysql(trx);
+
+ trx_free_for_background(trx);
+
+ return((int) error);
+}
+
+/*********************************************************************//**
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+{
+ row_mysql_drop_t* drop;
+ dict_table_t* table;
+ ulint n_tables;
+ ulint n_tables_dropped = 0;
+loop:
+ mutex_enter(&kernel_mutex);
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ n_tables = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+ mutex_exit(&kernel_mutex);
+
+ if (drop == NULL) {
+ /* All tables dropped */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ mutex_enter(&(dict_sys->mutex));
+ table = dict_table_get_low(drop->table_name);
+ mutex_exit(&(dict_sys->mutex));
+
+ if (table == NULL) {
+ /* If for some reason the table has already been dropped
+ through some other mechanism, do not try to drop it */
+
+ goto already_dropped;
+ }
+
+ if (DB_SUCCESS != row_drop_table_for_mysql_in_background(
+ drop->table_name)) {
+ /* If the DROP fails for some table, we return, and let the
+ main thread retry later */
+
+ return(n_tables + n_tables_dropped);
+ }
+
+ n_tables_dropped++;
+
+already_dropped:
+ mutex_enter(&kernel_mutex);
+
+ UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Dropped table ", stderr);
+ ut_print_name(stderr, NULL, TRUE, drop->table_name);
+ fputs(" in background drop queue.\n", stderr);
+
+ mem_free(drop->table_name);
+
+ mem_free(drop);
+
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ return(UT_LIST_GET_LEN(row_mysql_drop_list));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+ const char* name) /*!< in: table name */
+{
+ row_mysql_drop_t* drop;
+
+ mutex_enter(&kernel_mutex);
+
+ if (!row_mysql_drop_list_inited) {
+
+ UT_LIST_INIT(row_mysql_drop_list);
+ row_mysql_drop_list_inited = TRUE;
+ }
+
+ /* Look if the table already is in the drop list */
+ drop = UT_LIST_GET_FIRST(row_mysql_drop_list);
+
+ while (drop != NULL) {
+ if (strcmp(drop->table_name, name) == 0) {
+ /* Already in the list */
+
+ mutex_exit(&kernel_mutex);
+
+ return(FALSE);
+ }
+
+ drop = UT_LIST_GET_NEXT(row_mysql_drop_list, drop);
+ }
+
+ drop = mem_alloc(sizeof(row_mysql_drop_t));
+
+ drop->table_name = mem_strdup(name);
+
+ UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, drop);
+
+ /* fputs("InnoDB: Adding table ", stderr);
+ ut_print_name(stderr, trx, TRUE, drop->table_name);
+ fputs(" to background drop list\n", stderr); */
+
+ mutex_exit(&kernel_mutex);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set to TRUE.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_foreign_t* foreign;
+ dulint new_id;
+ dict_table_t* table;
+ ibool success;
+ ulint err;
+ pars_info_t* info = NULL;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages.
+
+ 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+ MySQL table lock on the table before we can do DISCARD
+ TABLESPACE. Then there are no running queries on the table.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: we remove all entries for the tablespace in
+ the insert buffer tree; as long as the tablespace mem object
+ does not exist, ongoing insert buffer page merges are
+ discarded in buf0rea.c. If we recreate the tablespace mem
+ object with IMPORT TABLESPACE later, then the tablespace will
+ have the same id, but the tablespace_version field in the mem
+ object is different, and ongoing old insert buffer page merges
+ get discarded.
+
+ 4) Linear readahead and random readahead: we use the same
+ method as in 3) to discard ongoing operations.
+
+ 5) FOREIGN KEY operations: if
+ table->n_foreign_key_checks_running > 0, we do not allow the
+ discard. We also reserve the data dictionary latch. */
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx->op_info = "discarding tablespace";
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: is in the system tablespace 0"
+ " which cannot be discarded\n", stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: You are trying to DISCARD table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: though there is a foreign key check"
+ " running on it.\n"
+ "InnoDB: Cannot discard the table.\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns) {
+
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow discarding a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot DISCARD table ", ef);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(stderr, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+
+ dict_hdr_get_new_id(&new_id, NULL, NULL);
+
+ /* Remove all locks except the table-level S and X locks. */
+ lock_remove_all_on_table(table, FALSE);
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "table_name", name);
+ pars_info_add_dulint_literal(info, "new_id", new_id);
+
+ err = que_eval_sql(info,
+ "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
+ "old_id CHAR;\n"
+ "BEGIN\n"
+ "SELECT ID INTO old_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " COMMIT WORK;\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "UPDATE SYS_TABLES SET ID = :new_id\n"
+ " WHERE ID = old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = old_id;\n"
+ "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ dict_table_change_id_in_cache(table, new_id);
+
+ success = fil_discard_tablespace(table->space);
+
+ if (!success) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+
+ err = DB_ERROR;
+ } else {
+ /* Set the flag which tells that now it is legal to
+ IMPORT a tablespace for this table */
+ table->tablespace_discarded = TRUE;
+ table->ibd_file_missing = TRUE;
+ }
+ }
+
+funct_exit:
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_table_t* table;
+ ibool success;
+ ib_uint64_t current_lsn;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ trx_start_if_not_started(trx);
+
+ trx->op_info = "importing tablespace";
+
+ current_lsn = log_get_lsn();
+
+ /* It is possible, though very improbable, that the lsn's in the
+ tablespace to be imported have risen above the current system lsn, if
+ a lengthy purge, ibuf merge, or rollback was performed on a backup
+ taken with ibbackup. If that is the case, reset page lsn's in the
+ file. We assume that mysqld was shut down after it performed these
+ cleanup operations on the .ibd file, so that it stamped the latest lsn
+ to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+
+ TODO: reset also the trx id's in clustered index records and write
+ a new space id to each data page. That would allow us to import clean
+ .ibd files from another MySQL installation. */
+
+ success = fil_reset_too_high_lsns(name, current_lsn);
+
+ if (!success) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: cannot reset lsn's in table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ row_mysql_lock_data_dictionary(trx);
+
+ goto funct_exit;
+ }
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: does not exist in the InnoDB data dictionary\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+
+ err = DB_TABLE_NOT_FOUND;
+
+ goto funct_exit;
+ }
+
+ if (table->space == 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: is in the system tablespace 0"
+ " which cannot be imported\n", stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ if (!table->tablespace_discarded) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: you are trying to"
+ " IMPORT a tablespace\n"
+ "InnoDB: ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(", though you have not called DISCARD on it yet\n"
+ "InnoDB: during the lifetime of the mysqld process!\n",
+ stderr);
+
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Play safe and remove all insert buffer entries, though we should
+ have removed them already when DISCARD TABLESPACE was called */
+
+ ibuf_delete_for_discarded_space(table->space);
+
+ success = fil_open_single_table_tablespace(
+ TRUE, table->space,
+ table->flags == DICT_TF_COMPACT ? 0 : table->flags,
+ table->name);
+ if (success) {
+ table->ibd_file_missing = FALSE;
+ table->tablespace_discarded = FALSE;
+ } else {
+ if (table->ibd_file_missing) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: cannot find or open in the"
+ " database directory the .ibd file of\n"
+ "InnoDB: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+ stderr);
+ }
+
+ err = DB_ERROR;
+ }
+
+funct_exit:
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return((int) err);
+}
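+
+/* DISCARD and IMPORT are meant to be used as a pair on the same table:
+row_discard_tablespace_for_mysql() deletes the .ibd file and sets
+table->tablespace_discarded, and row_import_tablespace_for_mysql() refuses to
+run unless that flag was set during the lifetime of the same mysqld process.
+An illustrative SQL-level sequence, not code from this file, is:
+
+	ALTER TABLE t DISCARD TABLESPACE;
+	(copy a compatible t.ibd file into the database directory)
+	ALTER TABLE t IMPORT TABLESPACE;
+*/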
+
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+ dict_table_t* table, /*!< in: table handle */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_foreign_t* foreign;
+ ulint err;
+ mem_heap_t* heap;
+ byte* buf;
+ dtuple_t* tuple;
+ dfield_t* dfield;
+ dict_index_t* sys_index;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ dulint new_id;
+ ulint recreate_space = 0;
+ pars_info_t* info = NULL;
+
+ /* How do we prevent crashes caused by ongoing operations on
+ the table? Old operations could try to access non-existent
+ pages.
+
+ 1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+ MySQL table lock on the table before we can do TRUNCATE
+ TABLE. Then there are no running queries on the table. This is
+ guaranteed, because in ha_innobase::store_lock(), we do not
+ weaken the TL_WRITE lock requested by MySQL when executing
+ SQLCOM_TRUNCATE.
+
+ 2) Purge and rollback: we assign a new table id for the
+ table. Since purge and rollback look for the table based on
+ the table id, they see the table as 'dropped' and discard
+ their operations.
+
+ 3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE,
+ so we do not have to remove insert buffer records, as the
+ insert buffer works at a low level. If a freed page is later
+ reallocated, the allocator will remove the ibuf entries for
+ it.
+
+ When we truncate *.ibd files by recreating them (analogous to
+ DISCARD TABLESPACE), we remove all entries for the table in the
+ insert buffer tree. This is not strictly necessary, because
+ in 6) we will assign a new tablespace identifier, but we can
+ free up some space in the system tablespace.
+
+ 4) Linear readahead and random readahead: we use the same
+ method as in 3) to discard ongoing operations. (This is only
+ relevant for TRUNCATE TABLE by DISCARD TABLESPACE.)
+
+ 5) FOREIGN KEY operations: if
+ table->n_foreign_key_checks_running > 0, we do not allow the
+ TRUNCATE. We also reserve the data dictionary latch.
+
+ 6) Crash recovery: To prevent the application of pre-truncation
+ redo log records on the truncated tablespace, we will assign
+ a new tablespace identifier to the truncated tablespace. */
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_ad(table);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "truncating table";
+
+ trx_start_if_not_started(trx);
+
+ /* Serialize data dictionary operations with dictionary mutex:
+ no deadlocks can occur then in these operations */
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+ /* Prevent foreign key checks etc. while we are truncating the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns) {
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow truncating a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot truncate table ", ef);
+ ut_print_name(ef, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because it is referenced by ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+ they can cope with the table having been truncated here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Cannot truncate table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" by DROP+CREATE\n"
+ "InnoDB: because there is a foreign key check"
+ " running on it.\n",
+ stderr);
+ err = DB_ERROR;
+
+ goto funct_exit;
+ }
+
+ /* Remove all locks except the table-level S and X locks. */
+ lock_remove_all_on_table(table, FALSE);
+
+ trx->table_id = table->id;
+
+ /* Lock all index trees for this table, as we will truncate
+ the table/indexes and possibly change their metadata. All
+ DML/DDL is blocked by the table-level lock, with a few
+ exceptions such as queries into the information schema about
+ the table; MySQL could try to access the index stats for such
+ a query, so we need the index locks to synchronize. */
+ dict_table_x_lock_indexes(table);
+
+ if (table->space && !table->dir_path_of_temp_table) {
+ /* Discard and create the single-table tablespace. */
+ ulint space = table->space;
+ ulint flags = fil_space_get_flags(space);
+
+ if (flags != ULINT_UNDEFINED
+ && fil_discard_tablespace(space)) {
+
+ dict_index_t* index;
+
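+ /* As explained in point 6) above, assign a new tablespace
+ id so that redo log records written before the truncation
+ cannot be applied to the recreated tablespace. */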
+ dict_hdr_get_new_id(NULL, NULL, &space);
+
+ if (space == ULINT_UNDEFINED
+ || fil_create_new_single_table_tablespace(
+ space, table->name, FALSE, flags,
+ FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+ dict_table_x_unlock_indexes(table);
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: TRUNCATE TABLE %s failed to"
+ " create a new tablespace\n",
+ table->name);
+ table->ibd_file_missing = 1;
+ err = DB_ERROR;
+ goto funct_exit;
+ }
+
+ recreate_space = space;
+
+ /* Replace the space_id in the data dictionary cache.
+ The persistent data dictionary (SYS_TABLES.SPACE
+ and SYS_INDEXES.SPACE) are updated later in this
+ function. */
+ table->space = space;
+ index = dict_table_get_first_index(table);
+ do {
+ index->space = space;
+ index = dict_table_get_next_index(index);
+ } while (index);
+
+ mtr_start(&mtr);
+ fsp_header_init(space,
+ FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
+
+ /* scan SYS_INDEXES for all indexes of the table */
+ heap = mem_heap_create(800);
+
+ tuple = dtuple_create(heap, 1);
+ dfield = dtuple_get_nth_field(tuple, 0);
+
+ buf = mem_heap_alloc(heap, 8);
+ mach_write_to_8(buf, table->id);
+
+ dfield_set_data(dfield, buf, 8);
+ sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+ dict_index_copy_types(tuple, sys_index, 1);
+
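+ /* The search tuple contains only the 8-byte table id, the
+ first ordering field of SYS_INDEXES; positioning the cursor
+ with PAGE_CUR_GE therefore finds the first SYS_INDEXES record
+ of this table. */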
+ mtr_start(&mtr);
+ btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+ BTR_MODIFY_LEAF, &pcur, &mtr);
+ for (;;) {
+ rec_t* rec;
+ const byte* field;
+ ulint len;
+ ulint root_page_no;
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ /* The end of SYS_INDEXES has been reached. */
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ field = rec_get_nth_field_old(rec, 0, &len);
+ ut_ad(len == 8);
+
+ if (memcmp(buf, field, len) != 0) {
+ /* End of indexes for the table (TABLE_ID mismatch). */
+ break;
+ }
+
+ if (rec_get_deleted_flag(rec, FALSE)) {
+ /* The index has been dropped. */
+ goto next_rec;
+ }
+
+ /* This call may commit and restart mtr
+ and reposition pcur. */
+ root_page_no = dict_truncate_index_tree(table, recreate_space,
+ &pcur, &mtr);
+
+ rec = btr_pcur_get_rec(&pcur);
+
+ if (root_page_no != FIL_NULL) {
+ page_rec_write_index_page_no(
+ rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+ root_page_no, &mtr);
+ /* We will need to commit and restart the
+ mini-transaction in order to avoid deadlocks.
+ The dict_truncate_index_tree() call has allocated
+ a page in this mini-transaction, and the rest of
+ this loop could latch another index page. */
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &pcur, &mtr);
+ }
+
+next_rec:
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ mem_heap_free(heap);
+
+ /* Done with index truncation; release the index tree locks.
+ The subsequent work relates to table-level metadata changes. */
+ dict_table_x_unlock_indexes(table);
+
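+ /* Assign a new table id, as explained in point 2) above:
+ purge and rollback look up the table by its id, so they will
+ regard the pre-truncation records as belonging to a dropped
+ table and discard their operations. */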
+ dict_hdr_get_new_id(&new_id, NULL, NULL);
+
+ info = pars_info_create();
+
+ pars_info_add_int4_literal(info, "space", (lint) table->space);
+ pars_info_add_dulint_literal(info, "old_id", table->id);
+ pars_info_add_dulint_literal(info, "new_id", new_id);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES"
+ " SET ID = :new_id, SPACE = :space\n"
+ " WHERE ID = :old_id;\n"
+ "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "UPDATE SYS_INDEXES"
+ " SET TABLE_ID = :new_id, SPACE = :space\n"
+ " WHERE TABLE_ID = :old_id;\n"
+ "COMMIT WORK;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Unable to assign a new identifier to table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: after truncating it. Background processes"
+ " may corrupt the table!\n", stderr);
+ err = DB_ERROR;
+ } else {
+ dict_table_change_id_in_cache(table, new_id);
+ }
+
+ /* MySQL calls ha_innobase::reset_auto_increment() which does
+ the same thing. */
+ dict_table_autoinc_lock(table);
+ dict_table_autoinc_initialize(table, 1);
+ dict_table_autoinc_unlock(table);
+ dict_update_statistics(table, TRUE);
+
+ trx_commit_for_mysql(trx);
+
+funct_exit:
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL. If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread. If the data dictionary was not already locked
+by the transaction, the transaction will be committed. Otherwise, the
+data dictionary will remain locked.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_table_for_mysql(
+/*=====================*/
+ const char* name, /*!< in: table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool drop_db)/*!< in: TRUE=dropping whole database */
+{
+ dict_foreign_t* foreign;
+ dict_table_t* table;
+ ulint space_id;
+ ulint err;
+ const char* table_name;
+ ulint namelen;
+ ibool locked_dictionary = FALSE;
+ pars_info_t* info = NULL;
+
+ ut_a(name != NULL);
+
+ if (srv_created_new_raw) {
+ fputs("InnoDB: A new raw disk partition was initialized:\n"
+ "InnoDB: we do not allow database modifications"
+ " by the user.\n"
+ "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+ " is replaced with raw.\n", stderr);
+
+ return(DB_ERROR);
+ }
+
+ trx->op_info = "dropping table";
+
+ trx_start_if_not_started(trx);
+
+ /* The table name is prefixed with the database name and a '/'.
+ Certain table names starting with 'innodb_' have their special
+ meaning regardless of the database name. Thus, we need to
+ ignore the database name prefix in the comparisons. */
+ table_name = strchr(name, '/');
+ ut_a(table_name);
+ table_name++;
+ namelen = strlen(table_name) + 1;
+
+ if (namelen == sizeof S_innodb_monitor
+ && !memcmp(table_name, S_innodb_monitor,
+ sizeof S_innodb_monitor)) {
+
+ /* Table name equals "innodb_monitor":
+ stop monitor prints */
+
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_lock_monitor
+ && !memcmp(table_name, S_innodb_lock_monitor,
+ sizeof S_innodb_lock_monitor)) {
+ srv_print_innodb_monitor = FALSE;
+ srv_print_innodb_lock_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_tablespace_monitor
+ && !memcmp(table_name, S_innodb_tablespace_monitor,
+ sizeof S_innodb_tablespace_monitor)) {
+
+ srv_print_innodb_tablespace_monitor = FALSE;
+ } else if (namelen == sizeof S_innodb_table_monitor
+ && !memcmp(table_name, S_innodb_table_monitor,
+ sizeof S_innodb_table_monitor)) {
+
+ srv_print_innodb_table_monitor = FALSE;
+ }
+
+ /* Serialize data dictionary operations with the dictionary
+ mutex, so that no deadlocks can occur in these operations. */
+
+ if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+ /* Prevent foreign key checks etc. while we are dropping the
+ table */
+
+ row_mysql_lock_data_dictionary(trx);
+
+ locked_dictionary = TRUE;
+ }
+
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+ table = dict_table_get_low(name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is"
+ " trying to drop it.\n"
+ "InnoDB: Have you copied the .frm file"
+ " of the table to the\n"
+ "InnoDB: MySQL database directory"
+ " from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ }
+
+ /* Check if the table is referenced by foreign key constraints from
+ some other table (not the table itself) */
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign && foreign->foreign_table == table) {
+check_next_foreign:
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ if (foreign && trx->check_foreigns
+ && !(drop_db && dict_tables_have_same_db(
+ name, foreign->foreign_table_name))) {
+ FILE* ef = dict_foreign_err_file;
+
+ /* We only allow dropping a referenced table if
+ FOREIGN_KEY_CHECKS is set to 0 */
+
+ err = DB_CANNOT_DROP_CONSTRAINT;
+
+ mutex_enter(&dict_foreign_err_mutex);
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+ fputs(" Cannot drop table ", ef);
+ ut_print_name(ef, trx, TRUE, name);
+ fputs("\n"
+ "because it is referenced by ", ef);
+ ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+ putc('\n', ef);
+ mutex_exit(&dict_foreign_err_mutex);
+
+ goto funct_exit;
+ }
+
+ if (foreign && trx->check_foreigns) {
+ goto check_next_foreign;
+ }
+
+ if (table->n_mysql_handles_opened > 0) {
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table->name);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: MySQL is"
+ " trying to drop table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs("\n"
+ "InnoDB: though there are still"
+ " open handles to it.\n"
+ "InnoDB: Adding the table to the"
+ " background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+ /* TODO: could we replace the counter n_foreign_key_checks_running
+ with lock checks on the table? Acquire here an exclusive lock on the
+ table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+ they can cope with the table having been dropped here? Foreign key
+ checks take an IS or IX lock on the table. */
+
+ if (table->n_foreign_key_checks_running > 0) {
+
+ const char* table_name = table->name;
+ ibool added;
+
+ added = row_add_table_to_background_drop_list(table_name);
+
+ if (added) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: You are trying to drop table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ fputs("\n"
+ "InnoDB: though there is a"
+ " foreign key check running on it.\n"
+ "InnoDB: Adding the table to"
+ " the background drop queue.\n",
+ stderr);
+
+ /* We return DB_SUCCESS to MySQL though the drop will
+ happen lazily later */
+
+ err = DB_SUCCESS;
+ } else {
+ /* The table is already in the background drop list */
+ err = DB_ERROR;
+ }
+
+ goto funct_exit;
+ }
+
+ /* Remove all locks on the table and on its records */
+ lock_remove_all_on_table(table, TRUE);
+
+ trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+ trx->table_id = table->id;
+
+ /* We use the private SQL parser of Innobase to generate the
+ query graphs needed in deleting the dictionary data from system
+ tables in Innobase. Deleting a row from SYS_INDEXES table also
+ frees the file segments of the B-tree associated with the index. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "table_name", name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE DROP_TABLE_PROC () IS\n"
+ "sys_foreign_id CHAR;\n"
+ "table_id CHAR;\n"
+ "index_id CHAR;\n"
+ "foreign_id CHAR;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "SELECT ID INTO table_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = :table_name\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " RETURN;\n"
+ "END IF;\n"
+ "found := 1;\n"
+ "SELECT ID INTO sys_foreign_id\n"
+ "FROM SYS_TABLES\n"
+ "WHERE NAME = 'SYS_FOREIGN'\n"
+ "LOCK IN SHARE MODE;\n"
+ "IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (:table_name = 'SYS_FOREIGN') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n"
+ " found := 0;\n"
+ "END IF;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "found := 1;\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO index_id\n"
+ " FROM SYS_INDEXES\n"
+ " WHERE TABLE_ID = table_id\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " DELETE FROM SYS_STATS\n"
+ " WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_FIELDS\n"
+ " WHERE INDEX_ID = index_id;\n"
+ " DELETE FROM SYS_INDEXES\n"
+ " WHERE ID = index_id\n"
+ " AND TABLE_ID = table_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "DELETE FROM SYS_COLUMNS\n"
+ "WHERE TABLE_ID = table_id;\n"
+ "DELETE FROM SYS_TABLES\n"
+ "WHERE ID = table_id;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ switch (err) {
+ ibool is_temp;
+ const char* name_or_path;
+ mem_heap_t* heap;
+
+ case DB_SUCCESS:
+
+ heap = mem_heap_create(200);
+
+ /* Clone the name, in case it has been allocated
+ from table->heap, which will be freed by
+ dict_table_remove_from_cache(table) below. */
+ name = mem_heap_strdup(heap, name);
+ space_id = table->space;
+
+ if (table->dir_path_of_temp_table != NULL) {
+ name_or_path = mem_heap_strdup(
+ heap, table->dir_path_of_temp_table);
+ is_temp = TRUE;
+ } else {
+ name_or_path = name;
+ is_temp = (table->flags >> DICT_TF2_SHIFT)
+ & DICT_TF2_TEMPORARY;
+ }
+
+ dict_table_remove_from_cache(table);
+
+ if (dict_load_table(name) != NULL) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: not able to remove table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs(" from the dictionary cache!\n", stderr);
+ err = DB_ERROR;
+ }
+
+ /* Do not drop a possible .ibd tablespace if something went
+ wrong: we do not want to delete the user's valuable data */
+
+ if (err == DB_SUCCESS && !trx_sys_sys_space(space_id)) {
+ if (!fil_space_for_table_exists_in_mem(space_id,
+ name_or_path,
+ is_temp, FALSE,
+ !is_temp)) {
+ err = DB_SUCCESS;
+
+ fprintf(stderr,
+ "InnoDB: We removed now the InnoDB"
+ " internal data dictionary entry\n"
+ "InnoDB: of table ");
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, ".\n");
+ } else if (!fil_delete_tablespace(space_id)) {
+ fprintf(stderr,
+ "InnoDB: We removed now the InnoDB"
+ " internal data dictionary entry\n"
+ "InnoDB: of table ");
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, ".\n");
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: not able to"
+ " delete tablespace %lu of table ",
+ (ulong) space_id);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("!\n", stderr);
+ err = DB_ERROR;
+ }
+ }
+
+ mem_heap_free(heap);
+ break;
+
+ case DB_TOO_MANY_CONCURRENT_TRXS:
+ /* Cannot even find a free slot for
+ the undo log. We can directly exit here
+ and return the DB_TOO_MANY_CONCURRENT_TRXS
+ error. */
+ break;
+
+ case DB_OUT_OF_FILE_SPACE:
+ err = DB_MUST_GET_MORE_FILE_SPACE;
+
+ row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+ /* Fall through to raise error */
+
+ default:
+ /* No other possible error returns */
+ ut_error;
+ }
+
+funct_exit:
+
+ if (locked_dictionary) {
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ trx->op_info = "";
+
+ srv_wake_master_thread();
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void)
+/*============================*/
+{
+ trx_t* trx;
+ btr_pcur_t pcur;
+ mtr_t mtr;
+ mem_heap_t* heap;
+
+ trx = trx_allocate_for_background();
+ trx->op_info = "dropping temporary tables";
+ row_mysql_lock_data_dictionary(trx);
+
+ heap = mem_heap_create(200);
+
+ mtr_start(&mtr);
+
+ btr_pcur_open_at_index_side(
+ TRUE,
+ dict_table_get_first_index(dict_sys->sys_tables),
+ BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ for (;;) {
+ const rec_t* rec;
+ const byte* field;
+ ulint len;
+ const char* table_name;
+ dict_table_t* table;
+
+ btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+ if (!btr_pcur_is_on_user_rec(&pcur)) {
+ break;
+ }
+
+ rec = btr_pcur_get_rec(&pcur);
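+ /* The high-order bit of SYS_TABLES.N_COLS is set for
+ ROW_FORMAT=COMPACT tables. Skip ROW_FORMAT=REDUNDANT tables
+ here, because their MIX_LEN field does not carry a valid
+ is_temp flag (see below). */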
+ field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+ if (len != 4 || !(mach_read_from_4(field) & 0x80000000UL)) {
+ continue;
+ }
+
+ /* Because this is not a ROW_FORMAT=REDUNDANT table,
+ the is_temp flag is valid. Examine it. */
+
+ field = rec_get_nth_field_old(rec, 7/*MIX_LEN*/, &len);
+ if (len != 4
+ || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
+ continue;
+ }
+
+ /* This is a temporary table. */
+ field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+ if (len == UNIV_SQL_NULL || len == 0) {
+ /* Corrupted SYS_TABLES.NAME */
+ continue;
+ }
+
+ table_name = mem_heap_strdupl(heap, (const char*) field, len);
+
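+ /* Store the cursor position and commit the mini-transaction,
+ releasing the latch on the SYS_TABLES page, before dropping
+ the table; the position is restored below. */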
+ btr_pcur_store_position(&pcur, &mtr);
+ btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+ table = dict_load_table(table_name);
+
+ if (table) {
+ row_drop_table_for_mysql(table_name, trx, FALSE);
+ trx_commit_for_mysql(trx);
+ }
+
+ mtr_start(&mtr);
+ btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &pcur, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+ row_mysql_unlock_data_dictionary(trx);
+ trx_free_for_background(trx);
+}
+
+/*******************************************************************//**
+Drop all foreign keys in a database, see Bug#18942.
+Called at the end of row_drop_database_for_mysql().
+@return error code or DB_SUCCESS */
+static
+ulint
+drop_all_foreign_keys_in_db(
+/*========================*/
+ const char* name, /*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* pinfo;
+ ulint err;
+
+ ut_a(name[strlen(name) - 1] == '/');
+
+ pinfo = pars_info_create();
+
+ pars_info_add_str_literal(pinfo, "dbname", name);
+
+/** true if for_name is not prefixed with dbname */
+#define TABLE_NOT_IN_THIS_DB \
+"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
+
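+ /* The cursor below starts at the first FOR_NAME >= :dbname,
+ and the loop stops as soon as a fetched FOR_NAME no longer
+ carries the :dbname prefix, so only foreign keys of tables in
+ this database are visited. */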
+ err = que_eval_sql(pinfo,
+ "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
+ "foreign_id CHAR;\n"
+ "for_name CHAR;\n"
+ "found INT;\n"
+ "DECLARE CURSOR cur IS\n"
+ "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
+ "WHERE FOR_NAME >= :dbname\n"
+ "LOCK IN SHARE MODE\n"
+ "ORDER BY FOR_NAME;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "OPEN cur;\n"
+ "WHILE found = 1 LOOP\n"
+ " FETCH cur INTO foreign_id, for_name;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
+ " found := 0;\n"
+ " ELSIF (1=1) THEN\n"
+ " DELETE FROM SYS_FOREIGN_COLS\n"
+ " WHERE ID = foreign_id;\n"
+ " DELETE FROM SYS_FOREIGN\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "CLOSE cur;\n"
+ "COMMIT WORK;\n"
+ "END;\n",
+ FALSE, /* do not reserve dict mutex,
+ we are already holding it */
+ trx);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Drops a database for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+ const char* name, /*!< in: database name which ends in '/' */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ dict_table_t* table;
+ char* table_name;
+ int err = DB_SUCCESS;
+ ulint namelen = strlen(name);
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(name != NULL);
+ ut_a(name[namelen - 1] == '/');
+
+ trx->op_info = "dropping database";
+
+ trx_start_if_not_started(trx);
+loop:
+ row_mysql_lock_data_dictionary(trx);
+
+ while ((table_name = dict_get_first_table_name_in_db(name))) {
+ ut_a(memcmp(table_name, name, namelen) == 0);
+
+ table = dict_table_get_low(table_name);
+
+ ut_a(table);
+
+ /* Wait until MySQL does not have any queries running on
+ the table */
+
+ if (table->n_mysql_handles_opened > 0) {
+ row_mysql_unlock_data_dictionary(trx);
+
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: MySQL is trying to"
+ " drop database ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fputs("\n"
+ "InnoDB: though there are still"
+ " open handles to table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ fputs(".\n", stderr);
+
+ os_thread_sleep(1000000);
+
+ mem_free(table_name);
+
+ goto loop;
+ }
+
+ err = row_drop_table_for_mysql(table_name, trx, TRUE);
+ trx_commit_for_mysql(trx);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, " failed with error %lu for table ",
+ (ulint) err);
+ ut_print_name(stderr, trx, TRUE, table_name);
+ putc('\n', stderr);
+ mem_free(table_name);
+ break;
+ }
+
+ mem_free(table_name);
+ }
+
+ if (err == DB_SUCCESS) {
+ /* After dropping all tables, try to drop any leftover
+ foreign keys, in case orphaned ones exist */
+ err = (int) drop_all_foreign_keys_in_db(name, trx);
+
+ if (err != DB_SUCCESS) {
+ fputs("InnoDB: DROP DATABASE ", stderr);
+ ut_print_name(stderr, trx, TRUE, name);
+ fprintf(stderr, " failed with error %d while "
+ "dropping all foreign keys", err);
+ }
+ }
+
+ trx_commit_for_mysql(trx);
+
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks if a table name contains the string "/#sql" which denotes temporary
+tables in MySQL.
+@return TRUE if temporary table */
+static
+ibool
+row_is_mysql_tmp_table_name(
+/*========================*/
+ const char* name) /*!< in: table name in the form
+ 'database/tablename' */
+{
+ return(strstr(name, "/#sql") != NULL);
+ /* return(strstr(name, "/@0023sql") != NULL); */
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static
+int
+row_delete_constraint_low(
+/*======================*/
+ const char* id, /*!< in: constraint id */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ pars_info_t* info = pars_info_create();
+
+ pars_info_add_str_literal(info, "id", id);
+
+ return((int) que_eval_sql(info,
+ "PROCEDURE DELETE_CONSTRAINT () IS\n"
+ "BEGIN\n"
+ "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
+ "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
+ "END;\n"
+ , FALSE, trx));
+}
+
+/****************************************************************//**
+Delete a single constraint.
+@return error code or DB_SUCCESS */
+static
+int
+row_delete_constraint(
+/*==================*/
+ const char* id, /*!< in: constraint id */
+ const char* database_name, /*!< in: database name, with the
+ trailing '/' */
+ mem_heap_t* heap, /*!< in: memory heap */
+ trx_t* trx) /*!< in: transaction handle */
+{
+ ulint err;
+
+ /* New format constraints have ids <databasename>/<constraintname>. */
+ err = row_delete_constraint_low(
+ mem_heap_strcat(heap, database_name, id), trx);
+
+ if ((err == DB_SUCCESS) && !strchr(id, '/')) {
+ /* Old format < 4.0.18 constraints have constraint ids
+ NUMBER_NUMBER. We only try deleting them if the
+ constraint name does not contain a '/' character, otherwise
+ deleting a new format constraint named 'foo/bar' from
+ database 'baz' would remove constraint 'bar' from database
+ 'foo', if it existed. */
+
+ err = row_delete_constraint_low(id, trx);
+ }
+
+ return((int) err);
+}
+
+/*********************************************************************//**
+Renames a table for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_rename_table_for_mysql(
+/*=======================*/
+ const char* old_name, /*!< in: old table name */
+ const char* new_name, /*!< in: new table name */
+ trx_t* trx, /*!< in: transaction handle */
+ ibool commit) /*!< in: if TRUE then commit trx */
+{
+ dict_table_t* table;
+ ulint err = DB_ERROR;
+ mem_heap_t* heap = NULL;
+ const char** constraints_to_drop = NULL;
+ ulint n_constraints_to_drop = 0;
+ ibool old_is_tmp, new_is_tmp;
+ pars_info_t* info = NULL;
+
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+ ut_a(old_name != NULL);
+ ut_a(new_name != NULL);
+
+ if (srv_created_new_raw || srv_force_recovery) {
+ fputs("InnoDB: A new raw disk partition was initialized or\n"
+ "InnoDB: innodb_force_recovery is on: we do not allow\n"
+ "InnoDB: database modifications by the user. Shut down\n"
+ "InnoDB: mysqld and edit my.cnf so that newraw"
+ " is replaced\n"
+ "InnoDB: with raw, and innodb_force_... is removed.\n",
+ stderr);
+
+ goto funct_exit;
+ } else if (row_mysql_is_system_table(new_name)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to create a MySQL"
+ " system table %s of type InnoDB.\n"
+ "InnoDB: MySQL system tables must be"
+ " of the MyISAM type!\n",
+ new_name);
+
+ goto funct_exit;
+ }
+
+ trx->op_info = "renaming table";
+ trx_start_if_not_started(trx);
+
+ old_is_tmp = row_is_mysql_tmp_table_name(old_name);
+ new_is_tmp = row_is_mysql_tmp_table_name(new_name);
+
+ table = dict_table_get_low(old_name);
+
+ if (!table) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" does not exist in the InnoDB internal\n"
+ "InnoDB: data dictionary though MySQL is"
+ " trying to rename the table.\n"
+ "InnoDB: Have you copied the .frm file"
+ " of the table to the\n"
+ "InnoDB: MySQL database directory"
+ " from another database?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ } else if (table->ibd_file_missing) {
+ err = DB_TABLE_NOT_FOUND;
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Error: table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" does not have an .ibd file"
+ " in the database directory.\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+ stderr);
+ goto funct_exit;
+ } else if (new_is_tmp) {
+ /* MySQL is doing an ALTER TABLE command and it renames the
+ original table to a temporary table name. We want to preserve
+ the original foreign key constraint definitions despite the
+ name change. An exception is those constraints for which
+ the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+ heap = mem_heap_create(100);
+
+ err = dict_foreign_parse_drop_constraints(
+ heap, trx, table, &n_constraints_to_drop,
+ &constraints_to_drop);
+
+ if (err != DB_SUCCESS) {
+
+ goto funct_exit;
+ }
+ }
+
+ /* We use the private SQL parser of Innobase to generate the query
+ graphs needed in updating the dictionary data from system tables. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
+ err = que_eval_sql(info,
+ "PROCEDURE RENAME_TABLE () IS\n"
+ "BEGIN\n"
+ "UPDATE SYS_TABLES SET NAME = :new_table_name\n"
+ " WHERE NAME = :old_table_name;\n"
+ "END;\n"
+ , FALSE, trx);
+
+ if (err != DB_SUCCESS) {
+
+ goto end;
+ } else if (!new_is_tmp) {
+ /* Rename all constraints. */
+
+ info = pars_info_create();
+
+ pars_info_add_str_literal(info, "new_table_name", new_name);
+ pars_info_add_str_literal(info, "old_table_name", old_name);
+
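+ /* Constraint ids of the form <db>/<table>_ibfk_N get the old
+ table name prefix replaced with the new table name; other ids
+ containing '/' only have their database part replaced.
+ Old-format ids without '/' are left unchanged. */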
+ err = que_eval_sql(
+ info,
+ "PROCEDURE RENAME_CONSTRAINT_IDS () IS\n"
+ "gen_constr_prefix CHAR;\n"
+ "new_db_name CHAR;\n"
+ "foreign_id CHAR;\n"
+ "new_foreign_id CHAR;\n"
+ "old_db_name_len INT;\n"
+ "old_t_name_len INT;\n"
+ "new_db_name_len INT;\n"
+ "id_len INT;\n"
+ "found INT;\n"
+ "BEGIN\n"
+ "found := 1;\n"
+ "old_db_name_len := INSTR(:old_table_name, '/')-1;\n"
+ "new_db_name_len := INSTR(:new_table_name, '/')-1;\n"
+ "new_db_name := SUBSTR(:new_table_name, 0,\n"
+ " new_db_name_len);\n"
+ "old_t_name_len := LENGTH(:old_table_name);\n"
+ "gen_constr_prefix := CONCAT(:old_table_name,\n"
+ " '_ibfk_');\n"
+ "WHILE found = 1 LOOP\n"
+ " SELECT ID INTO foreign_id\n"
+ " FROM SYS_FOREIGN\n"
+ " WHERE FOR_NAME = :old_table_name\n"
+ " AND TO_BINARY(FOR_NAME)\n"
+ " = TO_BINARY(:old_table_name)\n"
+ " LOCK IN SHARE MODE;\n"
+ " IF (SQL % NOTFOUND) THEN\n"
+ " found := 0;\n"
+ " ELSE\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET FOR_NAME = :new_table_name\n"
+ " WHERE ID = foreign_id;\n"
+ " id_len := LENGTH(foreign_id);\n"
+ " IF (INSTR(foreign_id, '/') > 0) THEN\n"
+ " IF (INSTR(foreign_id,\n"
+ " gen_constr_prefix) > 0)\n"
+ " THEN\n"
+ " new_foreign_id :=\n"
+ " CONCAT(:new_table_name,\n"
+ " SUBSTR(foreign_id, old_t_name_len,\n"
+ " id_len - old_t_name_len));\n"
+ " ELSE\n"
+ " new_foreign_id :=\n"
+ " CONCAT(new_db_name,\n"
+ " SUBSTR(foreign_id,\n"
+ " old_db_name_len,\n"
+ " id_len - old_db_name_len));\n"
+ " END IF;\n"
+ " UPDATE SYS_FOREIGN\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " UPDATE SYS_FOREIGN_COLS\n"
+ " SET ID = new_foreign_id\n"
+ " WHERE ID = foreign_id;\n"
+ " END IF;\n"
+ " END IF;\n"
+ "END LOOP;\n"
+ "UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n"
+ "WHERE REF_NAME = :old_table_name\n"
+ " AND TO_BINARY(REF_NAME)\n"
+ " = TO_BINARY(:old_table_name);\n"
+ "END;\n"
+ , FALSE, trx);
+
+ } else if (n_constraints_to_drop > 0) {
+ /* Drop some constraints of tmp tables. */
+
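+ /* Note: db_name_len below includes the trailing '/', which
+ row_delete_constraint() expects in the database name. */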
+ ulint db_name_len = dict_get_db_name_len(old_name) + 1;
+ char* db_name = mem_heap_strdupl(heap, old_name,
+ db_name_len);
+ ulint i;
+
+ for (i = 0; i < n_constraints_to_drop; i++) {
+ err = row_delete_constraint(constraints_to_drop[i],
+ db_name, heap, trx);
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+ }
+ }
+
+end:
+ if (err != DB_SUCCESS) {
+ if (err == DB_DUPLICATE_KEY) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error; possible reasons:\n"
+ "InnoDB: 1) Table rename would cause"
+ " two FOREIGN KEY constraints\n"
+ "InnoDB: to have the same internal name"
+ " in case-insensitive comparison.\n"
+ "InnoDB: 2) table ", stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs(" exists in the InnoDB internal data\n"
+ "InnoDB: dictionary though MySQL is"
+ " trying to rename table ", stderr);
+ ut_print_name(stderr, trx, TRUE, old_name);
+ fputs(" to it.\n"
+ "InnoDB: Have you deleted the .frm file"
+ " and not used DROP TABLE?\n"
+ "InnoDB: You can look for further help from\n"
+ "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+ "InnoDB: If table ", stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs(" is a temporary table #sql..., then"
+ " it can be that\n"
+ "InnoDB: there are still queries running"
+ " on the table, and it will be\n"
+ "InnoDB: dropped automatically when"
+ " the queries end.\n"
+ "InnoDB: You can drop the orphaned table"
+ " inside InnoDB by\n"
+ "InnoDB: creating an InnoDB table with"
+ " the same name in another\n"
+ "InnoDB: database and copying the .frm file"
+ " to the current database.\n"
+ "InnoDB: Then MySQL thinks the table exists,"
+ " and DROP TABLE will\n"
+ "InnoDB: succeed.\n", stderr);
+ }
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ } else {
+ /* The following call will also rename the .ibd data file if
+ the table is stored in a single-table tablespace */
+
+ if (!dict_table_rename_in_cache(table, new_name,
+ !new_is_tmp)) {
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ goto funct_exit;
+ }
+
+ /* We only want to switch off some of the type checking in
+ an ALTER, not in a RENAME. */
+
+ err = dict_load_foreigns(
+ new_name, FALSE, !old_is_tmp || trx->check_foreigns);
+
+ if (err != DB_SUCCESS) {
+ ut_print_timestamp(stderr);
+
+ if (old_is_tmp) {
+ fputs(" InnoDB: Error: in ALTER TABLE ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: has or is referenced"
+ " in foreign key constraints\n"
+ "InnoDB: which are not compatible"
+ " with the new table definition.\n",
+ stderr);
+ } else {
+ fputs(" InnoDB: Error: in RENAME TABLE"
+ " table ",
+ stderr);
+ ut_print_name(stderr, trx, TRUE, new_name);
+ fputs("\n"
+ "InnoDB: is referenced in"
+ " foreign key constraints\n"
+ "InnoDB: which are not compatible"
+ " with the new table definition.\n",
+ stderr);
+ }
+
+ ut_a(dict_table_rename_in_cache(table,
+ old_name, FALSE));
+ trx->error_state = DB_SUCCESS;
+ trx_general_rollback_for_mysql(trx, NULL);
+ trx->error_state = DB_SUCCESS;
+ }
+ }
+
+funct_exit:
+
+ if (commit) {
+ trx_commit_for_mysql(trx);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*********************************************************************//**
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return TRUE if ok */
+UNIV_INTERN
+ibool
+row_check_index_for_mysql(
+/*======================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct
+ in MySQL handle */
+ const dict_index_t* index, /*!< in: index */
+ ulint* n_rows) /*!< out: number of entries
+ seen in the consistent read */
+{
+ dtuple_t* prev_entry = NULL;
+ ulint matched_fields;
+ ulint matched_bytes;
+ byte* buf;
+ ulint ret;
+ rec_t* rec;
+ ibool is_ok = TRUE;
+ int cmp;
+ ibool contains_null;
+ ulint i;
+ ulint cnt;
+ mem_heap_t* heap = NULL;
+ ulint n_ext;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ *n_rows = 0;
+
+ buf = mem_alloc(UNIV_PAGE_SIZE);
+ heap = mem_heap_create(100);
+
+ cnt = 1000;
+
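+ /* The first search call positions the cursor; the call at the
+ bottom of the loop passes ROW_SEL_NEXT to fetch the successive
+ records. */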
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+loop:
+ /* Check thd->killed every 1,000 scanned rows */
+ if (--cnt == 0) {
+ if (trx_is_interrupted(prebuilt->trx)) {
+ goto func_exit;
+ }
+ cnt = 1000;
+ }
+
+ switch (ret) {
+ case DB_SUCCESS:
+ break;
+ default:
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Warning: CHECK TABLE on ", stderr);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+ fprintf(stderr, " returned %lu\n", ret);
+ /* fall through (this error is ignored by CHECK TABLE) */
+ case DB_END_OF_INDEX:
+func_exit:
+ mem_free(buf);
+ mem_heap_free(heap);
+
+ return(is_ok);
+ }
+
+ *n_rows = *n_rows + 1;
+
+ /* row_search... returns the index record in buf; the record origin
+ offset within buf is stored in the first 4 bytes, because we have
+ built a dummy template. */
+
+ rec = buf + mach_read_from_4(buf);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (prev_entry != NULL) {
+ matched_fields = 0;
+ matched_bytes = 0;
+
+ cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+ &matched_fields,
+ &matched_bytes);
+ contains_null = FALSE;
+
+ /* In a unique secondary index we allow equal key values if
+ they contain SQL NULLs */
+
+ for (i = 0;
+ i < dict_index_get_n_ordering_defined_by_user(index);
+ i++) {
+ if (UNIV_SQL_NULL == dfield_get_len(
+ dtuple_get_nth_field(prev_entry, i))) {
+
+ contains_null = TRUE;
+ }
+ }
+
+ if (cmp > 0) {
+ fputs("InnoDB: index records in a wrong order in ",
+ stderr);
+not_ok:
+ dict_index_name_print(stderr,
+ prebuilt->trx, index);
+ fputs("\n"
+ "InnoDB: prev record ", stderr);
+ dtuple_print(stderr, prev_entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ is_ok = FALSE;
+ } else if (dict_index_is_unique(index)
+ && !contains_null
+ && matched_fields
+ >= dict_index_get_n_ordering_defined_by_user(
+ index)) {
+
+ fputs("InnoDB: duplicate key in ", stderr);
+ goto not_ok;
+ }
+ }
+
+ {
+ mem_heap_t* tmp_heap = NULL;
+
+ /* Empty the heap on each round. But preserve offsets[]
+ for the row_rec_to_index_entry() call, by copying them
+ into a separate memory heap when needed. */
+ if (UNIV_UNLIKELY(offsets != offsets_)) {
+ ulint size = rec_offs_get_n_alloc(offsets)
+ * sizeof *offsets;
+
+ tmp_heap = mem_heap_create(size);
+ offsets = mem_heap_dup(tmp_heap, offsets, size);
+ }
+
+ mem_heap_empty(heap);
+
+ prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec,
+ index, offsets,
+ &n_ext, heap);
+
+ if (UNIV_LIKELY_NULL(tmp_heap)) {
+ mem_heap_free(tmp_heap);
+ }
+ }
+
+ ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
+
+ goto loop;
+}
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return TRUE if monitor table */
+UNIV_INTERN
+ibool
+row_is_magic_monitor_table(
+/*=======================*/
+ const char* table_name) /*!< in: name of the table, in the
+ form database/table_name */
+{
+ const char* name; /* table_name without database/ */
+ ulint len;
+
+ name = strchr(table_name, '/');
+ ut_a(name != NULL);
+ name++;
+ len = strlen(name) + 1;
+
+ if (STR_EQ(name, len, S_innodb_monitor)
+ || STR_EQ(name, len, S_innodb_lock_monitor)
+ || STR_EQ(name, len, S_innodb_tablespace_monitor)
+ || STR_EQ(name, len, S_innodb_table_monitor)
+ || STR_EQ(name, len, S_innodb_mem_validate)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
diff --git a/storage/xtradb/row/row0purge.c b/storage/xtradb/row/row0purge.c
new file mode 100644
index 00000000000..835af990672
--- /dev/null
+++ b/storage/xtradb/row/row0purge.c
@@ -0,0 +1,700 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0purge.c
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "log0log.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log for that operation before it starts. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/********************************************************************//**
+Creates a purge node to a query graph.
+@return own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ purge_node_t* node;
+
+ ut_ad(parent && heap);
+
+ node = mem_heap_alloc(heap, sizeof(purge_node_t));
+
+ node->common.type = QUE_NODE_PURGE;
+ node->common.parent = parent;
+
+ node->heap = mem_heap_create(256);
+
+ return(node);
+}
+
+/***********************************************************//**
+Repositions the pcur in the purge node on the clustered index record,
+if found.
+@return TRUE if the record was found */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+ ulint mode, /*!< in: latching mode */
+ purge_node_t* node, /*!< in: row purge node */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool found;
+
+ if (node->found_clust) {
+ found = btr_pcur_restore_position(mode, &(node->pcur), mtr);
+
+ return(found);
+ }
+
+ found = row_search_on_row_ref(&(node->pcur), mode, node->table,
+ node->ref, mtr);
+ node->found_clust = found;
+
+ if (found) {
+ btr_pcur_store_position(&(node->pcur), mtr);
+ }
+
+ return(found);
+}
+
+/***********************************************************//**
+Removes a delete marked clustered index record if possible.
+@return TRUE if success, or if not found, or if modified after the
+delete marking */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ success = row_purge_reposition_pcur(mode, node, &mtr);
+
+ if (!success) {
+ /* The record is already removed */
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (0 != ut_dulint_cmp(node->roll_ptr, row_get_rec_roll_ptr(
+ rec, index, rec_get_offsets(
+ rec, index, offsets_,
+ ULINT_UNDEFINED, &heap)))) {
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ /* Someone else has modified the record later: do not remove */
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(TRUE);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NONE, &mtr);
+
+ if (err == DB_SUCCESS) {
+ success = TRUE;
+ } else if (err == DB_OUT_OF_FILE_SPACE) {
+ success = FALSE;
+ } else {
+ ut_error;
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ return(success);
+}
+
+/***********************************************************//**
+Removes a clustered index record if it has not been modified after the delete
+marking. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+ purge_node_t* node) /*!< in: row purge node */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing clustered record\n", stderr); */
+
+ success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible.
+@return TRUE if success or if not found */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has = 0; /* remove warning */
+ ibool found;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ if (!found) {
+ /* Not found. This is a legitimate condition. In a
+ rollback, InnoDB will remove secondary recs that would
+ be purged anyway. Then the actual purge will not find
+ the secondary index record. Also, the purge itself is
+ eager: if it comes to consider a secondary index
+ record, and notices it does not need to exist in the
+ index, it will remove it. Then if/when the purge
+ comes to consider the secondary index record a second
+ time, it will not exist any more in the index. */
+
+ /* fputs("PURGE:........sec entry not found\n", stderr); */
+ /* dtuple_print(stderr, entry); */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ /* We should remove the index record if no later version of the row,
+ which cannot be purged yet, requires its existence. If some later
+ version requires it, we should do nothing. */
+
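+ /* The check of older row versions runs in its own
+ mini-transaction (mtr_vers), which is committed before the
+ secondary index record is deleted under mtr. */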
+ mtr_start(&mtr_vers);
+
+ success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers);
+
+ if (success) {
+ old_has = row_vers_old_has_index_entry(
+ TRUE, btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+ if (!success || !old_has) {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NONE, &mtr);
+ success = err == DB_SUCCESS;
+ ut_a(success || err == DB_OUT_OF_FILE_SPACE);
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(success);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+ purge_node_t* node, /*!< in: row purge node */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ ibool success;
+ ulint n_tries = 0;
+
+ /* fputs("Purge: Removing secondary record\n", stderr); */
+
+ success = row_purge_remove_sec_if_poss_low(node, index, entry,
+ BTR_MODIFY_LEAF);
+ if (success) {
+
+ return;
+ }
+retry:
+ success = row_purge_remove_sec_if_poss_low(node, index, entry,
+ BTR_MODIFY_TREE);
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ ut_a(success);
+}
+
+/***********************************************************//**
+Purges a delete marking of a record. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+ purge_node_t* node) /*!< in: row purge node */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+
+ ut_ad(node);
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ /* Build the index entry */
+ entry = row_build_index_entry(node->row, NULL, index, heap);
+ ut_a(entry);
+ row_purge_remove_sec_if_poss(node, index, entry);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ row_purge_remove_clust_if_poss(node);
+}
+
+/***********************************************************//**
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern(
+/*==========================*/
+ purge_node_t* node) /*!< in: row purge node */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ibool is_insert;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ ulint i;
+ mtr_t mtr;
+
+ ut_ad(node);
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ goto skip_secondaries;
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field_binary(NULL, node->index,
+ node->update)) {
+ /* Build the older version of the index entry */
+ entry = row_build_index_entry(node->row, NULL,
+ index, heap);
+ ut_a(entry);
+ row_purge_remove_sec_if_poss(node, index, entry);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+skip_secondaries:
+ /* Free possible externally stored fields */
+ for (i = 0; i < upd_get_n_fields(node->update); i++) {
+
+ const upd_field_t* ufield
+ = upd_get_nth_field(node->update, i);
+
+ if (dfield_is_ext(&ufield->new_val)) {
+ buf_block_t* block;
+ ulint internal_offset;
+ byte* data_field;
+
+ /* We use the fact that new_val points to
+ node->undo_rec and get thus the offset of
+ dfield data inside the undo record. Then we
+ can calculate from node->roll_ptr the file
+ address of the new_val data */
+
+ internal_offset
+ = ((const byte*)
+ dfield_get_data(&ufield->new_val))
+ - node->undo_rec;
+
+ ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+ trx_undo_decode_roll_ptr(node->roll_ptr,
+ &is_insert, &rseg_id,
+ &page_no, &offset);
+ mtr_start(&mtr);
+
+ /* We have to acquire an X-latch to the clustered
+ index tree */
+
+ index = dict_table_get_first_index(node->table);
+
+ mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+ /* NOTE: we must also acquire an X-latch to the
+ root page of the tree. We will need it when we
+ free pages from the tree. If the tree is of height 1,
+ the tree X-latch does NOT protect the root page,
+ because it is also a leaf page. Since we will have a
+ latch on an undo log page, we would break the
+ latching order if we would only later latch the
+ root page of such a tree! */
+
+ btr_root_get(index, &mtr);
+
+ /* We assume in purge of externally stored fields
+ that the space id of the undo log record is 0! */
+
+ block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ data_field = buf_block_get_frame(block)
+ + offset + internal_offset;
+
+ ut_a(dfield_get_len(&ufield->new_val)
+ >= BTR_EXTERN_FIELD_REF_SIZE);
+ btr_free_externally_stored_field(
+ index,
+ data_field + dfield_get_len(&ufield->new_val)
+ - BTR_EXTERN_FIELD_REF_SIZE,
+ NULL, NULL, NULL, 0, RB_NONE, &mtr);
+ mtr_commit(&mtr);
+ }
+ }
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record.
+@return TRUE if purge operation required: NOTE that then the CALLER
+must unfreeze data dictionary! */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+ purge_node_t* node, /*!< in: row undo node */
+ ibool* updated_extern,
+ /*!< out: TRUE if an externally stored field
+ was updated */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ trx_t* trx;
+ undo_no_t undo_no;
+ dulint table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+
+ ut_ad(node && thr);
+
+ trx = thr_get_trx(thr);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ updated_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+
+ return(FALSE);
+ }
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+ node->table = NULL;
+
+ if (type == TRX_UNDO_UPD_EXIST_REC
+ && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) {
+
+ /* Purge requires no changes to indexes: we may return */
+
+ return(FALSE);
+ }
+
+ /* Prevent DROP TABLE etc. from running when we are doing the purge
+ for this row */
+
+ row_mysql_freeze_data_dictionary(trx);
+
+ mutex_enter(&(dict_sys->mutex));
+
+ node->table = dict_table_get_on_id_low(table_id);
+
+ mutex_exit(&(dict_sys->mutex));
+
+ if (node->table == NULL) {
+ /* The table has been dropped: no need to do purge */
+err_exit:
+ row_mysql_unfreeze_data_dictionary(trx);
+ return(FALSE);
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip purge of missing .ibd files */
+
+ node->table = NULL;
+
+ goto err_exit;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index == NULL) {
+ /* The table was corrupt in the data dictionary */
+
+ goto err_exit;
+ }
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
+ ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+
+ /* Read to the partial row the fields that occur in indexes */
+
+ if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ ptr = trx_undo_rec_get_partial_row(
+ ptr, clust_index, &node->row,
+ type == TRX_UNDO_UPD_DEL_REC,
+ node->heap);
+ }
+
+ return(TRUE);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the purge for the recorded operation.
+If none left, or the current purge completed, returns the control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_purge(
+/*======*/
+ purge_node_t* node, /*!< in: row purge node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_ptr_t roll_ptr;
+ ibool purge_needed;
+ ibool updated_extern;
+ trx_t* trx;
+
+ ut_ad(node && thr);
+
+ trx = thr_get_trx(thr);
+
+ node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+ &(node->reservation),
+ node->heap);
+ if (!node->undo_rec) {
+ /* Purge completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+
+ if (node->undo_rec == &trx_purge_dummy_rec) {
+ purge_needed = FALSE;
+ } else {
+ purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
+ thr);
+ /* If purge_needed == TRUE, we must also remember to unfreeze
+ data dictionary! */
+ }
+
+ if (purge_needed) {
+ node->found_clust = FALSE;
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+ row_purge_del_mark(node);
+
+ } else if (updated_extern
+ || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ row_purge_upd_exist_or_extern(node);
+ }
+
+ if (node->found_clust) {
+ btr_pcur_close(&(node->pcur));
+ }
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ trx_purge_rec_release(node->reservation);
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ purge_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+ err = row_purge(node, thr);
+
+ ut_ad(err == DB_SUCCESS);
+
+ return(thr);
+}
diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.c
new file mode 100644
index 00000000000..8e806a14a98
--- /dev/null
+++ b/storage/xtradb/row/row0row.c
@@ -0,0 +1,1179 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0row.c
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "data0type.h"
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "ha_prototypes.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "ut0mem.h"
+
+/*********************************************************************//**
+Gets the offset of trx id field, in bytes relative to the origin of
+a clustered index record.
+@return offset of DATA_TRX_ID */
+UNIV_INTERN
+ulint
+row_get_trx_id_offset(
+/*==================*/
+ const rec_t* rec __attribute__((unused)),
+ /*!< in: record */
+ dict_index_t* index, /*!< in: clustered index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ ulint pos;
+ ulint offset;
+ ulint len;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offset = rec_get_nth_field_offs(offsets, pos, &len);
+
+ ut_ad(len == DATA_TRX_ID_LEN);
+
+ return(offset);
+}
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+ const dtuple_t* row, /*!< in: row which should be
+ inserted or purged */
+ row_ext_t* ext, /*!< in: externally stored column prefixes,
+ or NULL */
+ dict_index_t* index, /*!< in: index on the table */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory for
+ the index entry is allocated */
+{
+ dtuple_t* entry;
+ ulint entry_len;
+ ulint i;
+
+ ut_ad(row && index && heap);
+ ut_ad(dtuple_check_typed(row));
+
+ entry_len = dict_index_get_n_fields(index);
+ entry = dtuple_create(heap, entry_len);
+
+ if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+ dtuple_set_n_fields_cmp(entry, entry_len);
+ /* There may only be externally stored columns
+ in a clustered index B-tree of a user table. */
+ ut_a(!ext);
+ } else {
+ dtuple_set_n_fields_cmp(
+ entry, dict_index_get_n_unique_in_tree(index));
+ }
+
+ for (i = 0; i < entry_len; i++) {
+ const dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = ind_field->col;
+ ulint col_no
+ = dict_col_get_no(col);
+ dfield_t* dfield
+ = dtuple_get_nth_field(entry, i);
+ const dfield_t* dfield2
+ = dtuple_get_nth_field(row, col_no);
+ ulint len
+ = dfield_get_len(dfield2);
+
+ dfield_copy(dfield, dfield2);
+
+ if (dfield_is_null(dfield) || ind_field->prefix_len == 0) {
+ continue;
+ }
+
+ /* If a column prefix index, take only the prefix.
+ Prefix-indexed columns may be externally stored. */
+ ut_ad(col->ord_part);
+
+ if (UNIV_LIKELY_NULL(ext)) {
+ /* See if the column is stored externally. */
+ const byte* buf = row_ext_lookup(ext, col_no,
+ &len);
+ if (UNIV_LIKELY_NULL(buf)) {
+ if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+ return(NULL);
+ }
+ dfield_set_data(dfield, buf, len);
+ }
+ } else if (dfield_is_ext(dfield)) {
+ ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ ut_a(ind_field->prefix_len <= len
+ || dict_index_is_clust(index));
+ }
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ind_field->prefix_len, len, dfield_get_data(dfield));
+ dfield_set_len(dfield, len);
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+ ulint type, /*!< in: ROW_COPY_POINTERS or
+ ROW_COPY_DATA; the latter
+ copies also the data fields to
+ heap while the first only
+ places pointers to data fields
+ on the index page, and thus is
+ more efficient */
+ const dict_index_t* index, /*!< in: clustered index */
+ const rec_t* rec, /*!< in: record in the clustered
+ index; NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the row dtuple is used! */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec,index)
+ or NULL, in which case this function
+ will invoke rec_get_offsets() */
+ const dict_table_t* col_table,
+ /*!< in: table, to check which
+ externally stored columns
+ occur in the ordering columns
+ of an index, or NULL if
+ index->table should be
+ consulted instead */
+ row_ext_t** ext, /*!< out, own: cache of
+ externally stored column
+ prefixes, or NULL */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* row;
+ const dict_table_t* table;
+ ulint n_fields;
+ ulint n_ext_cols;
+ ulint* ext_cols = NULL; /* remove warning */
+ ulint len;
+ ulint row_len;
+ byte* buf;
+ ulint i;
+ ulint j;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_ad(index && rec && heap);
+ ut_ad(dict_index_is_clust(index));
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &tmp_heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ if (type != ROW_COPY_POINTERS) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, (ulint*) offsets);
+ }
+
+ table = index->table;
+ row_len = dict_table_get_n_cols(table);
+
+ row = dtuple_create(heap, row_len);
+
+ dict_table_copy_types(row, table);
+
+ dtuple_set_info_bits(row, rec_get_info_bits(
+ rec, dict_table_is_comp(table)));
+
+ n_fields = rec_offs_n_fields(offsets);
+ n_ext_cols = rec_offs_n_extern(offsets);
+ if (n_ext_cols) {
+ ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols);
+ }
+
+ for (i = j = 0; i < n_fields; i++) {
+ dict_field_t* ind_field
+ = dict_index_get_nth_field(index, i);
+ const dict_col_t* col
+ = dict_field_get_col(ind_field);
+ ulint col_no
+ = dict_col_get_no(col);
+ dfield_t* dfield
+ = dtuple_get_nth_field(row, col_no);
+
+ if (ind_field->prefix_len == 0) {
+
+ const byte* field = rec_get_nth_field(
+ rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+
+ if (UNIV_LIKELY_NULL(col_table)) {
+ ut_a(col_no
+ < dict_table_get_n_cols(col_table));
+ col = dict_table_get_nth_col(
+ col_table, col_no);
+ }
+
+ if (col->ord_part) {
+ /* We will have to fetch prefixes of
+ externally stored columns that are
+ referenced by column prefixes. */
+ ext_cols[j++] = col_no;
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(row));
+
+ if (!ext) {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed. */
+ ut_ad(dict_table_get_format(index->table)
+ < DICT_TF_FORMAT_ZIP);
+ } else if (j) {
+ *ext = row_ext_create(j, ext_cols, row,
+ dict_table_zip_size(index->table),
+ heap);
+ } else {
+ *ext = NULL;
+ }
+
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(row);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+ const rec_t* rec, /*!< in: record in the index */
+ const dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* entry;
+ dfield_t* dfield;
+ ulint i;
+ const byte* field;
+ ulint len;
+ ulint rec_len;
+
+ ut_ad(rec && heap && index);
+	/* Because this function may be invoked by row0merge.c
+	on a record whose header is in a different format, the check
+	rec_offs_validate(rec, index, offsets) must be avoided here. */
+ ut_ad(n_ext);
+ *n_ext = 0;
+
+ rec_len = rec_offs_n_fields(offsets);
+
+ entry = dtuple_create(heap, rec_len);
+
+ dtuple_set_n_fields_cmp(entry,
+ dict_index_get_n_unique_in_tree(index));
+ ut_ad(rec_len == dict_index_get_n_fields(index));
+
+ dict_index_copy_types(entry, index, rec_len);
+
+ for (i = 0; i < rec_len; i++) {
+
+ dfield = dtuple_get_nth_field(entry, i);
+ field = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ if (rec_offs_nth_extern(offsets, i)) {
+ dfield_set_ext(dfield);
+ (*n_ext)++;
+ }
+ }
+
+ ut_ad(dtuple_check_typed(entry));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return own: index entry built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+ ulint type, /*!< in: ROW_COPY_DATA, or
+ ROW_COPY_POINTERS: the former
+ copies also the data fields to
+ heap as the latter only places
+ pointers to data fields on the
+ index page */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case
+ ROW_COPY_POINTERS the data
+ fields in the row will point
+ directly into this record,
+ therefore, the buffer page of
+ this record must be at least
+ s-latched and the latch held
+ as long as the dtuple is used! */
+ const dict_index_t* index, /*!< in: index */
+ ulint* offsets,/*!< in/out: rec_get_offsets(rec) */
+ ulint* n_ext, /*!< out: number of externally
+ stored columns */
+ mem_heap_t* heap) /*!< in: memory heap from which
+ the memory needed is allocated */
+{
+ dtuple_t* entry;
+ byte* buf;
+
+ ut_ad(rec && heap && index);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap);
+
+ dtuple_set_info_bits(entry,
+ rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+ return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+ ulint type, /*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+ the former copies also the data fields to
+ heap, whereas the latter only places pointers
+ to data fields on the index page */
+ dict_index_t* index, /*!< in: secondary index */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: in the case ROW_COPY_POINTERS
+ the data fields in the row will point
+ directly into this record, therefore,
+ the buffer page of this record must be
+ at least s-latched and the latch held
+ as long as the row reference is used! */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ dict_table_t* table;
+ dict_index_t* clust_index;
+ dfield_t* dfield;
+ dtuple_t* ref;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ byte* buf;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* tmp_heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(index && rec && heap);
+ ut_ad(!dict_index_is_clust(index));
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &tmp_heap);
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+
+ if (type == ROW_COPY_DATA) {
+ /* Take a copy of rec to heap */
+
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+
+ rec = rec_copy(buf, rec, offsets);
+ /* Avoid a debug assertion in rec_offs_validate(). */
+ rec_offs_make_valid(rec, index, offsets);
+ }
+
+ table = index->table;
+
+ clust_index = dict_table_get_first_index(table);
+
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
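+		/* For example, if the clustered key is defined on a
+		10-character prefix of a VARCHAR column while this
+		secondary index stores the full column, the value taken
+		from the secondary record must be truncated back to 10
+		characters before it can be used in the reference. */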
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (tmp_heap) {
+ mem_heap_free(tmp_heap);
+ }
+
+ return(ref);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+ dtuple_t* ref, /*!< in/out: row reference built;
+ see the NOTE below! */
+ const rec_t* rec, /*!< in: record in the index;
+ NOTE: the data fields in ref
+ will point directly into this
+ record, therefore, the buffer
+ page of this record must be at
+ least s-latched and the latch
+ held as long as the row
+ reference is used! */
+ const dict_index_t* index, /*!< in: secondary index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index)
+ or NULL */
+ trx_t* trx) /*!< in: transaction */
+{
+ const dict_index_t* clust_index;
+ dfield_t* dfield;
+ const byte* field;
+ ulint len;
+ ulint ref_len;
+ ulint pos;
+ ulint clust_col_prefix_len;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_offs_init(offsets_);
+
+ ut_a(ref);
+ ut_a(index);
+ ut_a(rec);
+ ut_ad(!dict_index_is_clust(index));
+
+ if (UNIV_UNLIKELY(!index->table)) {
+ fputs("InnoDB: table ", stderr);
+notfound:
+ ut_print_name(stderr, trx, TRUE, index->table_name);
+ fputs(" for index ", stderr);
+ ut_print_name(stderr, trx, FALSE, index->name);
+ fputs(" not found\n", stderr);
+ ut_error;
+ }
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (UNIV_UNLIKELY(!clust_index)) {
+ fputs("InnoDB: clust index for table ", stderr);
+ goto notfound;
+ }
+
+ if (!offsets) {
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ } else {
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ }
+
+ /* Secondary indexes must not contain externally stored columns. */
+ ut_ad(!rec_offs_any_extern(offsets));
+ ref_len = dict_index_get_n_unique(clust_index);
+
+ ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+ dict_index_copy_types(ref, clust_index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield = dtuple_get_nth_field(ref, i);
+
+ pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+ ut_a(pos != ULINT_UNDEFINED);
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+
+ dfield_set_data(dfield, field, len);
+
+ /* If the primary key contains a column prefix, then the
+ secondary index may contain a longer prefix of the same
+ column, or the full column, and we must adjust the length
+ accordingly. */
+
+ clust_col_prefix_len = dict_index_get_nth_field(
+ clust_index, i)->prefix_len;
+
+ if (clust_col_prefix_len > 0) {
+ if (len != UNIV_SQL_NULL) {
+
+ const dtype_t* dtype
+ = dfield_get_type(dfield);
+
+ dfield_set_len(dfield,
+ dtype_get_at_most_n_mbchars(
+ dtype->prtype,
+ dtype->mbminlen,
+ dtype->mbmaxlen,
+ clust_col_prefix_len,
+ len, (char*) field));
+ }
+ }
+ }
+
+ ut_ad(dtuple_check_typed(ref));
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***************************************************************//**
+Searches for the clustered index record of a row, given its row reference.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+ btr_pcur_t* pcur, /*!< out: persistent cursor, which must
+ be closed by the caller */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const dict_table_t* table, /*!< in: table */
+ const dtuple_t* ref, /*!< in: row reference */
+ mtr_t* mtr) /*!< in/out: mtr */
+{
+ ulint low_match;
+ rec_t* rec;
+ dict_index_t* index;
+
+ ut_ad(dtuple_check_typed(ref));
+
+ index = dict_table_get_first_index(table);
+
+ ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index));
+
+ btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_infimum(rec)) {
+
+ return(FALSE);
+ }
+
+ if (low_match != dtuple_get_n_fields(ref)) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: secondary index */
+ dict_index_t** clust_index,/*!< out: clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mem_heap_t* heap;
+ dtuple_t* ref;
+ dict_table_t* table;
+ btr_pcur_t pcur;
+ ibool found;
+ rec_t* clust_rec;
+
+ ut_ad(!dict_index_is_clust(index));
+
+ table = index->table;
+
+ heap = mem_heap_create(256);
+
+ ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+ found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+ clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+
+ mem_heap_free(heap);
+
+ btr_pcur_close(&pcur);
+
+ *clust_index = dict_table_get_first_index(table);
+
+ return(clust_rec);
+}
+
+/***************************************************************//**
+Searches an index record.
+@return TRUE if found */
+UNIV_INTERN
+ibool
+row_search_index_entry(
+/*===================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: index entry */
+ ulint mode, /*!< in: BTR_MODIFY_LEAF, ... */
+ btr_pcur_t* pcur, /*!< in/out: persistent cursor, which must
+ be closed by the caller */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint n_fields;
+ ulint low_match;
+ rec_t* rec;
+
+ ut_ad(dtuple_check_typed(entry));
+
+ btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+ low_match = btr_pcur_get_low_match(pcur);
+
+ rec = btr_pcur_get_rec(pcur);
+
+ n_fields = dtuple_get_n_fields(entry);
+
+ return(!page_rec_is_infimum(rec) && low_match == n_fields);
+}
+
+#include <my_sys.h>
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_INT using "prtype" and writes the result to "buf".
+If the data is in unknown format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+					formatted in hex */
+{
+ ulint ret;
+
+ if (data_len <= sizeof(ullint)) {
+
+ ullint value;
+ ibool unsigned_type = prtype & DATA_UNSIGNED;
+
+ value = mach_read_int_type((const byte*) data,
+ data_len, unsigned_type);
+
+ if (unsigned_type) {
+
+ ret = ut_snprintf(buf, buf_size, "%llu",
+ value) + 1;
+ } else {
+
+ ret = ut_snprintf(buf, buf_size, "%lld",
+ (long long) value) + 1;
+ }
+
+ } else {
+
+ *format_in_hex = TRUE;
+ ret = 0;
+ }
+
+ return(ut_min(ret, buf_size));
+}
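+
+/* A note on the integer format handled above: InnoDB stores signed
+integers on disk with the sign bit inverted, so that an unsigned
+byte-wise comparison orders the values correctly.  For a 1-byte signed
+column, for example, the stored byte 0x00 decodes to -128, 0x80 to 0
+and 0xFF to 127; see also the test values in test_row_raw_format_int()
+below. */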
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "prtype" and writes the
+result to "buf".
+If the data is in binary format, then nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE, otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf".
+The result is always '\0'-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating '\0').
+@return number of bytes that were written */
+static
+ulint
+row_raw_format_str(
+/*===============*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ ulint prtype, /*!< in: precise type */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size, /*!< in: output buffer size
+ in bytes */
+ ibool* format_in_hex) /*!< out: should the data be
+					formatted in hex */
+{
+ ulint charset_coll;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ /* we assume system_charset_info is UTF-8 */
+
+ charset_coll = dtype_get_charset_coll(prtype);
+
+ if (UNIV_LIKELY(dtype_is_utf8(prtype))) {
+
+ return(ut_str_sql_format(data, data_len, buf, buf_size));
+ }
+ /* else */
+
+ if (charset_coll == DATA_MYSQL_BINARY_CHARSET_COLL) {
+
+ *format_in_hex = TRUE;
+ return(0);
+ }
+ /* else */
+
+ return(innobase_raw_format(data, data_len, charset_coll,
+ buf, buf_size));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+ const char* data, /*!< in: raw data */
+ ulint data_len, /*!< in: raw data length
+ in bytes */
+ const dict_field_t* dict_field, /*!< in: index field */
+ char* buf, /*!< out: output buffer */
+ ulint buf_size) /*!< in: output buffer size
+ in bytes */
+{
+ ulint mtype;
+ ulint prtype;
+ ulint ret;
+ ibool format_in_hex;
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ if (data_len == UNIV_SQL_NULL) {
+
+ ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1;
+
+ return(ut_min(ret, buf_size));
+ }
+
+ mtype = dict_field->col->mtype;
+ prtype = dict_field->col->prtype;
+
+ format_in_hex = FALSE;
+
+ switch (mtype) {
+ case DATA_INT:
+
+ ret = row_raw_format_int(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+ break;
+ case DATA_CHAR:
+ case DATA_VARCHAR:
+ case DATA_MYSQL:
+ case DATA_VARMYSQL:
+
+ ret = row_raw_format_str(data, data_len, prtype,
+ buf, buf_size, &format_in_hex);
+ if (format_in_hex) {
+
+ goto format_in_hex;
+ }
+
+ break;
+ /* XXX support more data types */
+ default:
+ format_in_hex:
+
+ if (UNIV_LIKELY(buf_size > 2)) {
+
+ memcpy(buf, "0x", 2);
+ buf += 2;
+ buf_size -= 2;
+ ret = 2 + ut_raw_to_hex(data, data_len,
+ buf, buf_size);
+ } else {
+
+ buf[0] = '\0';
+ ret = 1;
+ }
+ }
+
+ return(ret);
+}
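+
+/* For example, the 4-byte unsigned value "\x00\x00\x00\x5C" is
+rendered by the DATA_INT branch above as the string "92" (3 bytes
+including the terminating NUL), whereas data of an unsupported type or
+in a binary character set falls through to the hex branch and is
+rendered as "0x...". */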
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include "ut0dbg.h"
+
+void
+test_row_raw_format_int()
+{
+ ulint ret;
+ char buf[128];
+ ibool format_in_hex;
+
+#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
+ ret_expected, buf_expected, format_in_hex_expected)\
+ do {\
+ ibool ok = TRUE;\
+ ulint i;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ format_in_hex = FALSE;\
+ fprintf(stderr, "TESTING \"\\x");\
+ for (i = 0; i < data_len; i++) {\
+ fprintf(stderr, "%02hhX", data[i]);\
+ }\
+ fprintf(stderr, "\", %lu, %lu, %lu\n",\
+ (ulint) data_len, (ulint) prtype,\
+ (ulint) buf_size);\
+ ret = row_raw_format_int(data, data_len, prtype,\
+ buf, buf_size, &format_in_hex);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (format_in_hex != format_in_hex_expected) {\
+ fprintf(stderr, "expected format_in_hex %d, got %d\n",\
+ (int) format_in_hex_expected,\
+ (int) format_in_hex);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\
+ (ulint) ret, buf, (int) format_in_hex);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+#if 1
+ /* min values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, 0,
+ buf, sizeof(buf), 5, "-128", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, 0,
+ buf, sizeof(buf), 7, "-32768", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, 0,
+ buf, sizeof(buf), 9, "-8388608", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, 0,
+ buf, sizeof(buf), 12, "-2147483648", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0,
+ buf, sizeof(buf), 14, "-549755813888", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0,
+ buf, sizeof(buf), 17, "-140737488355328", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0,
+ buf, sizeof(buf), 19, "-36028797018963968", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0,
+ buf, sizeof(buf), 21, "-9223372036854775808", 0);
+
+ /* min values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\x00", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 2, "0", 0);
+
+ /* max values for signed 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, 0,
+ buf, sizeof(buf), 4, "127", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, 0,
+ buf, sizeof(buf), 6, "32767", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, 0,
+ buf, sizeof(buf), 8, "8388607", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0,
+ buf, sizeof(buf), 11, "2147483647", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0,
+ buf, sizeof(buf), 13, "549755813887", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0,
+ buf, sizeof(buf), 16, "140737488355327", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0,
+ buf, sizeof(buf), 18, "36028797018963967", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0,
+ buf, sizeof(buf), 20, "9223372036854775807", 0);
+
+ /* max values for unsigned 1-8 byte integers */
+
+ CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 4, "255", 0);
+
+ CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "65535", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 9, "16777215", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 11, "4294967295", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED,
+ buf, sizeof(buf), 14, "1099511627775", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED,
+ buf, sizeof(buf), 16, "281474976710655", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED,
+ buf, sizeof(buf), 18, "72057594037927935", 0);
+
+ CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 21, "18446744073709551615", 0);
+
+ /* some random values */
+
+ CALL_AND_TEST("\x52", 1, 0,
+ buf, sizeof(buf), 4, "-46", 0);
+
+ CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "14", 0);
+
+ CALL_AND_TEST("\x62\xCE", 2, 0,
+ buf, sizeof(buf), 6, "-7474", 0);
+
+ CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "10710", 0);
+
+ CALL_AND_TEST("\x7F\xFF\x90", 3, 0,
+ buf, sizeof(buf), 5, "-112", 0);
+
+ CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "41238", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0,
+ buf, sizeof(buf), 3, "-9", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED,
+ buf, sizeof(buf), 3, "92", 0);
+
+ CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0,
+ buf, sizeof(buf), 6, "-9117", 0);
+
+ CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED,
+ buf, sizeof(buf), 6, "91234", 0);
+#endif
+
+ /* speed test */
+
+ speedo_t speedo;
+ ulint i;
+
+ speedo_reset(&speedo);
+
+ for (i = 0; i < 1000000; i++) {
+ row_raw_format_int("\x23", 1,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x23", 1,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ 0, buf, sizeof(buf),
+ &format_in_hex);
+ row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+ DATA_UNSIGNED, buf, sizeof(buf),
+ &format_in_hex);
+ }
+
+ speedo_show(&speedo);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.c
new file mode 100644
index 00000000000..2839d935167
--- /dev/null
+++ b/storage/xtradb/row/row0sel.c
@@ -0,0 +1,4964 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************//**
+@file row/row0sel.c
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "read0read.h"
+#include "buf0lru.h"
+#include "ha_prototypes.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH 16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT 1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT 100
+
+/* Flags for search shortcut */
+#define SEL_FOUND 0
+#define SEL_EXHAUSTED 1
+#define SEL_RETRY 2
+
+/********************************************************************//**
+Returns TRUE if the user-defined column in a secondary index record
+is alphabetically the same as the corresponding BLOB column in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the columns are equal */
+static
+ibool
+row_sel_sec_rec_is_for_blob(
+/*========================*/
+ ulint mtype, /*!< in: main type */
+ ulint prtype, /*!< in: precise type */
+ ulint mbminlen, /*!< in: minimum length of a
+ multi-byte character */
+ ulint mbmaxlen, /*!< in: maximum length of a
+ multi-byte character */
+ const byte* clust_field, /*!< in: the locally stored part of
+ the clustered index column, including
+ the BLOB pointer; the clustered
+ index record must be covered by
+ a lock or a page latch to protect it
+ against deletion (rollback or purge) */
+ ulint clust_len, /*!< in: length of clust_field */
+ const byte* sec_field, /*!< in: column in secondary index */
+ ulint sec_len, /*!< in: length of sec_field */
+ ulint zip_size) /*!< in: compressed page size, or 0 */
+{
+ ulint len;
+ byte buf[DICT_MAX_INDEX_COL_LEN];
+
+ len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
+ zip_size,
+ clust_field, clust_len);
+
+ if (UNIV_UNLIKELY(len == 0)) {
+ /* The BLOB was being deleted as the server crashed.
+ There should not be any secondary index records
+ referring to this clustered index record, because
+ btr_free_externally_stored_field() is called after all
+ secondary index entries of the row have been purged. */
+ return(FALSE);
+ }
+
+ len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
+ sec_len, len, (const char*) buf);
+
+ return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
+}
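+
+/* Only a prefix of the externally stored column is fetched above:
+buf holds DICT_MAX_INDEX_COL_LEN bytes, which is enough here because a
+secondary index can contain at most a column prefix of a BLOB column,
+never the complete value. */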
+
+/********************************************************************//**
+Returns TRUE if the user-defined column values in a secondary index record
+are alphabetically the same as the corresponding columns in the clustered
+index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the secondary record is equal to the corresponding
+fields in the clustered record, when compared with collation;
+FALSE if not equal or if the clustered record has been marked for deletion */
+static
+ibool
+row_sel_sec_rec_is_for_clust_rec(
+/*=============================*/
+ const rec_t* sec_rec, /*!< in: secondary index record */
+ dict_index_t* sec_index, /*!< in: secondary index */
+ const rec_t* clust_rec, /*!< in: clustered index record;
+ must be protected by a lock or
+ a page latch against deletion
+ in rollback or purge */
+ dict_index_t* clust_index) /*!< in: clustered index */
+{
+ const byte* sec_field;
+ ulint sec_len;
+ const byte* clust_field;
+ ulint n;
+ ulint i;
+ mem_heap_t* heap = NULL;
+ ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
+ ulint* clust_offs = clust_offsets_;
+ ulint* sec_offs = sec_offsets_;
+ ibool is_equal = TRUE;
+
+ rec_offs_init(clust_offsets_);
+ rec_offs_init(sec_offsets_);
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(clust_index->table))) {
+
+ /* The clustered index record is delete-marked;
+ it is not visible in the read view. Besides,
+ if there are any externally stored columns,
+ some of them may have already been purged. */
+ return(FALSE);
+ }
+
+ clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+ ULINT_UNDEFINED, &heap);
+ sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+ ULINT_UNDEFINED, &heap);
+
+ n = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+ for (i = 0; i < n; i++) {
+ const dict_field_t* ifield;
+ const dict_col_t* col;
+ ulint clust_pos;
+ ulint clust_len;
+ ulint len;
+
+ ifield = dict_index_get_nth_field(sec_index, i);
+ col = dict_field_get_col(ifield);
+ clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+ clust_field = rec_get_nth_field(
+ clust_rec, clust_offs, clust_pos, &clust_len);
+ sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+ len = clust_len;
+
+ if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {
+
+ if (rec_offs_nth_extern(clust_offs, clust_pos)) {
+ len -= BTR_EXTERN_FIELD_REF_SIZE;
+ }
+
+ len = dtype_get_at_most_n_mbchars(
+ col->prtype, col->mbminlen, col->mbmaxlen,
+ ifield->prefix_len, len, (char*) clust_field);
+
+ if (rec_offs_nth_extern(clust_offs, clust_pos)
+ && len < sec_len) {
+ if (!row_sel_sec_rec_is_for_blob(
+ col->mtype, col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ clust_field, clust_len,
+ sec_field, sec_len,
+ dict_table_zip_size(
+ clust_index->table))) {
+ goto inequal;
+ }
+
+ continue;
+ }
+ }
+
+ if (0 != cmp_data_data(col->mtype, col->prtype,
+ clust_field, len,
+ sec_field, sec_len)) {
+inequal:
+ is_equal = FALSE;
+ goto func_exit;
+ }
+ }
+
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(is_equal);
+}
+
+/*********************************************************************//**
+Creates a select node struct.
+@return own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ sel_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(sel_node_t));
+ node->common.type = QUE_NODE_SELECT;
+ node->state = SEL_NODE_OPEN;
+
+ node->plans = NULL;
+
+ return(node);
+}
+
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+ sel_node_t* node) /*!< in: select node struct */
+{
+ ulint i;
+ plan_t* plan;
+
+ if (node->plans != NULL) {
+ for (i = 0; i < node->n_tables; i++) {
+ plan = sel_node_get_nth_plan(node, i);
+
+ btr_pcur_close(&(plan->pcur));
+ btr_pcur_close(&(plan->clust_pcur));
+
+ if (plan->old_vers_heap) {
+ mem_heap_free(plan->old_vers_heap);
+ }
+ }
+ }
+}
+
+/*********************************************************************//**
+Evaluates the values in a select list. If there are aggregate functions,
+their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ exp = node->select_list;
+
+ while (exp) {
+ eval_exp(exp);
+
+ exp = que_node_get_next(exp);
+ }
+}
+
+/*********************************************************************//**
+Assigns the values in the select list to the possible into-variables in
+SELECT ... INTO ... */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+ sym_node_t* var, /*!< in: first variable in a list of variables */
+ sel_node_t* node) /*!< in: select node */
+{
+ que_node_t* exp;
+
+ if (var == NULL) {
+
+ return;
+ }
+
+ exp = node->select_list;
+
+ while (var) {
+ ut_ad(exp);
+
+ eval_node_copy_val(var->alias, exp);
+
+ exp = que_node_get_next(exp);
+ var = que_node_get_next(var);
+ }
+}
+
+/*********************************************************************//**
+Resets the aggregate value totals in the select list of an aggregate type
+query. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ func_node_t* func_node;
+
+ ut_ad(node->is_aggregate);
+
+ func_node = node->select_list;
+
+ while (func_node) {
+ eval_node_set_int_val(func_node, 0);
+
+ func_node = que_node_get_next(func_node);
+ }
+
+ node->aggregate_already_fetched = FALSE;
+}
+
+/*********************************************************************//**
+Copies the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+ sel_node_t* node) /*!< in: select node */
+{
+ sym_node_t* var;
+
+ var = UT_LIST_GET_FIRST(node->copy_variables);
+
+ while (var) {
+ eval_node_copy_val(var, var->alias);
+
+ var->indirection = NULL;
+
+ var = UT_LIST_GET_NEXT(col_var_list, var);
+ }
+}
+
+/*********************************************************************//**
+Fetches the column values from a record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+ dict_index_t* index, /*!< in: record index */
+ const rec_t* rec, /*!< in: record in a clustered or non-clustered
+ index; must be protected by a page latch */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ dfield_t* val;
+ ulint index_type;
+ ulint field_no;
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (dict_index_is_clust(index)) {
+ index_type = SYM_CLUST_FIELD_NO;
+ } else {
+ index_type = SYM_SEC_FIELD_NO;
+ }
+
+ while (column) {
+ mem_heap_t* heap = NULL;
+ ibool needs_copy;
+
+ field_no = column->field_nos[index_type];
+
+ if (field_no != ULINT_UNDEFINED) {
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+ field_no))) {
+
+ /* Copy an externally stored field to the
+ temporary heap, if possible. */
+
+ heap = mem_heap_create(1);
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ dict_table_zip_size(index->table),
+ field_no, &len, heap);
+
+ /* data == NULL means that the
+ externally stored field was not
+ written yet. This record
+ should only be seen by
+ recv_recovery_rollback_active() or any
+ TRX_ISO_READ_UNCOMMITTED
+ transactions. The InnoDB SQL parser
+ (the sole caller of this function)
+ does not implement READ UNCOMMITTED,
+ and it is not involved during rollback. */
+ ut_a(data);
+ ut_a(len != UNIV_SQL_NULL);
+
+ needs_copy = TRUE;
+ } else {
+ data = rec_get_nth_field(rec, offsets,
+ field_no, &len);
+
+ needs_copy = column->copy_val;
+ }
+
+ if (needs_copy) {
+ eval_node_copy_and_alloc_val(column, data,
+ len);
+ } else {
+ val = que_node_get_val(column);
+ dfield_set_data(val, data, len);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Allocates a prefetch buffer for a column when prefetching is done for
+the first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+ sym_node_t* column) /*!< in: symbol table node for a column */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+ column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+ * sizeof(sel_buf_t));
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = column->prefetch_buf + i;
+
+ sel_buf->data = NULL;
+
+ sel_buf->val_buf_size = 0;
+ }
+}
+
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+ sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
+{
+ sel_buf_t* sel_buf;
+ ulint i;
+
+ for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
+ sel_buf = prefetch_buf + i;
+
+ if (sel_buf->val_buf_size > 0) {
+
+ mem_free(sel_buf->data);
+ }
+ }
+}
+
+/*********************************************************************//**
+Pops the column values for a prefetched, cached row from the column prefetch
+buffers and places them in the val fields in the column nodes. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint val_buf_size;
+
+ ut_ad(plan->n_rows_prefetched > 0);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ val = que_node_get_val(column);
+
+ if (!column->copy_val) {
+ /* We did not really push any value for the
+ column */
+
+ ut_ad(!column->prefetch_buf);
+ ut_ad(que_node_get_val_buf_size(column) == 0);
+ ut_d(dfield_set_null(val));
+
+ goto next_col;
+ }
+
+ ut_ad(column->prefetch_buf);
+ ut_ad(!dfield_is_ext(val));
+
+ sel_buf = column->prefetch_buf + plan->first_prefetched;
+
+ data = sel_buf->data;
+ len = sel_buf->len;
+ val_buf_size = sel_buf->val_buf_size;
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ sel_buf->data = dfield_get_data(val);
+ sel_buf->len = dfield_get_len(val);
+ sel_buf->val_buf_size = que_node_get_val_buf_size(column);
+
+ dfield_set_data(val, data, len);
+ que_node_set_val_buf_size(column, val_buf_size);
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+
+ plan->n_rows_prefetched--;
+
+ plan->first_prefetched++;
+}
+
+/*********************************************************************//**
+Pushes the column values for a prefetched, cached row to the column prefetch
+buffers from the val fields in the column nodes. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+ plan_t* plan) /*!< in: plan node for a table */
+{
+ sym_node_t* column;
+ sel_buf_t* sel_buf;
+ dfield_t* val;
+ byte* data;
+ ulint len;
+ ulint pos;
+ ulint val_buf_size;
+
+ if (plan->n_rows_prefetched == 0) {
+ pos = 0;
+ plan->first_prefetched = 0;
+ } else {
+ pos = plan->n_rows_prefetched;
+
+ /* We have the convention that pushing new rows starts only
+ after the prefetch stack has been emptied: */
+
+ ut_ad(plan->first_prefetched == 0);
+ }
+
+ plan->n_rows_prefetched++;
+
+ ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+ column = UT_LIST_GET_FIRST(plan->columns);
+
+ while (column) {
+ if (!column->copy_val) {
+			/* It makes no sense to push pointers to database
+			page fields when we do not keep a latch on the page! */
+
+ goto next_col;
+ }
+
+ if (!column->prefetch_buf) {
+ /* Allocate a new prefetch buffer */
+
+ sel_col_prefetch_buf_alloc(column);
+ }
+
+ sel_buf = column->prefetch_buf + pos;
+
+ val = que_node_get_val(column);
+
+ data = dfield_get_data(val);
+ len = dfield_get_len(val);
+ val_buf_size = que_node_get_val_buf_size(column);
+
+ /* We must keep track of the allocated memory for
+ column values to be able to free it later: therefore
+ we swap the values for sel_buf and val */
+
+ dfield_set_data(val, sel_buf->data, sel_buf->len);
+ que_node_set_val_buf_size(column, sel_buf->val_buf_size);
+
+ sel_buf->data = data;
+ sel_buf->len = len;
+ sel_buf->val_buf_size = val_buf_size;
+next_col:
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
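+
+/* Note that both sel_push_prefetched_row() and sel_pop_prefetched_row()
+swap buffers instead of copying values: the column node and the
+prefetch slot simply exchange ownership of their heap-allocated
+buffers, so no value data is copied and no buffer is leaked. */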
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+ read_view_t* read_view, /*!< in: read view */
+	dict_index_t*	index,		/*!< in: clustered index of rec */
+ rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, plan->index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+
+ if (*old_vers_heap) {
+ mem_heap_empty(*old_vers_heap);
+ } else {
+ *old_vers_heap = mem_heap_create(512);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, index, offsets, read_view, offset_heap,
+ *old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ const rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_semi_consistent_read(
+ rec, mtr, clust_index, offsets, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Tests the conditions which determine when the index segment we are searching
+through has been exhausted.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved and the right sides of
+ comparisons evaluated */
+{
+ func_node_t* cond;
+
+ /* All conditions in end_conds are comparisons of a column to an
+ expression */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ /* Evaluate the left side of the comparison, i.e., get the
+ column value if there is an indirection */
+
+ eval_sym(cond->args);
+
+ /* Do the comparison */
+
+ if (!eval_cmp(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Tests the other conditions.
+@return TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+ plan_t* plan) /*!< in: plan for the table; the column values must
+ already have been retrieved */
+{
+ func_node_t* cond;
+
+ cond = UT_LIST_GET_FIRST(plan->other_conds);
+
+ while (cond) {
+ eval_exp(cond);
+
+ if (!eval_node_get_ibool_val(cond)) {
+
+ return(FALSE);
+ }
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+ sel_node_t* node, /*!< in: select_node */
+ plan_t* plan, /*!< in: plan node for table */
+ rec_t* rec, /*!< in: record in a non-clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+				in the read view, i.e., it was a freshly
+				inserted version */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* index;
+ rec_t* clust_rec;
+ rec_t* old_vers;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ *out_rec = NULL;
+
+ offsets = rec_get_offsets(rec,
+ btr_pcur_get_btr_cur(&plan->pcur)->index,
+ offsets, ULINT_UNDEFINED, &heap);
+
+ row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+ index = dict_table_get_first_index(plan->table);
+
+ btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+ BTR_SEARCH_LEAF, &plan->clust_pcur,
+ 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(&(plan->clust_pcur))
+ < dict_index_get_n_unique(index)) {
+
+ ut_a(rec_get_deleted_flag(rec,
+ dict_table_is_comp(plan->table)));
+ ut_a(node->read_view);
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.c
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ goto func_exit;
+ }
+
+ offsets = rec_get_offsets(clust_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->read_view) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation level
+ we lock only the record, i.e., next-key locking is
+ not used. */
+ ulint lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(&plan->clust_pcur),
+ clust_rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ /* Declare the variable uninitialized in Valgrind.
+ It should be set to DB_SUCCESS at func_exit. */
+ UNIV_MEM_INVALID(&err, sizeof err);
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, clust_rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+
+ if (clust_rec == NULL) {
+ goto func_exit;
+ }
+ }
+
+		/* If we had to go to an earlier version of the row, or the
+		secondary index record is delete-marked, then it may be that
+		the secondary index record corresponding to clust_rec
+		(or old_vers) is not rec; in that case we must ignore
+		the row, because in our snapshot rec would not have existed.
+		Remember that from rec alone we cannot see which transaction
+		id corresponds to it: we have to go to the clustered index
+		record. A query that fetches all rows where the secondary
+		index value lies in some interval would return a wrong
+		result if we did not drop rows that we reach through
+		secondary index records which do not really exist in our
+		snapshot. */
+
+ if ((old_vers
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ plan->table)))
+ && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
+ clust_rec, index)) {
+ goto func_exit;
+ }
+ }
+
+ /* Fetch the columns needed in test conditions. The clustered
+ index record is protected by a page latch that was acquired
+ when plan->clust_pcur was positioned. The latch will not be
+ released until mtr_commit(mtr). */
+
+ ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
+ row_sel_fetch_columns(index, clust_rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+ *out_rec = clust_rec;
+func_exit:
+ err = DB_SUCCESS;
+err_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/*********************************************************************//**
+Sets a lock on a record.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+enum db_err
+sel_set_rec_lock(
+/*=============*/
+ const buf_block_t* block, /*!< in: buffer block of rec */
+ const rec_t* rec, /*!< in: record */
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ ulint mode, /*!< in: lock mode */
+ ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ trx_t* trx;
+ enum db_err err;
+
+ trx = thr_get_trx(thr);
+
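+	/* If this transaction already holds a very large number of lock
+	structs and the buffer pool is running out of space, refuse to
+	take yet another lock and report DB_LOCK_TABLE_FULL. */
+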
+ if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+ }
+
+ if (dict_index_is_clust(index)) {
+ err = lock_clust_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, mode, type, thr);
+ } else {
+ err = lock_sec_rec_read_check_and_lock(
+ 0, block, rec, index, offsets, mode, type, thr);
+ }
+
+ return(err);
+}
+
+/*********************************************************************//**
+Opens a pcur to a table index. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+ plan_t* plan, /*!< in: table plan */
+ ibool search_latch_locked,
+ /*!< in: TRUE if the thread currently
+ has the search latch locked in
+ s-mode */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ func_node_t* cond;
+ que_node_t* exp;
+ ulint n_fields;
+ ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
+ ulint i;
+
+ if (search_latch_locked) {
+ has_search_latch = RW_S_LATCH;
+ }
+
+ index = plan->index;
+
+ /* Calculate the value of the search tuple: the exact match columns
+ get their expressions evaluated when we evaluate the right sides of
+ end_conds */
+
+ cond = UT_LIST_GET_FIRST(plan->end_conds);
+
+ while (cond) {
+ eval_exp(que_node_get_next(cond->args));
+
+ cond = UT_LIST_GET_NEXT(cond_list, cond);
+ }
+
+ if (plan->tuple) {
+ n_fields = dtuple_get_n_fields(plan->tuple);
+
+ if (plan->n_exact_match < n_fields) {
+ /* There is a non-exact match field which must be
+ evaluated separately */
+
+ eval_exp(plan->tuple_exps[n_fields - 1]);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ exp = plan->tuple_exps[i];
+
+ dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
+ que_node_get_val(exp));
+ }
+
+ /* Open pcur to the index */
+
+ btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
+ BTR_SEARCH_LEAF, &plan->pcur,
+ has_search_latch, mtr);
+ } else {
+ /* Open the cursor to the start or the end of the index
+ (FALSE: no init) */
+
+ btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
+ &(plan->pcur), FALSE, mtr);
+ }
+
+ ut_ad(plan->n_rows_prefetched == 0);
+ ut_ad(plan->n_rows_fetched == 0);
+ ut_ad(plan->cursor_at_end == FALSE);
+
+ plan->pcur_is_open = TRUE;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index.
+@return TRUE if the cursor should be moved to the next record after we
+return from this function (moved to the previous, in the case of a
+descending cursor) without processing again the current cursor
+record */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+ plan_t* plan, /*!< in: table plan */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ibool equal_position;
+ ulint relative_position;
+
+ ut_ad(!plan->cursor_at_end);
+
+ relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
+
+ equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
+ &(plan->pcur), mtr);
+
+ /* If the cursor is traveling upwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
+ yet on the successor of the page infimum;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ not yet processed the cursor record: no need to move the cursor to the
+ next record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we must move to the next record;
+ (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the next
+ record, else there is no need to move the cursor. */
+
+ if (plan->asc) {
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(FALSE);
+ }
+
+ /* If the cursor is traveling downwards, and relative_position is
+
+ (1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+ the last record LESS than the successor of a page infimum; we have not
+ processed the cursor record: no need to move the cursor;
+ (2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
+ first record GREATER than the predecessor of a page supremum; we have
+ processed the cursor record: we should move the cursor to the previous
+ record;
+ (3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
+ last record LESS or EQUAL to the old stored user record; (a) if
+ equal_position is FALSE, this means that the cursor is now on a record
+ less than the old user record, and we need not move to the previous
+ record; (b) if equal_position is TRUE, then if
+ plan->stored_cursor_rec_processed is TRUE, we must move to the previous
+ record, else there is no need to move the cursor. */
+
+ if (relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+ return(FALSE);
+ }
+
+ if (relative_position == BTR_PCUR_ON) {
+
+ if (equal_position) {
+
+ return(plan->stored_cursor_rec_processed);
+ }
+
+ return(FALSE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Resets a plan cursor to a closed state. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+ plan_t* plan) /*!< in: plan */
+{
+ plan->pcur_is_open = FALSE;
+ plan->cursor_at_end = FALSE;
+ plan->n_rows_fetched = 0;
+ plan->n_rows_prefetched = 0;
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always).
+@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+ sel_node_t* node, /*!< in: select node for a consistent read */
+ plan_t* plan, /*!< in: plan for a unique search in clustered
+ index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_index_t* index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ulint ret;
+ rec_offs_init(offsets_);
+
+ index = plan->index;
+
+ ut_ad(node->read_view);
+ ut_ad(plan->unique_search);
+ ut_ad(!plan->must_get_clust);
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ row_sel_open_pcur(plan, TRUE, mtr);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (dict_index_is_clust(index)) {
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
+
+ ret = SEL_RETRY;
+ goto func_exit;
+ }
+
+ /* Test the deleted flag. */
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ /* Fetch the columns needed in test conditions. The index
+ record is protected by a page latch that was acquired when
+ plan->pcur was positioned. The latch will not be released
+ until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ ret = SEL_EXHAUSTED;
+ goto func_exit;
+ }
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ plan->n_rows_fetched++;
+ ret = SEL_FOUND;
+func_exit:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/*********************************************************************//**
+Performs a select step.
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel(
+/*====*/
+ sel_node_t* node, /*!< in: select node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ plan_t* plan;
+ mtr_t mtr;
+ ibool moved;
+ rec_t* rec;
+ rec_t* old_vers;
+ rec_t* clust_rec;
+ ibool search_latch_locked;
+ ibool consistent_read;
+
+ /* The following flag becomes TRUE when we are doing a
+ consistent read from a non-clustered index and we must look
+ at the clustered index to find out the previous delete mark
+ state of the non-clustered record: */
+
+ ibool cons_read_requires_clust_rec = FALSE;
+ ulint cost_counter = 0;
+ ibool cursor_just_opened;
+ ibool must_go_to_next;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ /* TRUE if the search was made using
+ a non-clustered index, and we had to
+ access the clustered record: now &mtr
+ contains a clustered index latch, and
+ &mtr must be committed before we move
+ to the next non-clustered record */
+ ulint found_flag;
+ ulint err;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_ad(thr->run_node == node);
+
+ search_latch_locked = FALSE;
+
+ if (node->read_view) {
+		/* In consistent reads, we try to make do with the hash index
+		and to avoid the buffer page get. This is to reduce memory bus
+ load resulting from semaphore operations. The search latch
+ will be s-locked when we access an index with a unique search
+ condition, but not locked when we access an index with a
+ less selective search condition. */
+
+ consistent_read = TRUE;
+ } else {
+ consistent_read = FALSE;
+ }
+
+table_loop:
+ /* TABLE LOOP
+ ----------
+ This is the outer major loop in calculating a join. We come here when
+ node->fetch_table changes, and after adding a row to aggregate totals
+ and, of course, when this function is called. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ plan = sel_node_get_nth_plan(node, node->fetch_table);
+ index = plan->index;
+
+ if (plan->n_rows_prefetched > 0) {
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+ if (plan->cursor_at_end) {
+ /* The cursor has already reached the result set end: no more
+ rows to process for this table cursor, as also the prefetch
+ stack was empty */
+
+ ut_ad(plan->pcur_is_open);
+
+ goto table_exhausted_no_mtr;
+ }
+
+ /* Open a cursor to index, or restore an open cursor position */
+
+ mtr_start(&mtr);
+
+ if (consistent_read && plan->unique_search && !plan->pcur_is_open
+ && !plan->must_get_clust
+ && !plan->table->big_rows) {
+ if (!search_latch_locked) {
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+ } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+ kept for some 10 searches before being released,
+ a waiting x-latch request would block other threads
+ from acquiring an s-latch for a long time, lowering
+ performance significantly in multiprocessors. */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+ found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+ if (found_flag == SEL_FOUND) {
+
+ goto next_table;
+
+ } else if (found_flag == SEL_EXHAUSTED) {
+
+ goto table_exhausted;
+ }
+
+ ut_ad(found_flag == SEL_RETRY);
+
+ plan_reset_cursor(plan);
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ search_latch_locked = FALSE;
+ }
+
+ if (!plan->pcur_is_open) {
+ /* Evaluate the expressions to build the search tuple and
+ open the cursor */
+
+ row_sel_open_pcur(plan, search_latch_locked, &mtr);
+
+ cursor_just_opened = TRUE;
+
+ /* A new search was made: increment the cost counter */
+ cost_counter++;
+ } else {
+ /* Restore pcur position to the index */
+
+ must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
+
+ cursor_just_opened = FALSE;
+
+ if (must_go_to_next) {
+ /* We have already processed the cursor record: move
+ to the next */
+
+ goto next_rec;
+ }
+ }
+
+rec_loop:
+ /* RECORD LOOP
+ -----------
+ In this loop we use pcur and try to fetch a qualifying row, and
+ also fill the prefetch buffer for this table if n_rows_fetched has
+ exceeded a threshold. While we are inside this loop, the following
+ holds:
+ (1) &mtr is started,
+ (2) pcur is positioned and open.
+
+ NOTE that if cursor_just_opened is TRUE here, it means that we came
+ to this point right after row_sel_open_pcur. */
+
+ ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+ rec = btr_pcur_get_rec(&(plan->pcur));
+
+ /* PHASE 1: Set a lock if specified */
+
+ if (!node->asc && cursor_just_opened
+ && !page_rec_is_supremum(rec)) {
+
+ /* When we open a cursor for a descending search, we must set
+ a next-key lock on the successor record: otherwise it would
+ be possible to insert new records next to the cursor position,
+ and it might be that these new records should appear in the
+ search result set, resulting in the phantom problem. */
+
+ if (!consistent_read) {
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation
+ level, we lock only the record, i.e., next-key
+ locking is not used. */
+
+ rec_t* next_rec = page_rec_get_next(rec);
+ ulint lock_type;
+ trx_t* trx;
+
+ trx = thr_get_trx(thr);
+
+ offsets = rec_get_offsets(next_rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED) {
+
+ if (page_rec_is_supremum(next_rec)) {
+
+ goto skip_lock;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+ next_rec, index, offsets,
+ node->row_lock_mode,
+ lock_type, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ case DB_SUCCESS:
+ break;
+ default:
+ /* Note that in this case we will store in pcur
+ the PREDECESSOR of the record we are waiting
+ the lock for */
+ goto lock_wait_or_error;
+ }
+ }
+ }
+
+skip_lock:
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. We also increment the cost counter as we may have
+ processed yet another page of index. */
+
+ cost_counter++;
+
+ goto next_rec;
+ }
+
+ if (!consistent_read) {
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using READ COMMITTED isolation level,
+ we lock only the record, i.e., next-key locking is
+ not used. */
+
+ ulint lock_type;
+ trx_t* trx;
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ trx = thr_get_trx(thr);
+
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+
+ if (page_rec_is_supremum(rec)) {
+
+ goto next_rec;
+ }
+
+ lock_type = LOCK_REC_NOT_GAP;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+ rec, index, offsets,
+ node->row_lock_mode, lock_type, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ /* A page supremum record cannot be in the result set: skip
+ it now when we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ ut_ad(page_rec_is_user_rec(rec));
+
+ if (cost_counter > SEL_COST_LIMIT) {
+
+ /* Now that we have placed the necessary locks, we can stop
+ for a while and store the cursor position; NOTE that if we
+ would store the cursor position BEFORE placing a record lock,
+ it might happen that the cursor would jump over some records
+ that another transaction could meanwhile insert adjacent to
+ the cursor: this would result in the phantom problem. */
+
+ goto stop_for_a_while;
+ }
+
+ /* PHASE 2: Check a mixed index mix id if needed */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ ut_ad(plan->mode == PAGE_CUR_GE);
+
+ /* As the cursor is now placed on a user record after a search
+ with the mode PAGE_CUR_GE, the up_match field in the cursor
+ tells how many fields in the user record matched to the search
+ tuple */
+
+ if (btr_pcur_get_up_match(&(plan->pcur))
+ < plan->n_exact_match) {
+ goto table_exhausted;
+ }
+
+ /* Ok, no need to test end_conds or mix id */
+
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ /* PHASE 3: Get previous version in a consistent read */
+
+ cons_read_requires_clust_rec = FALSE;
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (consistent_read) {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (dict_index_is_clust(index)) {
+
+ if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+ node->read_view)) {
+
+ err = row_sel_build_prev_vers(
+ node->read_view, index, rec,
+ &offsets, &heap, &plan->old_vers_heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The record does not exist
+ in our read view. Skip it, but
+ first attempt to determine
+ whether the index segment we
+ are searching through has been
+ exhausted. */
+
+ offsets = rec_get_offsets(
+ rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ /* Fetch the columns needed in
+ test conditions. The clustered
+ index record is protected by a
+ page latch that was acquired
+ by row_sel_open_pcur() or
+ row_sel_restore_pcur_pos().
+ The latch will not be released
+ until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(
+ index, rec, offsets,
+ UT_LIST_GET_FIRST(
+ plan->columns));
+
+ if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else if (!lock_sec_rec_cons_read_sees(rec,
+ node->read_view)) {
+ cons_read_requires_clust_rec = TRUE;
+ }
+ }
+
+ /* PHASE 4: Test search end conditions and deleted flag */
+
+ /* Fetch the columns needed in test conditions. The record is
+ protected by a page latch that was acquired by
+ row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
+ will not be released until mtr_commit(mtr). */
+
+ row_sel_fetch_columns(index, rec, offsets,
+ UT_LIST_GET_FIRST(plan->columns));
+
+ /* Test the selection end conditions: these can only contain columns
+ which already are found in the index, even though the index might be
+ non-clustered */
+
+ if (plan->unique_search && cursor_just_opened) {
+
+ /* No test necessary: the test was already made above */
+
+ } else if (!row_sel_test_end_conds(plan)) {
+
+ goto table_exhausted;
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
+ && !cons_read_requires_clust_rec) {
+
+ /* The record is delete marked: we can skip it if this is
+ not a consistent read which might see an earlier version
+ of a non-clustered index record */
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 5: Get the clustered index record, if needed and if we did
+ not do the search using the clustered index */
+
+ if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+ &mtr);
+ mtr_has_extra_clust_latch = TRUE;
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* Retrieving the clustered record required a search:
+ increment the cost counter */
+
+ cost_counter++;
+
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(consistent_read);
+
+ goto next_rec;
+ }
+
+ if (rec_get_deleted_flag(clust_rec,
+ dict_table_is_comp(plan->table))) {
+
+ /* The record is delete marked: we can skip it */
+
+ goto next_rec;
+ }
+
+ if (node->can_get_updated) {
+
+ btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+ }
+ }
+
+ /* PHASE 6: Test the rest of search conditions */
+
+ if (!row_sel_test_other_conds(plan)) {
+
+ if (plan->unique_search) {
+
+ goto table_exhausted;
+ }
+
+ goto next_rec;
+ }
+
+ /* PHASE 7: We found a new qualifying row for the current table; push
+ the row if prefetch is on, or move to the next table in the join */
+
+ plan->n_rows_fetched++;
+
+ ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+ if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+ || plan->unique_search || plan->no_prefetch
+ || plan->table->big_rows) {
+
+ /* No prefetch in operation: go to the next table */
+
+ goto next_table;
+ }
+
+ sel_push_prefetched_row(plan);
+
+ if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+ /* The prefetch buffer is now full */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table;
+ }
+
+next_rec:
+ ut_ad(!search_latch_locked);
+
+ if (mtr_has_extra_clust_latch) {
+
+ /* We must commit &mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we would access a different clustered
+ index page right away without releasing the previous. */
+
+ goto commit_mtr_for_a_while;
+ }
+
+ if (node->asc) {
+ moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+ } else {
+ moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+ }
+
+ if (!moved) {
+
+ goto table_exhausted;
+ }
+
+ cursor_just_opened = FALSE;
+
+ /* END OF RECORD LOOP
+ ------------------ */
+ goto rec_loop;
+
+next_table:
+ /* We found a record which satisfies the conditions: we can move to
+ the next table or return a row in the result set */
+
+ ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
+
+ if (plan->unique_search && !node->can_get_updated) {
+
+ plan->cursor_at_end = TRUE;
+ } else {
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+ /* If we use 'goto' to this label, it means that the row was popped
+ from the prefetched rows stack, and &mtr is already committed */
+
+ if (node->fetch_table + 1 == node->n_tables) {
+
+ sel_eval_select_list(node);
+
+ if (node->is_aggregate) {
+
+ goto table_loop;
+ }
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ node->fetch_table++;
+
+ /* When we move to the next table, we first reset the plan cursor:
+ we do not care about resetting it when we backtrack from a table */
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+ goto table_loop;
+
+table_exhausted:
+ /* The table cursor pcur reached the result set end: backtrack to the
+ previous table in the join if we do not have cached prefetched rows */
+
+ plan->cursor_at_end = TRUE;
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+ if (plan->n_rows_prefetched > 0) {
+ /* The table became exhausted during a prefetch */
+
+ sel_pop_prefetched_row(plan);
+
+ goto next_table_no_mtr;
+ }
+
+table_exhausted_no_mtr:
+ if (node->fetch_table == 0) {
+ err = DB_SUCCESS;
+
+ if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+ node->aggregate_already_fetched = TRUE;
+
+ sel_assign_into_var_values(node->into_list, node);
+
+ thr->run_node = que_node_get_parent(node);
+ } else {
+ node->state = SEL_NODE_NO_MORE_ROWS;
+
+ thr->run_node = que_node_get_parent(node);
+ }
+
+ goto func_exit;
+ }
+
+ node->fetch_table--;
+
+ goto table_loop;
+
+stop_for_a_while:
+ /* Return control for a while to que_run_threads, so that runaway
+ queries can be canceled. NOTE that when we come here, we must, in a
+ locking read, have placed the necessary (possibly waiting request)
+ record lock on the cursor record or its successor: when we reposition
+ the cursor, this record lock guarantees that nobody can meanwhile have
+ inserted new records which should have appeared in the result set,
+ which would result in the phantom problem. */
+
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+ err = DB_SUCCESS;
+ goto func_exit;
+
+commit_mtr_for_a_while:
+ /* Stores the cursor position and commits &mtr; this is used if
+ &mtr may contain latches which would break the latching order if
+ &mtr would not be committed and the latches released. */
+
+ plan->stored_cursor_rec_processed = TRUE;
+
+ ut_ad(!search_latch_locked);
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_has_extra_clust_latch = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+ goto table_loop;
+
+lock_wait_or_error:
+ /* See the note at stop_for_a_while: the same holds for this case */
+
+ ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+ ut_ad(!search_latch_locked);
+
+ plan->stored_cursor_rec_processed = FALSE;
+ btr_pcur_store_position(&(plan->pcur), &mtr);
+
+ mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+func_exit:
+ if (search_latch_locked) {
+ rw_lock_s_unlock(&btr_search_latch);
+ }
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+}
+
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint i_lock_mode;
+ sym_node_t* table_node;
+ sel_node_t* node;
+ ulint err;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is a new execution of this node (or when execution
+	resumes after a wait for a table intention lock), set intention locks
+ on the tables, or assign a read view */
+
+ if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+
+ node->state = SEL_NODE_OPEN;
+ }
+
+ if (node->state == SEL_NODE_OPEN) {
+
+ /* It may be that the current session has not yet started
+ its transaction, or it has been committed: */
+
+ trx_start_if_not_started(thr_get_trx(thr));
+
+ plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+ if (node->consistent_read) {
+ /* Assign a read view for the query */
+ node->read_view = trx_assign_read_view(
+ thr_get_trx(thr));
+ } else {
+ if (node->set_x_locks) {
+ i_lock_mode = LOCK_IX;
+ } else {
+ i_lock_mode = LOCK_IS;
+ }
+
+ table_node = node->table_list;
+
+ while (table_node) {
+ err = lock_table(0, table_node->table,
+ i_lock_mode, thr);
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ table_node = que_node_get_next(table_node);
+ }
+ }
+
+ /* If this is an explicit cursor, copy stored procedure
+ variable values, so that the values cannot change between
+ fetches (currently, we copy them also for non-explicit
+ cursors) */
+
+ if (node->explicit_cursor
+ && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+ row_sel_copy_input_variable_vals(node);
+ }
+
+ node->state = SEL_NODE_FETCH;
+ node->fetch_table = 0;
+
+ if (node->is_aggregate) {
+ /* Reset the aggregate total values */
+ sel_reset_aggregate_vals(node);
+ }
+ }
+
+ err = row_sel(node, thr);
+
+ /* NOTE! if queries are parallelized, the following assignment may
+ have problems; the assignment should be made only if thr is the
+ only top-level thr in the graph: */
+
+ thr->graph->last_sel_node = node;
+
+ if (err != DB_SUCCESS) {
+ thr_get_trx(thr)->error_state = err;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ sel_node_t* sel_node;
+ fetch_node_t* node;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+ sel_node = node->cursor_def;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+ if (thr->prev_node != que_node_get_parent(node)) {
+
+ if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+ if (node->into_list) {
+ sel_assign_into_var_values(node->into_list,
+ sel_node);
+ } else {
+ void* ret = (*node->func->func)(
+ sel_node, node->func->arg);
+
+ if (!ret) {
+ sel_node->state
+ = SEL_NODE_NO_MORE_ROWS;
+ }
+ }
+ }
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ /* Make the fetch node the parent of the cursor definition for
+ the time of the fetch, so that execution knows to return to this
+ fetch node after a row has been selected or we know that there is
+ no row left */
+
+ sel_node->common.parent = node;
+
+ if (sel_node->state == SEL_NODE_CLOSED) {
+ fprintf(stderr,
+ "InnoDB: Error: fetch called on a closed cursor\n");
+
+ thr_get_trx(thr)->error_state = DB_ERROR;
+
+ return(NULL);
+ }
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+ void* row, /*!< in: sel_node_t* */
+ void* user_arg) /*!< in: not used */
+{
+ sel_node_t* node = row;
+ que_node_t* exp;
+ ulint i = 0;
+
+ UT_NOT_USED(user_arg);
+
+ fprintf(stderr, "row_fetch_print: row %p\n", row);
+
+ exp = node->select_list;
+
+ while (exp) {
+ dfield_t* dfield = que_node_get_val(exp);
+ const dtype_t* type = dfield_get_type(dfield);
+
+ fprintf(stderr, " column %lu:\n", (ulong)i);
+
+ dtype_print(type);
+ putc('\n', stderr);
+
+ if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
+ ut_print_buf(stderr, dfield_get_data(dfield),
+ dfield_get_len(dfield));
+ putc('\n', stderr);
+ } else {
+ fputs(" <NULL>;\n", stderr);
+ }
+
+ exp = que_node_get_next(exp);
+ i++;
+ }
+
+ return((void*)42);
+}
+
+/***********************************************************//**
+Prints a row in a select result.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ row_printf_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* arg;
+
+ ut_ad(thr);
+
+ node = thr->run_node;
+
+ sel_node = node->sel_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+
+ if (sel_node->state != SEL_NODE_FETCH) {
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to print */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+ }
+
+ arg = sel_node->select_list;
+
+ while (arg) {
+ dfield_print_also_hex(que_node_get_val(arg));
+
+ fputs(" ::: ", stderr);
+
+ arg = que_node_get_next(arg);
+ }
+
+ putc('\n', stderr);
+
+ /* Fetch next row to print */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+}
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+ dtuple_t* tuple, /*!< in/out: tuple where to build;
+ NOTE: we assume that the type info
+ in the tuple is already according
+ to index! */
+ byte* buf, /*!< in: buffer to use in field
+ conversions */
+ ulint buf_len, /*!< in: buffer length */
+ dict_index_t* index, /*!< in: index of the key value */
+ const byte* key_ptr, /*!< in: MySQL key value */
+ ulint key_len, /*!< in: MySQL key value length */
+ trx_t* trx) /*!< in: transaction */
+{
+ byte* original_buf = buf;
+ const byte* original_key_ptr = key_ptr;
+ dict_field_t* field;
+ dfield_t* dfield;
+ ulint data_offset;
+ ulint data_len;
+ ulint data_field_len;
+ ibool is_null;
+ const byte* key_end;
+ ulint n_fields = 0;
+
+ /* For documentation of the key value storage format in MySQL, see
+ ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
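+
+	/* In short, each field in key_ptr consists of: an optional 1-byte
+	SQL NULL marker (nullable columns only), then a 2-byte little-endian
+	length for BLOB/TEXT column prefixes and true VARCHARs, and finally
+	the column value padded to its fixed key field length. */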
+
+ key_end = key_ptr + key_len;
+
+ /* Permit us to access any field in the tuple (ULINT_MAX): */
+
+ dtuple_set_n_fields(tuple, ULINT_MAX);
+
+ dfield = dtuple_get_nth_field(tuple, 0);
+ field = dict_index_get_nth_field(index, 0);
+
+ if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
+ /* A special case: we are looking for a position in the
+ generated clustered index which InnoDB automatically added
+ to a table with no primary key: the first and the only
+ ordering column is ROW_ID which InnoDB stored to the key_ptr
+ buffer. */
+
+ ut_a(key_len == DATA_ROW_ID_LEN);
+
+ dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+ dtuple_set_n_fields(tuple, 1);
+
+ return;
+ }
+
+ while (key_ptr < key_end) {
+
+ ulint type = dfield_get_type(dfield)->mtype;
+ ut_a(field->col->mtype == type);
+
+ data_offset = 0;
+ is_null = FALSE;
+
+ if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+ /* The first byte in the field tells if this is
+ an SQL NULL value */
+
+ data_offset = 1;
+
+ if (*key_ptr != 0) {
+ dfield_set_null(dfield);
+
+ is_null = TRUE;
+ }
+ }
+
+ /* Calculate data length and data field total length */
+
+ if (type == DATA_BLOB) {
+ /* The key field is a column prefix of a BLOB or
+ TEXT */
+
+ ut_a(field->prefix_len > 0);
+
+ /* MySQL stores the actual data length to the first 2
+ bytes after the optional SQL NULL marker byte. The
+ storage format is little-endian, that is, the most
+			significant byte is at a higher address. In UTF-8, MySQL
+ seems to reserve field->prefix_len bytes for
+ storing this field in the key value buffer, even
+ though the actual value only takes data_len bytes
+ from the start. */
+
+ data_len = key_ptr[data_offset]
+ + 256 * key_ptr[data_offset + 1];
+ data_field_len = data_offset + 2 + field->prefix_len;
+
+ data_offset += 2;
+
+ /* Now that we know the length, we store the column
+ value like it would be a fixed char field */
+
+ } else if (field->prefix_len > 0) {
+ /* Looks like MySQL pads unused end bytes in the
+ prefix with space. Therefore, also in UTF-8, it is ok
+ to compare with a prefix containing full prefix_len
+ bytes, and no need to take at most prefix_len / 3
+ UTF-8 characters from the start.
+ If the prefix is used as the upper end of a LIKE
+ 'abc%' query, then MySQL pads the end with chars
+			0xff. TODO: in that case, does it do any harm to
+			compare with the full prefix_len bytes? How do characters
+ 0xff in UTF-8 behave? */
+
+ data_len = field->prefix_len;
+ data_field_len = data_offset + data_len;
+ } else {
+ data_len = dfield_get_type(dfield)->len;
+ data_field_len = data_offset + data_len;
+ }
+
+ if (UNIV_UNLIKELY
+ (dtype_get_mysql_type(dfield_get_type(dfield))
+ == DATA_MYSQL_TRUE_VARCHAR)
+ && UNIV_LIKELY(type != DATA_INT)) {
+ /* In a MySQL key value format, a true VARCHAR is
+ always preceded by 2 bytes of a length field.
+ dfield_get_type(dfield)->len returns the maximum
+ 'payload' len in bytes. That does not include the
+ 2 bytes that tell the actual data length.
+
+ We added the check != DATA_INT to make sure we do
+ not treat MySQL ENUM or SET as a true VARCHAR! */
+
+ data_len += 2;
+ data_field_len += 2;
+ }
+
+ /* Storing may use at most data_len bytes of buf */
+
+ if (UNIV_LIKELY(!is_null)) {
+ row_mysql_store_col_in_innobase_format(
+ dfield, buf,
+ FALSE, /* MySQL key value format col */
+ key_ptr + data_offset, data_len,
+ dict_table_is_comp(index->table));
+ buf += data_len;
+ }
+
+ key_ptr += data_field_len;
+
+ if (UNIV_UNLIKELY(key_ptr > key_end)) {
+ /* The last field in key was not a complete key field
+ but a prefix of it.
+
+ Print a warning about this! HA_READ_PREFIX_LAST does
+ not currently work in InnoDB with partial-field key
+ value prefixes. Since MySQL currently uses a padding
+ trick to calculate LIKE 'abc%' type queries there
+ should never be partial-field prefixes in searches. */
+
+ ut_print_timestamp(stderr);
+
+ fputs(" InnoDB: Warning: using a partial-field"
+ " key prefix in search.\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, ". Last data field length %lu bytes,\n"
+ "InnoDB: key ptr now exceeds"
+ " key end by %lu bytes.\n"
+ "InnoDB: Key value in the MySQL format:\n",
+ (ulong) data_field_len,
+ (ulong) (key_ptr - key_end));
+ fflush(stderr);
+ ut_print_buf(stderr, original_key_ptr, key_len);
+ putc('\n', stderr);
+
+ if (!is_null) {
+ ulint len = dfield_get_len(dfield);
+ dfield_set_len(dfield, len
+ - (ulint) (key_ptr - key_end));
+ }
+ }
+
+ n_fields++;
+ field++;
+ dfield++;
+ }
+
+ ut_a(buf <= original_buf + buf_len);
+
+ /* We set the length of tuple to n_fields: we assume that the memory
+ area allocated for it is big enough (usually bigger than n_fields). */
+
+ dtuple_set_n_fields(tuple, n_fields);
+}
+
+/**************************************************************//**
+Stores the row id to the prebuilt struct. */
+static
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
+ const rec_t* index_rec, /*!< in: record */
+ const dict_index_t* index, /*!< in: index of the record */
+ const ulint* offsets) /*!< in: rec_get_offsets
+ (index_rec, index) */
+{
+ const byte* data;
+ ulint len;
+
+ ut_ad(rec_offs_validate(index_rec, index, offsets));
+
+ data = rec_get_nth_field(
+ index_rec, offsets,
+ dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+
+ if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
+ fprintf(stderr,
+ "InnoDB: Error: Row id field is"
+ " wrong length %lu in ", (ulong) len);
+ dict_index_name_print(stderr, prebuilt->trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: Field number %lu, record:\n",
+ (ulong) dict_index_get_sys_col_pos(index,
+ DATA_ROW_ID));
+ rec_print_new(stderr, index_rec, offsets);
+ putc('\n', stderr);
+ ut_error;
+ }
+
+ ut_memcpy(prebuilt->row_id, data, len);
+}
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
+static
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+ byte* dest, /*!< in/out: buffer where to store; NOTE
+ that BLOBs are not in themselves
+ stored here: the caller must allocate
+ and copy the BLOB into buffer before,
+ and pass the pointer to the BLOB in
+ 'data' */
+ const mysql_row_templ_t* templ,
+ /*!< in: MySQL column template.
+ Its following fields are referenced:
+ type, is_unsigned, mysql_col_len,
+ mbminlen, mbmaxlen */
+ const byte* data, /*!< in: data to store */
+ ulint len) /*!< in: length of the data */
+{
+ byte* ptr;
+ byte* field_end;
+ byte* pad_ptr;
+
+ ut_ad(len != UNIV_SQL_NULL);
+ UNIV_MEM_ASSERT_RW(data, len);
+
+ switch (templ->type) {
+ case DATA_INT:
+ /* Convert integer data from Innobase to a little-endian
+ format, sign bit restored to normal */
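+		/* For example, if a signed 4-byte column stores the value 1
+		as 80 00 00 01 in the InnoDB format (big-endian, sign bit
+		flipped), the loop below reverses the bytes to 01 00 00 80
+		and the XOR with 128 clears the flipped sign bit, giving the
+		little-endian value 01 00 00 00. */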
+
+ ptr = dest + len;
+
+ for (;;) {
+ ptr--;
+ *ptr = *data;
+ if (ptr == dest) {
+ break;
+ }
+ data++;
+ }
+
+ if (!templ->is_unsigned) {
+ dest[len - 1] = (byte) (dest[len - 1] ^ 128);
+ }
+
+ ut_ad(templ->mysql_col_len == len);
+ break;
+
+ case DATA_VARCHAR:
+ case DATA_VARMYSQL:
+ case DATA_BINARY:
+ field_end = dest + templ->mysql_col_len;
+
+ if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+ /* This is a >= 5.0.3 type true VARCHAR. Store the
+ length of the data to the first byte or the first
+ two bytes of dest. */
+
+ dest = row_mysql_store_true_var_len(
+ dest, len, templ->mysql_length_bytes);
+ }
+
+ /* Copy the actual data */
+ ut_memcpy(dest, data, len);
+
+ /* Pad with trailing spaces. We pad with spaces also the
+ unused end of a >= 5.0.3 true VARCHAR column, just in case
+ MySQL expects its contents to be deterministic. */
+
+ pad_ptr = dest + len;
+
+ ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+ /* We handle UCS2 charset strings differently. */
+ if (templ->mbminlen == 2) {
+ /* A space char is two bytes, 0x0020 in UCS2 */
+
+ if (len & 1) {
+ /* A 0x20 has been stripped from the column.
+ Pad it back. */
+
+ if (pad_ptr < field_end) {
+ *pad_ptr = 0x20;
+ pad_ptr++;
+ }
+ }
+
+ /* Pad the rest of the string with 0x0020 */
+
+ while (pad_ptr < field_end) {
+ *pad_ptr = 0x00;
+ pad_ptr++;
+ *pad_ptr = 0x20;
+ pad_ptr++;
+ }
+ } else {
+ ut_ad(templ->mbminlen == 1);
+ /* space=0x20 */
+
+ memset(pad_ptr, 0x20, field_end - pad_ptr);
+ }
+ break;
+
+ case DATA_BLOB:
+ /* Store a pointer to the BLOB buffer to dest: the BLOB was
+ already copied to the buffer in row_sel_store_mysql_rec */
+
+ row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+ len);
+ break;
+
+ case DATA_MYSQL:
+ memcpy(dest, data, len);
+
+ ut_ad(templ->mysql_col_len >= len);
+ ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+ ut_ad(templ->mbmaxlen > templ->mbminlen
+ || templ->mysql_col_len == len);
+ /* The following assertion would fail for old tables
+ containing UTF-8 ENUM columns due to Bug #9526. */
+ ut_ad(!templ->mbmaxlen
+ || !(templ->mysql_col_len % templ->mbmaxlen));
+ ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
+
+ if (templ->mbminlen != templ->mbmaxlen) {
+ /* Pad with spaces. This undoes the stripping
+ done in row0mysql.ic, function
+ row_mysql_store_col_in_innobase_format(). */
+
+ memset(dest + len, 0x20, templ->mysql_col_len - len);
+ }
+ break;
+
+ default:
+#ifdef UNIV_DEBUG
+ case DATA_SYS_CHILD:
+ case DATA_SYS:
+ /* These column types should never be shipped to MySQL. */
+ ut_ad(0);
+
+ case DATA_CHAR:
+ case DATA_FIXBINARY:
+ case DATA_FLOAT:
+ case DATA_DOUBLE:
+ case DATA_DECIMAL:
+ /* Above are the valid column types for MySQL data. */
+#endif /* UNIV_DEBUG */
+ ut_ad(templ->mysql_col_len == len);
+ memcpy(dest, data, len);
+ }
+}
+
+/**************************************************************//**
+Convert a row in the Innobase format to a row in the MySQL format.
+Note that the template in prebuilt may advise us to copy only a few
+columns to mysql_rec; other columns are left blank. Not all columns may
+be needed in the query.
+@return TRUE on success, FALSE if not all columns could be retrieved */
+static __attribute__((warn_unused_result))
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+ byte* mysql_rec, /*!< out: row in the MySQL format */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: Innobase record in the index
+ which was described in prebuilt's
+ template; must be protected by
+ a page latch */
+ const ulint* offsets, /* in: array returned by
+ rec_get_offsets() */
+ ulint start_field_no, /* in: start from this field */
+ ulint end_field_no) /* in: end at this field */
+{
+ mysql_row_templ_t* templ;
+ mem_heap_t* extern_field_heap = NULL;
+ mem_heap_t* heap;
+ const byte* data;
+ ulint len;
+ ulint i;
+
+ ut_ad(prebuilt->mysql_template);
+ ut_ad(prebuilt->default_rec);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+ mem_heap_free(prebuilt->blob_heap);
+ prebuilt->blob_heap = NULL;
+ }
+
+// psergey@askmonty.org: don't take the following:
+#if 0
+ /* init null bytes with default values as they might be
+
+ left uninitialized in some cases and these uninited bytes
+ might be copied into mysql record buffer that leads to
+ valgrind warnings */
+ memcpy(mysql_rec, prebuilt->default_rec, prebuilt->null_bitmap_len);
+#endif
+
+ for (i = start_field_no; i < end_field_no /* prebuilt->n_template */ ; i++) {
+
+ templ = prebuilt->mysql_template + i;
+
+ if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+ templ->rec_field_no))) {
+
+ /* Copy an externally stored field to the temporary
+ heap */
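+
+			/* DATA_BLOB columns are copied into
+			prebuilt->blob_heap, which is kept until the next call
+			of this function; other externally stored columns use
+			extern_field_heap, which is freed right after the value
+			has been converted to the MySQL format below. */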
+
+ ut_a(!prebuilt->trx->has_search_latch);
+
+ if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ UNIV_PAGE_SIZE);
+ }
+
+ heap = prebuilt->blob_heap;
+ } else {
+ extern_field_heap
+ = mem_heap_create(UNIV_PAGE_SIZE);
+
+ heap = extern_field_heap;
+ }
+
+ /* NOTE: if we are retrieving a big BLOB, we may
+ already run out of memory in the next call, which
+ causes an assert */
+
+ data = btr_rec_copy_externally_stored_field(
+ rec, offsets,
+ dict_table_zip_size(prebuilt->table),
+ templ->rec_field_no, &len, heap);
+
+ if (UNIV_UNLIKELY(!data)) {
+ /* The externally stored field
+ was not written yet. This
+ record should only be seen by
+ recv_recovery_rollback_active()
+ or any TRX_ISO_READ_UNCOMMITTED
+ transactions. */
+
+ if (extern_field_heap) {
+ mem_heap_free(extern_field_heap);
+ }
+
+ return(FALSE);
+ }
+
+ ut_a(len != UNIV_SQL_NULL);
+ } else {
+ /* Field is stored in the row. */
+
+ data = rec_get_nth_field(rec, offsets,
+ templ->rec_field_no, &len);
+
+ if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
+ && len != UNIV_SQL_NULL) {
+
+ /* It is a BLOB field locally stored in the
+ InnoDB record: we MUST copy its contents to
+ prebuilt->blob_heap here because later code
+ assumes all BLOB values have been copied to a
+ safe place. */
+
+ if (prebuilt->blob_heap == NULL) {
+ prebuilt->blob_heap = mem_heap_create(
+ UNIV_PAGE_SIZE);
+ }
+
+ data = memcpy(mem_heap_alloc(
+ prebuilt->blob_heap, len),
+ data, len);
+ }
+ }
+
+ if (len != UNIV_SQL_NULL) {
+ row_sel_field_store_in_mysql_format(
+ mysql_rec + templ->mysql_col_offset,
+ templ, data, len);
+
+ /* Cleanup */
+ if (extern_field_heap) {
+ mem_heap_free(extern_field_heap);
+ extern_field_heap = NULL;
+ }
+
+ if (templ->mysql_null_bit_mask) {
+ /* It is a nullable column with a non-NULL
+ value */
+ mysql_rec[templ->mysql_null_byte_offset]
+ &= ~(byte) templ->mysql_null_bit_mask;
+ }
+ } else {
+ /* MySQL assumes that the field for an SQL
+ NULL value is set to the default value. */
+
+ UNIV_MEM_ASSERT_RW(prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ mysql_rec[templ->mysql_null_byte_offset]
+ |= (byte) templ->mysql_null_bit_mask;
+ memcpy(mysql_rec + templ->mysql_col_offset,
+ (const byte*) prebuilt->default_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+ read_view_t* read_view, /*!< in: read view */
+ dict_index_t* clust_index, /*!< in: clustered index */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record in a clustered index */
+ ulint** offsets, /*!< in/out: offsets returned by
+ rec_get_offsets(rec, clust_index) */
+ mem_heap_t** offset_heap, /*!< in/out: memory heap from which
+ the offsets are allocated */
+ rec_t** old_vers, /*!< out: old version, or NULL if the
+ record does not exist in the view:
+ i.e., it was freshly inserted
+ afterwards */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint err;
+
+ if (prebuilt->old_vers_heap) {
+ mem_heap_empty(prebuilt->old_vers_heap);
+ } else {
+ prebuilt->old_vers_heap = mem_heap_create(200);
+ }
+
+ err = row_vers_build_for_consistent_read(
+ rec, mtr, clust_index, offsets, read_view, offset_heap,
+ prebuilt->old_vers_heap, old_vers);
+ return(err);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface.
+@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+enum db_err
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
+ dict_index_t* sec_index,/*!< in: secondary index where rec resides */
+ const rec_t* rec, /*!< in: record in a non-clustered index; if
+ this is a locking read, then rec is not
+ allowed to be delete-marked, and that would
+ not make sense either */
+ que_thr_t* thr, /*!< in: query thread */
+ const rec_t** out_rec,/*!< out: clustered record or an old version of
+ it, NULL if the old version did not exist
+ in the read view, i.e., it was a fresh
+ inserted version */
+ ulint** offsets,/*!< in: offsets returned by
+ rec_get_offsets(rec, sec_index);
+ out: offsets returned by
+ rec_get_offsets(out_rec, clust_index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mtr_t* mtr) /*!< in: mtr used to get access to the
+ non-clustered record; the same mtr is used to
+ access the clustered index */
+{
+ dict_index_t* clust_index;
+ const rec_t* clust_rec;
+ rec_t* old_vers;
+ enum db_err err;
+ trx_t* trx;
+
+ *out_rec = NULL;
+ trx = thr_get_trx(thr);
+
+ row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
+ sec_index, *offsets, trx);
+
+ clust_index = dict_table_get_first_index(sec_index->table);
+
+ btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+ PAGE_CUR_LE, BTR_SEARCH_LEAF,
+ prebuilt->clust_pcur, 0, mtr);
+
+ clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+ prebuilt->clust_pcur->trx_if_known = trx;
+
+ /* Note: only if the search ends up on a non-infimum record is the
+ low_match value the real match to the search tuple */
+
+ if (!page_rec_is_user_rec(clust_rec)
+ || btr_pcur_get_low_match(prebuilt->clust_pcur)
+ < dict_index_get_n_unique(clust_index)) {
+
+ /* In a rare case it is possible that no clust rec is found
+ for a delete-marked secondary index record: if in row0umod.c
+ in row_undo_mod_remove_clust_low() we have already removed
+ the clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case we know that the
+ clustered index record did not exist in the read view of
+ trx. */
+
+ if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(sec_index->table))
+ || prebuilt->select_lock_type != LOCK_NONE) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: error clustered record"
+ " for sec rec not found\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, sec_index);
+ fputs("\n"
+ "InnoDB: sec index record ", stderr);
+ rec_print(stderr, rec, sec_index);
+ fputs("\n"
+ "InnoDB: clust index record ", stderr);
+ rec_print(stderr, clust_rec, clust_index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 600);
+
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ }
+
+ clust_rec = NULL;
+
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ *offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; we are searching
+ the clust rec with a unique condition, hence
+ we set a LOCK_REC_NOT_GAP type lock */
+
+ err = lock_clust_rec_read_check_and_lock(
+ 0, btr_pcur_get_block(prebuilt->clust_pcur),
+ clust_rec, clust_index, *offsets,
+ prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
+ switch (err) {
+ case DB_SUCCESS:
+ case DB_SUCCESS_LOCKED_REC:
+ break;
+ default:
+ goto err_exit;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ old_vers = NULL;
+
+ /* If the isolation level allows reading of uncommitted data,
+ then we never look for an earlier version */
+
+ if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && !lock_clust_rec_cons_read_sees(
+ clust_rec, clust_index, *offsets,
+ trx->read_view)) {
+
+ /* The following call returns 'offsets' associated with
+ 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index, prebuilt,
+ clust_rec, offsets, offset_heap, &old_vers,
+ mtr);
+
+ if (err != DB_SUCCESS || old_vers == NULL) {
+
+ goto err_exit;
+ }
+
+ clust_rec = old_vers;
+ }
+
+ /* If we had to go to an earlier version of row or the
+ secondary index record is delete marked, then it may be that
+ the secondary index record corresponding to clust_rec
+ (or old_vers) is not rec; in that case we must ignore
+ such row because in our snapshot rec would not have existed.
+ Remember that from rec we cannot see directly which transaction
+ id corresponds to it: we have to go to the clustered index
+ record. A query where we want to fetch all rows where
+ the secondary index value is in some interval would return
+ a wrong result if we would not drop rows which we come to
+ visit through secondary index records that would not really
+ exist in our snapshot. */
+
+ if (clust_rec
+ && (old_vers
+ || trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+ || rec_get_deleted_flag(rec, dict_table_is_comp(
+ sec_index->table)))
+ && !row_sel_sec_rec_is_for_clust_rec(
+ rec, sec_index, clust_rec, clust_index)) {
+ clust_rec = NULL;
+#ifdef UNIV_SEARCH_DEBUG
+ } else {
+ ut_a(clust_rec == NULL
+ || row_sel_sec_rec_is_for_clust_rec(
+ rec, sec_index, clust_rec, clust_index));
+#endif
+ }
+
+ err = DB_SUCCESS;
+ }
+
+func_exit:
+ *out_rec = clust_rec;
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* We may use the cursor in update or in unlock_row():
+ store its position */
+
+ btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+ }
+
+err_exit:
+ return(err);
+}
+
+/********************************************************************//**
+Restores cursor position after it has been stored. We have to take into
+account that the record the cursor was positioned on may have been deleted.
+Then we may have to move the cursor one step up or down.
+@return TRUE if we may need to process the record the cursor is now
+positioned on (i.e. we should not go to the next record yet) */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+ ibool* same_user_rec, /*!< out: TRUE if we were able to restore
+ the cursor on a user record with the
+					same ordering prefix in the
+ B-tree index */
+ ulint latch_mode, /*!< in: latch mode wished in
+ restoration */
+ btr_pcur_t* pcur, /*!< in: cursor whose position
+ has been stored */
+ ibool moves_up, /*!< in: TRUE if the cursor moves up
+ in the index */
+ mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
+ mtr temporarily! */
+{
+ ibool success;
+ ulint relative_position;
+
+ relative_position = pcur->rel_pos;
+
+ success = btr_pcur_restore_position(latch_mode, pcur, mtr);
+
+ *same_user_rec = success;
+
+ if (relative_position == BTR_PCUR_ON) {
+ if (success) {
+ return(FALSE);
+ }
+
+ if (moves_up) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ if (relative_position == BTR_PCUR_AFTER
+ || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
+
+ if (moves_up) {
+ return(TRUE);
+ }
+
+ if (btr_pcur_is_on_user_rec(pcur)) {
+ btr_pcur_move_to_prev(pcur, mtr);
+ }
+
+ return(TRUE);
+ }
+
+ ut_ad(relative_position == BTR_PCUR_BEFORE
+ || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
+
+ if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+ btr_pcur_move_to_next(pcur, mtr);
+ }
+
+ return(TRUE);
+}
+
+/********************************************************************//**
+Pops a cached row for MySQL from the fetch cache. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+ byte* buf, /*!< in/out: buffer where to copy the
+ row */
+ row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
+{
+ ulint i;
+ mysql_row_templ_t* templ;
+ byte* cached_rec;
+ ut_ad(prebuilt->n_fetch_cached > 0);
+ ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+
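+	/* Two copy strategies: with keep_other_fields_on_keyread the cached
+	row is merged into buf field by field, so that columns not covered
+	by the current key keep their old values in buf; otherwise the whole
+	MySQL row prefix is copied with a single ut_memcpy(). */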
+ if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
+ /* Copy cache record field by field, don't touch fields that
+ are not covered by current key */
+ cached_rec = prebuilt->fetch_cache[
+ prebuilt->fetch_cache_first];
+
+ for (i = 0; i < prebuilt->n_template; i++) {
+ templ = prebuilt->mysql_template + i;
+#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
+ UNIV_MEM_ASSERT_RW(cached_rec
+ + templ->mysql_col_offset,
+ templ->mysql_col_len);
+#endif
+ ut_memcpy(buf + templ->mysql_col_offset,
+ cached_rec + templ->mysql_col_offset,
+ templ->mysql_col_len);
+ /* Copy NULL bit of the current field from cached_rec
+ to buf */
+ if (templ->mysql_null_bit_mask) {
+ /*buf[templ->mysql_null_byte_offset]
+ ^= (buf[templ->mysql_null_byte_offset]
+ ^ cached_rec[templ->mysql_null_byte_offset])
+ & (byte)templ->mysql_null_bit_mask;*/
+ byte *null_byte= buf + templ->mysql_null_byte_offset;
+ (*null_byte)&= ~templ->mysql_null_bit_mask;
+ (*null_byte)|= cached_rec[templ->mysql_null_byte_offset] &
+ templ->mysql_null_bit_mask;
+ }
+ }
+	} else {
+#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
+ UNIV_MEM_ASSERT_RW(prebuilt->fetch_cache
+ [prebuilt->fetch_cache_first],
+ prebuilt->mysql_prefix_len);
+#endif
+ ut_memcpy(buf,
+ prebuilt->fetch_cache[prebuilt->fetch_cache_first],
+ prebuilt->mysql_prefix_len);
+ }
+ prebuilt->n_fetch_cached--;
+ prebuilt->fetch_cache_first++;
+
+ if (prebuilt->n_fetch_cached == 0) {
+ prebuilt->fetch_cache_first = 0;
+ }
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache.
+@return TRUE on success, FALSE if the record contains incomplete BLOBs */
+UNIV_INLINE __attribute__((warn_unused_result))
+ibool
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
+ const rec_t* rec, /*!< in: record to push; must
+ be protected by a page latch */
+	const ulint*	offsets,	/*!< in: rec_get_offsets() */
+	ulint		start_field_no, /*!< in: start from this field */
+	byte*		remainder_buf)	/*!< in: if start_field_no != 0,
+					where to take the preceding fields
+					from */
+{
+ byte* buf;
+ ulint i;
+
+ ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+ ut_a(!prebuilt->templ_contains_blob);
+
+ if (prebuilt->fetch_cache[0] == NULL) {
+ /* Allocate memory for the fetch cache */
+
+ for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+
+ /* A user has reported memory corruption in these
+ buffers in Linux. Put magic numbers there to help
+			track a possible bug. */
+
+ buf = mem_alloc(prebuilt->mysql_row_len + 8);
+
+ prebuilt->fetch_cache[i] = buf + 4;
+
+ mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
+ mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
+ ROW_PREBUILT_FETCH_MAGIC_N);
+ }
+ }
+
+ ut_ad(prebuilt->fetch_cache_first == 0);
+ UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
+ prebuilt->mysql_row_len);
+
+ if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
+ prebuilt->fetch_cache[
+ prebuilt->n_fetch_cached],
+ prebuilt,
+ rec,
+ offsets,
+ start_field_no,
+ prebuilt->n_template))) {
+ return(FALSE);
+ }
+
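+	/* If the caller already converted the first start_field_no fields
+	into remainder_buf (as happens when an index condition was checked
+	on a prefix of the row), copy those fields and their NULL bits from
+	remainder_buf instead of converting them from rec again. */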
+ if (start_field_no) {
+
+ for (i=0; i < start_field_no; i++) {
+ register ulint offs;
+ mysql_row_templ_t* templ;
+ register byte * null_byte;
+
+ templ = prebuilt->mysql_template + i;
+
+ if (templ->mysql_null_bit_mask) {
+ offs = templ->mysql_null_byte_offset;
+
+ null_byte= prebuilt->fetch_cache[
+ prebuilt->n_fetch_cached]+offs;
+ (*null_byte)&= ~templ->mysql_null_bit_mask;
+ (*null_byte)|= (*(remainder_buf + offs) &
+ templ->mysql_null_bit_mask);
+ }
+
+ offs = templ->mysql_col_offset;
+ memcpy(prebuilt->fetch_cache[prebuilt->n_fetch_cached]
+ + offs,
+ remainder_buf + offs,
+ templ->mysql_col_len);
+ }
+ }
+
+ prebuilt->n_fetch_cached++;
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the adaptive hash index if possible (not always). We assume that the
+search mode is PAGE_CUR_GE, that this is a consistent read, that there is a
+read view in trx, and that the btr search latch has been locked in S-mode.
+@return SEL_FOUND, SEL_EXHAUSTED, or SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+ const rec_t** out_rec,/*!< out: record if found */
+ row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
+ ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
+ mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
+ mtr_t* mtr) /*!< in: started mtr */
+{
+ dict_index_t* index = prebuilt->index;
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ const rec_t* rec;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(!prebuilt->templ_contains_blob);
+
+#ifndef UNIV_SEARCH_DEBUG
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
+ RW_S_LATCH,
+ mtr);
+#else /* UNIV_SEARCH_DEBUG */
+ btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
+ BTR_SEARCH_LEAF, pcur,
+ 0,
+ mtr);
+#endif /* UNIV_SEARCH_DEBUG */
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!page_rec_is_user_rec(rec)) {
+
+ return(SEL_RETRY);
+ }
+
+ /* As the cursor is now placed on a user record after a search with
+ the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+ fields in the user record matched to the search tuple */
+
+ if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ *offsets = rec_get_offsets(rec, index, *offsets,
+ ULINT_UNDEFINED, heap);
+
+ if (!lock_clust_rec_cons_read_sees(rec, index,
+ *offsets, trx->read_view)) {
+
+ return(SEL_RETRY);
+ }
+
+ if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
+
+ return(SEL_EXHAUSTED);
+ }
+
+ *out_rec = rec;
+
+ return(SEL_FOUND);
+}
+
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position and fetch next or fetch prev must not be tried on the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+ulint
+row_search_for_mysql(
+/*=================*/
+ byte* buf, /*!< in/out: buffer for the fetched
+ row in the MySQL format */
+ ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
+ row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
+ table handle; this contains the info
+ of search_tuple, index; if search
+ tuple contains 0 fields then we
+ position the cursor at the start or
+ the end of the index, depending on
+ 'mode' */
+ ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
+ ROW_SEL_EXACT_PREFIX */
+ ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
+ ROW_SEL_PREV; NOTE: if this is != 0,
+ then prebuilt must have a pcur
+ with stored position! In opening of a
+ cursor 'direction' should be 0. */
+{
+ dict_index_t* index = prebuilt->index;
+ ibool comp = dict_table_is_comp(index->table);
+ const dtuple_t* search_tuple = prebuilt->search_tuple;
+ btr_pcur_t* pcur = prebuilt->pcur;
+ trx_t* trx = prebuilt->trx;
+ dict_index_t* clust_index;
+ que_thr_t* thr;
+ const rec_t* rec;
+ const rec_t* result_rec;
+ const rec_t* clust_rec;
+ ulint err = DB_SUCCESS;
+ ibool unique_search = FALSE;
+ ibool unique_search_from_clust_index = FALSE;
+ ibool mtr_has_extra_clust_latch = FALSE;
+ ibool moves_up = FALSE;
+ ibool set_also_gap_locks = TRUE;
+ /* if the query is a plain locking SELECT, and the isolation level
+ is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
+ ibool did_semi_consistent_read = FALSE;
+ /* if the returned record was locked and we did a semi-consistent
+ read (fetch the newest committed version), then this is set to
+ TRUE */
+#ifdef UNIV_SEARCH_DEBUG
+ ulint cnt = 0;
+#endif /* UNIV_SEARCH_DEBUG */
+ ulint next_offs;
+ ibool same_user_rec;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ ibool some_fields_in_buffer;
+ ibool problematic_use = FALSE;
+	ibool		get_clust_rec	= FALSE;
+
+ rec_offs_init(offsets_);
+
+ ut_ad(index && pcur && search_tuple);
+ ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+ if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: Error:\n"
+ "InnoDB: MySQL is trying to use a table handle"
+ " but the .ibd file for\n"
+ "InnoDB: table %s does not exist.\n"
+ "InnoDB: Have you deleted the .ibd file"
+ " from the database directory under\n"
+ "InnoDB: the MySQL datadir, or have you used"
+ " DISCARD TABLESPACE?\n"
+			"InnoDB: Please refer to\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+			"InnoDB: for how you can resolve the problem.\n",
+ prebuilt->table->name);
+
+ return(DB_ERROR);
+ }
+
+ if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+
+ return(DB_MISSING_HISTORY);
+ }
+
+ if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+ fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+			"InnoDB: table handle. Magic n %lu, table name ",
+ (ulong) prebuilt->magic_n);
+ ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+ putc('\n', stderr);
+
+ mem_analyze_corruption(prebuilt);
+
+ ut_error;
+ }
+
+#if 0
+ /* August 19, 2005 by Heikki: temporarily disable this error
+ print until the cursor lock count is done correctly.
+ See bugs #12263 and #12456!*/
+
+ if (trx->n_mysql_tables_in_use == 0
+ && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
+ /* Note that if MySQL uses an InnoDB temp table that it
+ created inside LOCK TABLES, then n_mysql_tables_in_use can
+ be zero; in that case select_lock_type is set to LOCK_X in
+ ::start_stmt. */
+
+ fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
+ "InnoDB: but it has not locked"
+ " any tables in ::external_lock()!\n",
+ stderr);
+ trx_print(stderr, trx, 600);
+ fputc('\n', stderr);
+ }
+#endif
+
+#if 0
+ fprintf(stderr, "Match mode %lu\n search tuple ",
+ (ulong) match_mode);
+ dtuple_print(search_tuple);
+ fprintf(stderr, "N tables locked %lu\n",
+ (ulong) trx->mysql_n_tables_locked);
+#endif
+ /*-------------------------------------------------------------*/
+ /* PHASE 0: Release a possible s-latch we are holding on the
+ adaptive hash index latch if there is someone waiting behind */
+
+ if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
+ && trx->has_search_latch) {
+
+ /* There is an x-latch request on the adaptive hash index:
+ release the s-latch to reduce starvation and wait for
+ BTR_SEA_TIMEOUT rounds before trying to keep it again over
+ calls from MySQL */
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+ }
+
+ /* Reset the new record lock info if srv_locks_unsafe_for_binlog
+	is set or the session is using a READ COMMITTED isolation level. Then
+ we are able to remove the record locks set here on an individual
+ row. */
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 1: Try to pop the row from the prefetch cache */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ trx->op_info = "starting index read";
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ if (prebuilt->sel_graph == NULL) {
+ /* Build a dummy select query graph */
+ row_prebuild_sel_graph(prebuilt);
+ }
+ } else {
+ trx->op_info = "fetching rows";
+
+ if (prebuilt->n_rows_fetched == 0) {
+ prebuilt->fetch_direction = direction;
+ }
+
+ if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
+ if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
+ ut_error;
+ /* TODO: scrollable cursor: restore cursor to
+ the place of the latest returned row,
+ or better: prevent caching for a scroll
+ cursor! */
+ }
+
+ prebuilt->n_rows_fetched = 0;
+ prebuilt->n_fetch_cached = 0;
+ prebuilt->fetch_cache_first = 0;
+
+ } else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ prebuilt->n_rows_fetched++;
+
+ srv_n_rows_read++;
+ err = DB_SUCCESS;
+ goto func_exit;
+ }
+
+ if (prebuilt->fetch_cache_first > 0
+ && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+ /* The previous returned row was popped from the fetch
+ cache, but the cache was not full at the time of the
+ popping: no more rows can exist in the result set */
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+
+ prebuilt->n_rows_fetched++;
+
+ if (prebuilt->n_rows_fetched > 1000000000) {
+ /* Prevent wrap-over */
+ prebuilt->n_rows_fetched = 500000000;
+ }
+
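+		/* Continue the fetch with the search mode that was used
+		when the cursor was opened. */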
+ mode = pcur->search_mode;
+ }
+
+ /* In a search where at most one record in the index may match, we
+ can use a LOCK_REC_NOT_GAP type record lock when locking a
+ non-delete-marked matching record.
+
+ Note that in a unique secondary index there may be different
+ delete-marked versions of a record where only the primary key
+ values differ: thus in a secondary index we must use next-key
+ locks when locking delete-marked records. */
+
+ if (match_mode == ROW_SEL_EXACT
+ && dict_index_is_unique(index)
+ && dtuple_get_n_fields(search_tuple)
+ == dict_index_get_n_unique(index)
+ && (dict_index_is_clust(index)
+ || !dtuple_contains_null(search_tuple))) {
+
+ /* Note above that a UNIQUE secondary index can contain many
+ rows with the same key value if one of the columns is the SQL
+ null. A clustered index under MySQL can never contain null
+ columns because we demand that all the columns in primary key
+ are non-null. */
+
+ unique_search = TRUE;
+
+		/* Even if the condition is unique, MySQL seems to also try
+		to retrieve a second row if the primary key contains more
+		than one column. Return immediately if this is not a HANDLER
+		command. */
+
+ if (UNIV_UNLIKELY(direction != 0
+ && !prebuilt->used_in_HANDLER)) {
+
+ err = DB_RECORD_NOT_FOUND;
+ goto func_exit;
+ }
+ }
+
+ mtr_start(&mtr);
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 2: Try fast adaptive hash index search if possible */
+
+ /* Next test if this is the special case where we can use the fast
+ adaptive hash index to try the search. Since we must release the
+ search system latch when we retrieve an externally stored field, we
+ cannot use the adaptive hash index in a search in the case the row
+ may be long and there may be externally stored fields */
+
+ if (UNIV_UNLIKELY(direction == 0)
+ && unique_search
+ && dict_index_is_clust(index)
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->used_in_HANDLER
+ && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
+
+ mode = PAGE_CUR_GE;
+
+ unique_search_from_clust_index = TRUE;
+
+ if (trx->mysql_n_tables_locked == 0
+ && prebuilt->select_lock_type == LOCK_NONE
+ && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+ && trx->read_view) {
+
+ /* This is a SELECT query done as a consistent read,
+ and the read view has already been allocated:
+ let us try a search shortcut through the hash
+ index.
+ NOTE that we must also test that
+ mysql_n_tables_locked == 0, because this might
+ also be INSERT INTO ... SELECT ... or
+ CREATE TABLE ... SELECT ... . Our algorithm is
+			NOT prepared for inserts interleaved with the SELECT,
+ and if we try that, we can deadlock on the adaptive
+ hash index semaphore! */
+
+#ifndef UNIV_SEARCH_DEBUG
+ if (!trx->has_search_latch) {
+ rw_lock_s_lock(&btr_search_latch);
+ trx->has_search_latch = TRUE;
+ }
+#endif
+ switch (row_sel_try_search_shortcut_for_mysql(
+ &rec, prebuilt, &offsets, &heap,
+ &mtr)) {
+ case SEL_FOUND:
+#ifdef UNIV_SEARCH_DEBUG
+ ut_a(0 == cmp_dtuple_rec(search_tuple,
+ rec, offsets));
+#endif
+ /* At this point, rec is protected by
+ a page latch that was acquired by
+ row_sel_try_search_shortcut_for_mysql().
+ The latch will not be released until
+ mtr_commit(&mtr). */
+ ut_ad(!rec_get_deleted_flag(rec, comp));
+
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ rec, offsets, 0,
+ prebuilt->n_template)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such
+ records do not exist. Such
+ records may only be accessed
+ at the READ UNCOMMITTED
+ isolation level or when
+ rolling back a recovered
+ transaction. Rollback happens
+ at a lower level, not here. */
+ ut_a(trx->isolation_level
+ == TRX_ISO_READ_UNCOMMITTED);
+
+ /* Proceed as in case SEL_RETRY. */
+ break;
+ }
+
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" shortcut\n", stderr); */
+
+ srv_n_rows_read++;
+
+ err = DB_SUCCESS;
+ goto release_search_latch_if_needed;
+
+ case SEL_EXHAUSTED:
+ mtr_commit(&mtr);
+
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 2\n", stderr); */
+
+ err = DB_RECORD_NOT_FOUND;
+release_search_latch_if_needed:
+ if (trx->search_latch_timeout > 0
+ && trx->has_search_latch) {
+
+ trx->search_latch_timeout--;
+
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ /* NOTE that we do NOT store the cursor
+ position */
+ goto func_exit;
+
+ case SEL_RETRY:
+ break;
+
+ default:
+ ut_ad(0);
+ }
+
+ mtr_commit(&mtr);
+ mtr_start(&mtr);
+ }
+ }
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 3: Open or restore index cursor position */
+
+ if (trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+ trx->has_search_latch = FALSE;
+ }
+
+ ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
+ ut_ad(trx->conc_state == TRX_NOT_STARTED
+ || trx->conc_state == TRX_ACTIVE);
+ ut_ad(prebuilt->sql_stat_start
+ || prebuilt->select_lock_type != LOCK_NONE
+ || trx->read_view);
+
+ trx_start_if_not_started(trx);
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE
+ && trx->mysql_thd != NULL
+ && thd_is_select(trx->mysql_thd)) {
+ /* It is a plain locking SELECT and the isolation
+ level is low: do not lock gaps */
+
+ set_also_gap_locks = FALSE;
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+ naturally moves upward (in fetch next) in alphabetical order,
+ otherwise downward */
+
+ if (UNIV_UNLIKELY(direction == 0)) {
+ if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+ moves_up = TRUE;
+ }
+ } else if (direction == ROW_SEL_NEXT) {
+ moves_up = TRUE;
+ }
+
+ thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+ que_thr_move_to_run_state_for_mysql(thr, trx);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ if (UNIV_LIKELY(direction != 0)) {
+ ibool need_to_process = sel_restore_position_for_mysql(
+ &same_user_rec, BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr);
+
+ if (UNIV_UNLIKELY(need_to_process)) {
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ /* We did a semi-consistent read,
+ but the record was removed in
+ the meantime. */
+ prebuilt->row_read_type
+ = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ } else if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+ /* The cursor was positioned on the record
+ that we returned previously. If we need
+ to repeat a semi-consistent read as a
+ pessimistic locking read, the record
+ cannot be skipped. */
+
+ goto next_rec;
+ }
+
+ } else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+ btr_pcur_open_with_no_init(index, search_tuple, mode,
+ BTR_SEARCH_LEAF,
+ pcur, 0, &mtr);
+
+ pcur->trx_if_known = trx;
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (!moves_up
+ && !page_rec_is_supremum(rec)
+ && set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the next index record
+ to prevent phantoms in ORDER BY ... DESC queries */
+ const rec_t* next = page_rec_get_next_const(rec);
+
+ offsets = rec_get_offsets(next, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ next, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_GAP, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
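+				/* fall through */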
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+ } else {
+ if (mode == PAGE_CUR_G) {
+ btr_pcur_open_at_index_side(
+ TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
+ &mtr);
+ } else if (mode == PAGE_CUR_L) {
+ btr_pcur_open_at_index_side(
+ FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
+ &mtr);
+ }
+ }
+
+ if (!prebuilt->mysql_has_locked) {
+ fprintf(stderr, "InnoDB: Error: row_search_for_mysql() is called without ha_innobase::external_lock()\n");
+ if (trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
+ }
+ problematic_use = TRUE;
+ }
+retry_check:
+
+ if (!prebuilt->sql_stat_start) {
+ /* No need to set an intention lock or assign a read view */
+
+ if (trx->read_view == NULL
+ && prebuilt->select_lock_type == LOCK_NONE) {
+
+ fputs("InnoDB: Error: MySQL is trying to"
+ " perform a consistent read\n"
+ "InnoDB: but the read view is not assigned!\n",
+ stderr);
+ if (problematic_use) {
+				fprintf(stderr, "InnoDB: This may be caused by calling "
+						"without ha_innobase::external_lock()\n"
+						"InnoDB: As first aid we avoid the crash, "
+						"but this should be fixed as soon as possible.\n");
+ prebuilt->sql_stat_start = TRUE;
+ goto retry_check;
+ }
+ trx_print(stderr, trx, 600);
+ fputc('\n', stderr);
+ ut_a(0);
+ }
+ } else if (prebuilt->select_lock_type == LOCK_NONE) {
+ /* This is a consistent read */
+ /* Assign a read view for the query */
+
+ trx_assign_read_view(trx);
+ prebuilt->sql_stat_start = FALSE;
+ } else {
+ ulint lock_mode;
+ if (prebuilt->select_lock_type == LOCK_S) {
+ lock_mode = LOCK_IS;
+ } else {
+ lock_mode = LOCK_IX;
+ }
+ err = lock_table(0, index->table, lock_mode, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+ prebuilt->sql_stat_start = FALSE;
+ }
+
+rec_loop:
+ /*-------------------------------------------------------------*/
+ /* PHASE 4: Look for matching records in a loop */
+
+ rec = btr_pcur_get_rec(pcur);
+
+ if (srv_pass_corrupt_table && !rec) {
+ err = DB_CORRUPTION;
+ goto lock_wait_or_error;
+ }
+ ut_a(rec);
+
+ ut_ad(!!page_rec_is_comp(rec) == comp);
+#ifdef UNIV_SEARCH_DEBUG
+ /*
+ fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
+ page_get_page_no(page_align(rec)));
+ rec_print(rec);
+ */
+#endif /* UNIV_SEARCH_DEBUG */
+
+ if (page_rec_is_infimum(rec)) {
+
+ /* The infimum record on a page cannot be in the result set,
+ and neither can a record lock be placed on it: we skip such
+ a record. */
+
+ goto next_rec;
+ }
+
+ if (page_rec_is_supremum(rec)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a lock on the index record */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+ or this session is using a READ COMMITTED isolation
+ level we do not lock gaps. Supremum record is really
+ a gap and therefore we do not set locks there. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ LOCK_ORDINARY, thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ err = DB_SUCCESS;
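+			/* fall through */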
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+ /* A page supremum record cannot be in the result set: skip
+ it now that we have placed a possible lock on it */
+
+ goto next_rec;
+ }
+
+ /*-------------------------------------------------------------*/
+ /* Do sanity checks in case our cursor has bumped into page
+ corruption */
+
+ if (comp) {
+ next_offs = rec_get_next_offs(rec, TRUE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ } else {
+ next_offs = rec_get_next_offs(rec, FALSE);
+ if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
+
+ goto wrong_offs;
+ }
+ }
+
+ if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
+
+wrong_offs:
+ if (srv_force_recovery == 0 || moves_up == FALSE) {
+ ut_print_timestamp(stderr);
+ buf_page_print(page_align(rec), 0);
+ fprintf(stderr,
+ "\nInnoDB: rec address %p,"
+ " buf block fix count %lu\n",
+ (void*) rec, (ulong)
+ btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
+ ->page.buf_fix_count);
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". Run CHECK TABLE. You may need to\n"
+ "InnoDB: restore from a backup, or"
+ " dump + drop + reimport the table.\n",
+ stderr);
+
+ err = DB_CORRUPTION;
+
+ goto lock_wait_or_error;
+ } else {
+ /* The user may be dumping a corrupt table. Jump
+ over the corruption to recover as much as possible. */
+
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the rest of the page.\n",
+ stderr);
+
+ btr_pcur_move_to_last_on_page(pcur, &mtr);
+
+ goto next_rec;
+ }
+ }
+ /*-------------------------------------------------------------*/
+
+ /* Calculate the 'offsets' associated with 'rec' */
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+ if (!rec_validate(rec, offsets)
+ || !btr_index_rec_validate(rec, index, FALSE)) {
+ fprintf(stderr,
+ "InnoDB: Index corruption: rec offs %lu"
+ " next offs %lu, page no %lu,\n"
+ "InnoDB: ",
+ (ulong) page_offset(rec),
+ (ulong) next_offs,
+ (ulong) page_get_page_no(page_align(rec)));
+ dict_index_name_print(stderr, trx, index);
+ fputs(". We try to skip the record.\n",
+ stderr);
+
+ goto next_rec;
+ }
+ }
+
+ /* Note that we cannot trust the up_match value in the cursor at this
+ place because we can arrive here after moving the cursor! Thus
+ we have to recompare rec and search_tuple to determine if they
+ match enough. */
+
+ if (match_mode == ROW_SEL_EXACT) {
+ /* Test if the index record matches completely to search_tuple
+ in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+ /* fputs("Comparing rec and search tuple\n", stderr); */
+
+ if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the index
+ record only if innodb_locks_unsafe_for_binlog
+ option is not set or this session is not
+ using a READ COMMITTED isolation level. */
+
+ err = sel_set_rec_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ err = DB_RECORD_NOT_FOUND;
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 3\n", stderr); */
+
+ goto normal_return;
+ }
+
+ } else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+ if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+ if (set_also_gap_locks
+ && !(srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* Try to place a gap lock on the index
+ record only if innodb_locks_unsafe_for_binlog
+ option is not set or this session is not
+ using a READ COMMITTED isolation level. */
+
+ err = sel_set_rec_lock(
+ btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type, LOCK_GAP,
+ thr);
+
+ switch (err) {
+ case DB_SUCCESS_LOCKED_REC:
+ case DB_SUCCESS:
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+ }
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ err = DB_RECORD_NOT_FOUND;
+ /* ut_print_name(stderr, index->name);
+ fputs(" record not found 4\n", stderr); */
+
+ goto normal_return;
+ }
+ }
+
+ /* We are ready to look at a possible new index entry in the result
+ set: the cursor is now placed on a user record */
+
+ if (prebuilt->select_lock_type != LOCK_NONE) {
+ /* Try to place a lock on the index record; note that delete
+ marked records are a special case in a unique search. If there
+ is a non-delete marked record, then it is enough to lock its
+ existence with LOCK_REC_NOT_GAP. */
+
+ /* If innodb_locks_unsafe_for_binlog option is used
+		or this session is using a READ COMMITTED isolation
+ level we lock only the record, i.e., next-key locking is
+ not used. */
+
+ ulint lock_type;
+
+ if (!set_also_gap_locks
+ || srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ || (unique_search
+ && !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
+
+ goto no_gap_lock;
+ } else {
+ lock_type = LOCK_ORDINARY;
+ }
+
+ /* If we are doing a 'greater or equal than a primary key
+ value' search from a clustered index, and we find a record
+ that has that exact primary key value, then there is no need
+ to lock the gap before the record, because no insert in the
+ gap can be in our search range. That is, no phantom row can
+ appear that way.
+
+ An example: if col1 is the primary key, the search is WHERE
+ col1 >= 100, and we find a record where col1 = 100, then no
+ need to lock the gap before that record. */
+
+ if (index == clust_index
+ && mode == PAGE_CUR_GE
+ && direction == 0
+ && dtuple_get_n_fields_cmp(search_tuple)
+ == dict_index_get_n_unique(index)
+ && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+no_gap_lock:
+ lock_type = LOCK_REC_NOT_GAP;
+ }
+
+ err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+ rec, index, offsets,
+ prebuilt->select_lock_type,
+ lock_type, thr);
+
+ switch (err) {
+ const rec_t* old_vers;
+ case DB_SUCCESS_LOCKED_REC:
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED) {
+ /* Note that a record of
+ prebuilt->index was locked. */
+ prebuilt->new_rec_locks = 1;
+ }
+ err = DB_SUCCESS;
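+			/* fall through */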
+ case DB_SUCCESS:
+ break;
+ case DB_LOCK_WAIT:
+ /* Never unlock rows that were part of a conflict. */
+ prebuilt->new_rec_locks = 0;
+
+ if (UNIV_LIKELY(prebuilt->row_read_type
+ != ROW_READ_TRY_SEMI_CONSISTENT)
+ || unique_search
+ || index != clust_index) {
+
+ goto lock_wait_or_error;
+ }
+
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_committed_vers_for_mysql(
+ clust_index, prebuilt, rec,
+ &offsets, &heap, &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ mutex_enter(&kernel_mutex);
+ if (trx->was_chosen_as_deadlock_victim) {
+ mutex_exit(&kernel_mutex);
+ err = DB_DEADLOCK;
+
+ goto lock_wait_or_error;
+ }
+ if (UNIV_LIKELY(trx->wait_lock != NULL)) {
+ lock_cancel_waiting_and_release(
+ trx->wait_lock);
+ } else {
+ mutex_exit(&kernel_mutex);
+
+ /* The lock was granted while we were
+ searching for the last committed version.
+ Do a normal locking read. */
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED,
+ &heap);
+ err = DB_SUCCESS;
+ break;
+ }
+ mutex_exit(&kernel_mutex);
+
+ if (old_vers == NULL) {
+ /* The row was not yet committed */
+
+ goto next_rec;
+ }
+
+ did_semi_consistent_read = TRUE;
+ rec = old_vers;
+ break;
+ default:
+
+ goto lock_wait_or_error;
+ }
+ } else {
+ /* This is a non-locking consistent read: if necessary, fetch
+ a previous version of the record */
+
+ if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+ /* Do nothing: we let a non-locking SELECT read the
+ latest version of the record */
+
+ } else if (index == clust_index) {
+
+ /* Fetch a previous version of the row if the current
+ one is not visible in the snapshot; if we have a very
+ high force recovery level set, we try to avoid crashes
+ by skipping this lookup */
+
+ if (UNIV_LIKELY(srv_force_recovery < 5)
+ && !lock_clust_rec_cons_read_sees(
+ rec, index, offsets, trx->read_view)) {
+
+ rec_t* old_vers;
+ /* The following call returns 'offsets'
+ associated with 'old_vers' */
+ err = row_sel_build_prev_vers_for_mysql(
+ trx->read_view, clust_index,
+ prebuilt, rec, &offsets, &heap,
+ &old_vers, &mtr);
+
+ if (err != DB_SUCCESS) {
+
+ goto lock_wait_or_error;
+ }
+
+ if (old_vers == NULL) {
+ /* The row did not exist yet in
+ the read view */
+
+ goto next_rec;
+ }
+
+ rec = old_vers;
+ }
+ } else {
+ /* We are looking into a non-clustered index,
+ and to get the right version of the record we
+ have to look also into the clustered index: this
+ is necessary, because we can only get the undo
+ information via the clustered index record. */
+
+ ut_ad(index != clust_index);
+ ut_ad(!dict_index_is_clust(index));
+
+ if (!lock_sec_rec_cons_read_sees(
+ rec, trx->read_view)) {
+ get_clust_rec = TRUE;
+ goto idx_cond_check;
+ }
+ }
+ }
+
+ /* NOTE that at this point rec can be an old version of a clustered
+ index record built for a consistent read. We cannot assume after this
+ point that rec is on a buffer pool page. Functions like
+ page_rec_is_comp() cannot be used! */
+
+ if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
+
+ /* The record is delete-marked: we can skip it */
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE
+ && !did_semi_consistent_read) {
+
+ /* No need to keep a lock on a delete-marked record
+ if we do not want to use next-key locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+ /* This is an optimization to skip setting the next key lock
+ on the record that follows this delete-marked record. This
+ optimization works because of the unique search criteria
+ which precludes the presence of a range lock between this
+ delete marked record and the record following it.
+
+ For now this is applicable only to clustered indexes while
+ doing a unique search. There is scope for further optimization
+ applicable to unique secondary indexes. Current behaviour is
+ to widen the scope of a lock on an already delete marked record
+ if the same record is deleted twice by the same transaction */
+ if (index == clust_index && unique_search) {
+ err = DB_RECORD_NOT_FOUND;
+
+ goto normal_return;
+ }
+
+ goto next_rec;
+ }
+
+
+idx_cond_check:
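+	/* If an index condition has been pushed down, convert the index
+	columns into buf and evaluate the condition on them: a return value
+	of 0 means the row does not match and is skipped, and 2 means the
+	scan range is exhausted. */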
+ if (prebuilt->idx_cond_func) {
+ int res;
+ ut_ad(prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE);
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+ row_sel_store_mysql_rec(buf, prebuilt, rec,
+ offsets, 0, prebuilt->n_index_fields);
+		res = prebuilt->idx_cond_func(prebuilt->idx_cond_func_arg);
+		if (res == 0) {
+			goto next_rec;
+		}
+ if (res == 2) {
+ err = DB_RECORD_NOT_FOUND;
+ goto idx_cond_failed;
+ }
+ }
+
+ /* Get the clustered index record if needed, if we did not do the
+ search using the clustered index. */
+ if (get_clust_rec || (index != clust_index
+ && prebuilt->need_to_access_clustered)) {
+
+ /* We use a 'goto' to the preceding label if a consistent
+ read of a secondary index record requires us to look up old
+ versions of the associated clustered index record. */
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ /* It was a non-clustered index and we must fetch also the
+ clustered index record */
+
+ mtr_has_extra_clust_latch = TRUE;
+
+ /* The following call returns 'offsets' associated with
+ 'clust_rec'. Note that 'clust_rec' can be an old version
+ built for a consistent read. */
+
+ err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+ thr, &clust_rec,
+ &offsets, &heap, &mtr);
+ switch (err) {
+ case DB_SUCCESS:
+ if (clust_rec == NULL) {
+ /* The record did not exist in the read view */
+ ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+ goto next_rec;
+ }
+ break;
+ case DB_SUCCESS_LOCKED_REC:
+ ut_a(clust_rec != NULL);
+ if (srv_locks_unsafe_for_binlog
+ || trx->isolation_level
+ <= TRX_ISO_READ_COMMITTED) {
+ /* Note that the clustered index record
+ was locked. */
+ prebuilt->new_rec_locks = 2;
+ }
+ err = DB_SUCCESS;
+ break;
+ default:
+ goto lock_wait_or_error;
+ }
+
+ if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
+
+ /* The record is delete marked: we can skip it */
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && prebuilt->select_lock_type != LOCK_NONE) {
+
+ /* No need to keep a lock on a delete-marked
+ record if we do not want to use next-key
+ locking. */
+
+ row_unlock_for_mysql(prebuilt, TRUE);
+ }
+
+ goto next_rec;
+ }
+
+ if (prebuilt->need_to_access_clustered) {
+
+ result_rec = clust_rec;
+
+ ut_ad(rec_offs_validate(result_rec, clust_index,
+ offsets));
+ } else {
+ /* We used 'offsets' for the clust rec, recalculate
+ them for 'rec' */
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ result_rec = rec;
+ }
+
+ /* result_rec can legitimately be delete-marked
+ now that it has been established that it points to a
+ clustered index record that exists in the read view. */
+ } else {
+ result_rec = rec;
+ ut_ad(!rec_get_deleted_flag(rec, comp));
+ }
+
+ /* We found a qualifying record 'result_rec'. At this point,
+ 'offsets' are associated with 'result_rec'. */
+
+ ut_ad(rec_offs_validate(result_rec,
+ result_rec != rec ? clust_index : index,
+ offsets));
+
+ /* At this point, the clustered index record is protected
+ by a page latch that was acquired when pcur was positioned.
+ The latch will not be released until mtr_commit(&mtr). */
+
+ if ((match_mode == ROW_SEL_EXACT
+ || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
+ && prebuilt->select_lock_type == LOCK_NONE
+ && !prebuilt->templ_contains_blob
+ && !prebuilt->clust_index_was_generated
+ && !prebuilt->used_in_HANDLER
+ && prebuilt->template_type
+ != ROW_MYSQL_DUMMY_TEMPLATE) {
+
+ /* Inside an update, for example, we do not cache rows,
+ since we may use the cursor position to do the actual
+ update, that is why we require ...lock_type == LOCK_NONE.
+ Since we keep space in prebuilt only for the BLOBs of
+ a single row, we cannot cache rows in the case there
+ are BLOBs in the fields to be fetched. In HANDLER we do
+ not cache rows because there the cursor is a scrollable
+ cursor. */
+ some_fields_in_buffer = (index != clust_index
+ && prebuilt->idx_cond_func);
+
+ if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
+ offsets,
+ some_fields_in_buffer?
+ prebuilt->n_index_fields : 0,
+ buf)) {
+ /* Only fresh inserts may contain incomplete
+ externally stored columns. Pretend that such
+ records do not exist. Such records may only be
+ accessed at the READ UNCOMMITTED isolation
+ level or when rolling back a recovered
+ transaction. Rollback happens at a lower
+ level, not here. */
+ ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
+ } else if (prebuilt->n_fetch_cached
+ == MYSQL_FETCH_CACHE_SIZE) {
+
+ goto got_row;
+ }
+
+ goto next_rec;
+ } else {
+ if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
+ memcpy(buf + 4, result_rec
+ - rec_offs_extra_size(offsets),
+ rec_offs_size(offsets));
+ mach_write_to_4(buf,
+ rec_offs_extra_size(offsets) + 4);
+ } else {
+ if (!row_sel_store_mysql_rec(buf, prebuilt,
+ result_rec, offsets,
+ prebuilt->idx_cond_func?
+ prebuilt->n_index_fields: 0,
+ prebuilt->n_template)) {
+ /* Only fresh inserts may contain
+ incomplete externally stored
+ columns. Pretend that such records do
+ not exist. Such records may only be
+ accessed at the READ UNCOMMITTED
+ isolation level or when rolling back a
+ recovered transaction. Rollback
+ happens at a lower level, not here. */
+ ut_a(trx->isolation_level
+ == TRX_ISO_READ_UNCOMMITTED);
+ goto next_rec;
+ }
+ }
+
+ if (prebuilt->clust_index_was_generated) {
+ if (result_rec != rec) {
+ offsets = rec_get_offsets(
+ rec, index, offsets, ULINT_UNDEFINED,
+ &heap);
+ }
+ row_sel_store_row_id_to_prebuilt(prebuilt, rec,
+ index, offsets);
+ }
+ }
+
+ /* From this point on, 'offsets' are invalid. */
+
+got_row:
+ /* We have an optimization to save CPU time: if this is a consistent
+ read on a unique condition on the clustered index, then we do not
+ store the pcur position, because any fetch next or prev will anyway
+ return 'end of file'. Exceptions are locking reads and the MySQL
+ HANDLER command where the user can move the cursor with PREV or NEXT
+ even after a unique search. */
+
+ err = DB_SUCCESS;
+
+idx_cond_failed:
+ if (!unique_search_from_clust_index
+ || prebuilt->select_lock_type != LOCK_NONE
+ || prebuilt->used_in_HANDLER) {
+
+ /* Inside an update always store the cursor position */
+
+ btr_pcur_store_position(pcur, &mtr);
+ }
+
+ goto normal_return;
+
+next_rec:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ get_clust_rec = FALSE;
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = FALSE;
+ prebuilt->new_rec_locks = 0;
+
+ /*-------------------------------------------------------------*/
+ /* PHASE 5: Move the cursor to the next index record */
+
+ if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
+ /* We must commit mtr if we are moving to the next
+ non-clustered index record, because we could break the
+ latching order if we would access a different clustered
+ index page right away without releasing the previous. */
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ mtr_start(&mtr);
+ if (sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF,
+ pcur, moves_up, &mtr)) {
+#ifdef UNIV_SEARCH_DEBUG
+ cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+ goto rec_loop;
+ }
+ }
+
+ if (moves_up) {
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
+not_moved:
+ btr_pcur_store_position(pcur, &mtr);
+
+ if (match_mode != 0) {
+ err = DB_RECORD_NOT_FOUND;
+ } else {
+ err = DB_END_OF_INDEX;
+ }
+
+ goto normal_return;
+ }
+ } else {
+ if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
+ goto not_moved;
+ }
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+ cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+ goto rec_loop;
+
+lock_wait_or_error:
+ /* Reset the old and new "did semi-consistent read" flags. */
+ if (UNIV_UNLIKELY(prebuilt->row_read_type
+ == ROW_READ_DID_SEMI_CONSISTENT)) {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ did_semi_consistent_read = FALSE;
+
+ /*-------------------------------------------------------------*/
+
+ btr_pcur_store_position(pcur, &mtr);
+
+ mtr_commit(&mtr);
+ mtr_has_extra_clust_latch = FALSE;
+
+ trx->error_state = err;
+
+ /* The following is a patch for MySQL */
+
+ que_thr_stop_for_mysql(thr);
+
+ thr->lock_state = QUE_THR_LOCK_ROW;
+
+ if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+ /* It was a lock wait, and it ended */
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+ mtr_start(&mtr);
+
+ sel_restore_position_for_mysql(&same_user_rec,
+ BTR_SEARCH_LEAF, pcur,
+ moves_up, &mtr);
+
+ if ((srv_locks_unsafe_for_binlog
+ || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+ && !same_user_rec) {
+
+ /* Since we were not able to restore the cursor
+ on the same user record, we cannot use
+ row_unlock_for_mysql() to unlock any records, and
+ we must thus reset the new rec lock info. Since
+ in lock0lock.c we have blocked the inheriting of gap
+ X-locks, we actually do not have any new record locks
+ set in this case.
+
+ Note that if we were able to restore on the 'same'
+ user record, it is still possible that we were actually
+ waiting on a delete-marked record, and meanwhile
+ it was removed by purge and inserted again by some
+ other user. But that is no problem, because in
+ rec_loop we will again try to set a lock, and
+ new_rec_lock_info in trx will be right at the end. */
+
+ prebuilt->new_rec_locks = 0;
+ }
+
+ mode = pcur->search_mode;
+
+ goto rec_loop;
+ }
+
+ thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+#ifdef UNIV_SEARCH_DEBUG
+ /* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+ goto func_exit;
+
+normal_return:
+ /*-------------------------------------------------------------*/
+ que_thr_stop_for_mysql_no_error(thr, trx);
+
+ mtr_commit(&mtr);
+
+ if (prebuilt->n_fetch_cached > 0) {
+ row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+ err = DB_SUCCESS;
+ }
+
+#ifdef UNIV_SEARCH_DEBUG
+ /* fputs("Using ", stderr);
+ dict_index_name_print(stderr, index);
+ fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+ if (err == DB_SUCCESS) {
+ srv_n_rows_read++;
+ }
+
+func_exit:
+ trx->op_info = "";
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ /* Set or reset the "did semi-consistent read" flag on return.
+ The flag did_semi_consistent_read is set if and only if
+ the record being returned was fetched with a semi-consistent read. */
+ ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
+ || !did_semi_consistent_read);
+
+ if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
+ if (UNIV_UNLIKELY(did_semi_consistent_read)) {
+ prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
+ } else {
+ prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+ }
+ }
+ return(err);
+}
+
+/*******************************************************************//**
+Checks if MySQL is currently allowed to retrieve a consistent read result
+for this table, or to store such a result in the query cache.
+@return TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+ trx_t* trx, /*!< in: transaction object */
+ const char* norm_name) /*!< in: concatenation of database name,
+ '/' char, table name */
+{
+ dict_table_t* table;
+ ibool ret = FALSE;
+
+ table = dict_table_get(norm_name, FALSE);
+
+ if (table == NULL) {
+
+ return(FALSE);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ /* Start the transaction if it is not started yet */
+
+ trx_start_if_not_started_low(trx);
+
+ /* If there are locks on the table or some trx has invalidated the
+ cache up to our trx id, then ret = FALSE.
+	We do not check what type of locks there are on the table, though only
+	IX type locks actually would require ret = FALSE. */
+
+ if (UT_LIST_GET_LEN(table->locks) == 0
+ && ut_dulint_cmp(trx->id,
+ table->query_cache_inv_trx_id) >= 0) {
+
+ ret = TRUE;
+
+ /* If the isolation level is high, assign a read view for the
+ transaction if it does not yet have one */
+
+ if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+ && !trx->read_view) {
+
+ trx->read_view = read_view_open_now(
+ trx->id, trx->global_read_view_heap);
+ trx->global_read_view = trx->read_view;
+ }
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Read the AUTOINC column from the current row. If the value is less than
+0 and the type is not unsigned then we reset the value to 0.
+@return value read from the column */
+static
+ib_uint64_t
+row_search_autoinc_read_column(
+/*===========================*/
+ dict_index_t* index, /*!< in: index to read from */
+ const rec_t* rec, /*!< in: current rec */
+ ulint col_no, /*!< in: column number */
+ ulint mtype, /*!< in: column main type */
+ ibool unsigned_type) /*!< in: signed or unsigned flag */
+{
+ ulint len;
+ const byte* data;
+ ib_uint64_t value;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+
+ rec_offs_init(offsets_);
+
+ offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+ data = rec_get_nth_field(rec, offsets, col_no, &len);
+
+ ut_a(len != UNIV_SQL_NULL);
+
+ switch (mtype) {
+ case DATA_INT:
+ ut_a(len <= sizeof value);
+ value = mach_read_int_type(data, len, unsigned_type);
+ break;
+
+ case DATA_FLOAT:
+ ut_a(len == sizeof(float));
+ value = (ib_uint64_t) mach_float_read(data);
+ break;
+
+ case DATA_DOUBLE:
+ ut_a(len == sizeof(double));
+ value = (ib_uint64_t) mach_double_read(data);
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (!unsigned_type && (ib_int64_t) value < 0) {
+ value = 0;
+ }
+
+ return(value);
+}
+
+/*******************************************************************//**
+Get the last row.
+@return current rec or NULL */
+static
+const rec_t*
+row_search_autoinc_get_rec(
+/*=======================*/
+ btr_pcur_t* pcur, /*!< in: the current cursor */
+ mtr_t* mtr) /*!< in: mini transaction */
+{
+ do {
+ const rec_t* rec = btr_pcur_get_rec(pcur);
+
+ if (page_rec_is_user_rec(rec)) {
+ return(rec);
+ }
+ } while (btr_pcur_move_to_prev(pcur, mtr));
+
+ return(NULL);
+}
+
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return DB_SUCCESS if all OK, else error code; DB_RECORD_NOT_FOUND if
+the column name cannot be found in the index */
+UNIV_INTERN
+ulint
+row_search_max_autoinc(
+/*===================*/
+ dict_index_t* index, /*!< in: index to search */
+ const char* col_name, /*!< in: name of autoinc column */
+ ib_uint64_t* value) /*!< out: AUTOINC value read */
+{
+ ulint i;
+ ulint n_cols;
+ dict_field_t* dfield = NULL;
+ ulint error = DB_SUCCESS;
+
+ n_cols = dict_index_get_n_ordering_defined_by_user(index);
+
+ /* Search the index for the AUTOINC column name */
+ for (i = 0; i < n_cols; ++i) {
+ dfield = dict_index_get_nth_field(index, i);
+
+ if (strcmp(col_name, dfield->name) == 0) {
+ break;
+ }
+ }
+
+ *value = 0;
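+	/* Note: *value stays 0 if the column is not found or the index
+	contains no user records. */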
+
+ /* Must find the AUTOINC column name */
+ if (i < n_cols && dfield) {
+ mtr_t mtr;
+ btr_pcur_t pcur;
+
+ mtr_start(&mtr);
+
+ /* Open at the high/right end (FALSE), and INIT
+ cursor (TRUE) */
+ btr_pcur_open_at_index_side(
+ FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+ if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
+ const rec_t* rec;
+
+ rec = row_search_autoinc_get_rec(&pcur, &mtr);
+
+ if (rec != NULL) {
+ ibool unsigned_type = (
+ dfield->col->prtype & DATA_UNSIGNED);
+
+ *value = row_search_autoinc_read_column(
+ index, rec, i,
+ dfield->col->mtype, unsigned_type);
+ }
+ }
+
+ btr_pcur_close(&pcur);
+
+ mtr_commit(&mtr);
+ } else {
+ error = DB_RECORD_NOT_FOUND;
+ }
+
+ return(error);
+}
diff --git a/storage/xtradb/row/row0uins.c b/storage/xtradb/row/row0uins.c
new file mode 100644
index 00000000000..930a5cf13b6
--- /dev/null
+++ b/storage/xtradb/row/row0uins.c
@@ -0,0 +1,361 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0uins.c
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***************************************************************//**
+Removes a clustered index record. The pcur in node was positioned on the
+record, now it is detached.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+ undo_node_t* node) /*!< in: undo node */
+{
+ btr_cur_t* btr_cur;
+ ibool success;
+ ulint err;
+ ulint n_tries = 0;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+ &mtr);
+ ut_a(success);
+
+ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+ ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
+
+ /* Drop the index tree associated with the row in
+ SYS_INDEXES table: */
+
+ dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+ &(node->pcur), &mtr);
+ ut_a(success);
+ }
+
+ btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (success) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+retry:
+	/* If that did not succeed, try a pessimistic descent to the tree */
+ mtr_start(&mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+ &(node->pcur), &mtr);
+ ut_a(success);
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ trx_is_recv(node->trx)
+ ? RB_RECOVERY
+ : RB_NORMAL, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err == DB_OUT_OF_FILE_SPACE
+ && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+ ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+ depending on whether we wish optimistic or
+ pessimistic descent down the index tree */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry to remove */
+{
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool found;
+ ibool success;
+ ulint err;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* Not found */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* No need to distinguish RB_RECOVERY here, because we
+ are deleting a secondary index record: the distinction
+ between RB_NORMAL and RB_RECOVERY only matters when
+ deleting a record that contains externally stored
+ columns. */
+ ut_ad(!dict_index_is_clust(index));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NORMAL, &mtr);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+ dict_index_t* index, /*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+ ulint err;
+ ulint n_tries = 0;
+
+ /* Try first optimistic descent to the B-tree */
+
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
+ /* Try then pessimistic descent to the B-tree */
+retry:
+ err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+
+ if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+ n_tries++;
+
+ os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+ goto retry;
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+ undo_node_t* node) /*!< in/out: row undo node */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ dulint table_id;
+ ulint type;
+ ulint dummy;
+ ibool dummy_extern;
+
+ ut_ad(node);
+
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+ &dummy_extern, &undo_no, &table_id);
+ ut_ad(type == TRX_UNDO_INSERT_REC);
+ node->rec_type = type;
+
+ node->update = NULL;
+ node->table = dict_table_get_on_id(table_id, node->trx);
+
+ /* Skip the UNDO if we can't find the table or the .ibd file. */
+ if (UNIV_UNLIKELY(node->table == NULL)) {
+ } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
+ node->table = NULL;
+ } else {
+ clust_index = dict_table_get_first_index(node->table);
+
+ if (clust_index != NULL) {
+ ptr = trx_undo_rec_get_row_ref(
+ ptr, clust_index, &node->ref, node->heap);
+ } else {
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: table ");
+ ut_print_name(stderr, node->trx, TRUE,
+ node->table->name);
+ fprintf(stderr, " has no indexes, "
+ "ignoring the table\n");
+
+ node->table = NULL;
+ }
+ }
+}
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert. InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+ulint
+row_undo_ins(
+/*=========*/
+ undo_node_t* node) /*!< in: row undo node */
+{
+ ut_ad(node);
+ ut_ad(node->state == UNDO_NODE_INSERT);
+
+ row_undo_ins_parse_undo_rec(node);
+
+ if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ return(DB_SUCCESS);
+ }
+
+	/* Iterate over all the indexes and undo the insert. */
+
+ /* Skip the clustered index (the first index) */
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ while (node->index != NULL) {
+ dtuple_t* entry;
+ ulint err;
+
+ entry = row_build_index_entry(node->row, node->ext,
+ node->index, node->heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(trx_is_recv(node->trx));
+ } else {
+ log_free_check();
+ err = row_undo_ins_remove_sec(node->index, entry);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ log_free_check();
+ return(row_undo_ins_remove_clust_rec(node));
+}
diff --git a/storage/xtradb/row/row0umod.c b/storage/xtradb/row/row0umod.c
new file mode 100644
index 00000000000..8464b0f95cc
--- /dev/null
+++ b/storage/xtradb/row/row0umod.c
@@ -0,0 +1,866 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0umod.c
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may have delete mark already FALSE, if the delete mark operation was
+stopped underway, or if the undo operation ended prematurely because of a
+system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log beforehand for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Checks whether the previous version of the clustered index record was also
+modified or inserted by the same transaction, and whether its undo number
+is such that it should be undone in the same rollback.
+@return TRUE if the previous modify or insert of this row should also be undone */
+static
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+ undo_node_t* node, /*!< in: row undo node */
+ undo_no_t* undo_no)/*!< out: the undo number */
+{
+ trx_undo_rec_t* undo_rec;
+ trx_t* trx;
+
+ trx = node->trx;
+
+ if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) {
+
+ *undo_no = ut_dulint_zero;
+ return(FALSE);
+ }
+
+ undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap);
+
+ *undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0);
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr; must be committed before
+ latching any further pages */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ ut_ad(success);
+
+ if (mode == BTR_MODIFY_LEAF) {
+
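+		/* Optimistic path: apply the rollback update within the
+		leaf page, without changing the tree structure */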
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ mem_heap_t* heap = NULL;
+ big_rec_t* dummy_big_rec;
+
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ err = btr_cur_pessimistic_update(
+ BTR_NO_LOCKING_FLAG
+ | BTR_NO_UNDO_LOG_FLAG
+ | BTR_KEEP_SYS_FLAG,
+ btr_cur, &heap, &dummy_big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+
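+		/* Rolling back an update must not produce an externally
+		stored (big) record */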
+ ut_a(!dummy_big_rec);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Removes a clustered index record after undo if possible.
+This is attempted when the record was inserted by updating a
+delete-marked record and there no longer exist transactions
+that would see the delete-marked record. In other words, we
+roll back the insert by purging the record.
+@return DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_remove_clust_low(
+/*==========================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr, /*!< in: mtr */
+ ulint mode) /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+ ibool success;
+
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ pcur = &(node->pcur);
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ success = btr_pcur_restore_position(mode, pcur, mtr);
+
+ if (!success) {
+
+ return(DB_SUCCESS);
+ }
+
+ /* Find out if we can remove the whole clustered index record */
+
+ if (node->rec_type == TRX_UNDO_UPD_DEL_REC
+ && !row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+ /* Ok, we can remove */
+ } else {
+ return(DB_SUCCESS);
+ }
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, mtr);
+
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* This operation is analogous to purge; we can also free
+		inherited externally stored fields */
+
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ thr_is_recv(thr)
+ ? RB_RECOVERY_PURGE_REC
+ : RB_NONE, mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record. Also sets the node state for
+the next round of undo.
+@return DB_SUCCESS or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ btr_pcur_t* pcur;
+ mtr_t mtr;
+ ulint err;
+ ibool success;
+ ibool more_vers;
+ undo_no_t new_undo_no;
+
+ ut_ad(node && thr);
+
+ log_free_check();
+
+ /* Check if also the previous version of the clustered index record
+ should be undone in this same rollback operation */
+
+ more_vers = row_undo_mod_undo_also_prev_vers(node, &new_undo_no);
+
+ pcur = &(node->pcur);
+
+ mtr_start(&mtr);
+
+ /* Try optimistic processing of the record, keeping changes within
+ the index page */
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a pessimistic
+ descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_LEAF);
+ if (err != DB_SUCCESS) {
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+ /* We may have to modify tree structure: do a
+ pessimistic descent down the index tree */
+
+ mtr_start(&mtr);
+
+ err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+ BTR_MODIFY_TREE);
+ }
+
+ btr_pcur_commit_specify_mtr(pcur, &mtr);
+ }
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+
+ if (more_vers && err == DB_SUCCESS) {
+
+ /* Reserve the undo log record to the prior version after
+ committing &mtr: this is necessary to comply with the latching
+ order, as &mtr may contain the fsp latch which is lower in
+ the latch hierarchy than trx->undo_mutex. */
+
+ success = trx_undo_rec_reserve(node->trx, new_undo_no);
+
+ if (success) {
+ node->state = UNDO_NODE_PREV_VERS;
+ }
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+@return DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry, /*!< in: index entry */
+ ulint mode) /*!< in: latch mode BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+{
+ ibool found;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ ibool success;
+ ibool old_has;
+ ulint err;
+ mtr_t mtr;
+ mtr_t mtr_vers;
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ if (!found) {
+ /* In crash recovery, the secondary index record may
+ be missing if the UPDATE did not have time to insert
+ the secondary index records before the crash. When we
+ are undoing that UPDATE in crash recovery, the record
+ may be missing.
+
+ In normal processing, if an update ends in a deadlock
+ before it has inserted all updated secondary index
+ records, then the undo will not find those records. */
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(DB_SUCCESS);
+ }
+
+	/* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some prior
+	version requires it, we should delete mark the record instead. */
+
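+	/* Restore the clustered index cursor under a separate
+	mini-transaction, so that we can check the older versions
+	of the row below */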
+ mtr_start(&mtr_vers);
+
+ success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+ &mtr_vers);
+ ut_a(success);
+
+ old_has = row_vers_old_has_index_entry(FALSE,
+ btr_pcur_get_rec(&(node->pcur)),
+ &mtr_vers, index, entry);
+ if (old_has) {
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, &mtr);
+ ut_ad(err == DB_SUCCESS);
+ } else {
+ /* Remove the index record */
+
+ if (mode == BTR_MODIFY_LEAF) {
+ success = btr_cur_optimistic_delete(btr_cur, &mtr);
+ if (success) {
+ err = DB_SUCCESS;
+ } else {
+ err = DB_FAIL;
+ }
+ } else {
+ ut_ad(mode == BTR_MODIFY_TREE);
+
+ /* No need to distinguish RB_RECOVERY_PURGE here,
+ because we are deleting a secondary index record:
+ the distinction between RB_NORMAL and
+ RB_RECOVERY_PURGE only matters when deleting a
+ record that contains externally stored
+ columns. */
+ ut_ad(!dict_index_is_clust(index));
+ btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+ RB_NORMAL, &mtr);
+
+ /* The delete operation may fail if we have little
+ file space left: TODO: easiest to crash the database
+ and restart with more file space */
+ }
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.c, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ dtuple_t* entry) /*!< in: index entry */
+{
+ ulint err;
+
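+	/* Try first an optimistic delete mark or removal, keeping
+	changes within the leaf page */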
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_LEAF);
+ if (err == DB_SUCCESS) {
+
+ return(err);
+ }
+
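+	/* The optimistic attempt failed: retry with a pessimistic
+	descent that may change the tree structure */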
+ err = row_undo_mod_del_mark_or_remove_sec_low(node, thr, index,
+ entry, BTR_MODIFY_TREE);
+ return(err);
+}
+
+/***********************************************************//**
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does no harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
+@return DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+ ulint mode, /*!< in: search mode: BTR_MODIFY_LEAF or
+ BTR_MODIFY_TREE */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry) /*!< in: index entry */
+{
+ mem_heap_t* heap;
+ btr_pcur_t pcur;
+ upd_t* update;
+ ulint err = DB_SUCCESS;
+ big_rec_t* dummy_big_rec;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+
+ /* Ignore indexes that are being created. */
+ if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) {
+
+ return(DB_SUCCESS);
+ }
+
+ log_free_check();
+ mtr_start(&mtr);
+
+ if (UNIV_UNLIKELY(!row_search_index_entry(index, entry,
+ mode, &pcur, &mtr))) {
+ fputs("InnoDB: error in sec index entry del undo in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+ putc('\n', stderr);
+ trx_print(stderr, trx, 0);
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ } else {
+ btr_cur_t* btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, FALSE, thr, &mtr);
+ ut_a(err == DB_SUCCESS);
+ heap = mem_heap_create(100);
+
+ update = row_upd_build_sec_rec_difference_binary(
+ index, entry, btr_cur_get_rec(btr_cur), trx, heap);
+ if (upd_get_n_fields(update) == 0) {
+
+ /* Do nothing */
+
+ } else if (mode == BTR_MODIFY_LEAF) {
+ /* Try an optimistic updating of the record, keeping
+ changes within the page */
+
+ err = btr_cur_optimistic_update(
+ BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
+ btr_cur, update, 0, thr, &mtr);
+ switch (err) {
+ case DB_OVERFLOW:
+ case DB_UNDERFLOW:
+ case DB_ZIP_OVERFLOW:
+ err = DB_FAIL;
+ }
+ } else {
+ ut_a(mode == BTR_MODIFY_TREE);
+ err = btr_cur_pessimistic_update(
+ BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
+ btr_cur, &heap, &dummy_big_rec,
+ update, 0, thr, &mtr);
+ ut_a(!dummy_big_rec);
+ }
+
+ mem_heap_free(heap);
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_DEL.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The database must have crashed after
+ inserting a clustered index record but before
+ writing all the externally stored columns of
+ that record. Because secondary index entries
+ are inserted after the clustered index record,
+ we may assume that the secondary index record
+ does not exist. However, this situation may
+ only occur during the rollback of incomplete
+ transactions. */
+ ut_a(thr_is_recv(thr));
+ } else {
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+
+ if (err != DB_SUCCESS) {
+
+ break;
+ }
+ }
+
+ mem_heap_empty(heap);
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is DEL_MARK.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ ut_a(entry);
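+		/* Try optimistic processing first; fall back to a
+		pessimistic B-tree descent if the optimistic update
+		cannot be done within the page */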
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
+@return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ dict_index_t* index;
+ ulint err;
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+ /* No change in secondary indexes */
+
+ return(DB_SUCCESS);
+ }
+
+ heap = mem_heap_create(1024);
+
+ while (node->index != NULL) {
+ index = node->index;
+
+ if (row_upd_changes_ord_field_binary(node->row, node->index,
+ node->update)) {
+
+ /* Build the newest version of the index entry */
+ entry = row_build_index_entry(node->row, node->ext,
+ index, heap);
+ if (UNIV_UNLIKELY(!entry)) {
+ /* The server must have crashed in
+ row_upd_clust_rec_by_insert(), in
+ row_ins_index_entry_low() before
+ btr_store_big_rec_extern_fields()
+ has written the externally stored columns
+ (BLOBs) of the new clustered index entry. */
+
+ /* The table must be in DYNAMIC or COMPRESSED
+ format. REDUNDANT and COMPACT formats
+ store a local 768-byte prefix of each
+ externally stored column. */
+ ut_a(dict_table_get_format(index->table)
+ >= DICT_TF_FORMAT_ZIP);
+
+ /* This is only legitimate when
+ rolling back an incomplete transaction
+ after crash recovery. */
+ ut_a(thr_get_trx(thr)->is_recovered);
+
+ /* The server must have crashed before
+ completing the insert of the new
+ clustered index entry and before
+ inserting to the secondary indexes.
+ Because node->row was not yet written
+ to this index, we can ignore it. But
+ we must restore node->undo_row. */
+ } else {
+ /* NOTE that if we updated the fields of a
+ delete-marked secondary index record so that
+ alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc', we cannot return to the
+ original values because we do not know them.
+ But this should not cause problems because
+ in row0sel.c, in queries we always retrieve
+ the clustered index record or an earlier
+ version of it, if the secondary index record
+ through which we do the search is
+ delete-marked. */
+
+ err = row_undo_mod_del_mark_or_remove_sec(
+ node, thr, index, entry);
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+
+ mem_heap_empty(heap);
+ }
+
+ /* We may have to update the delete mark in the
+ secondary index record of the previous version of
+ the row. We also need to update the fields of
+ the secondary index record if we updated its fields
+ but alphabetically they stayed the same, e.g.,
+ 'abc' -> 'aBc'. */
+ entry = row_build_index_entry(node->undo_row,
+ node->undo_ext,
+ index, heap);
+ ut_a(entry);
+
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_LEAF, thr, index, entry);
+ if (err == DB_FAIL) {
+ err = row_undo_mod_del_unmark_sec_and_undo_update(
+ BTR_MODIFY_TREE, thr, index, entry);
+ }
+
+ if (err != DB_SUCCESS) {
+ mem_heap_free(heap);
+
+ return(err);
+ }
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+ mem_heap_free(heap);
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* clust_index;
+ byte* ptr;
+ undo_no_t undo_no;
+ dulint table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ ulint info_bits;
+ ulint type;
+ ulint cmpl_info;
+ ibool dummy_extern;
+ trx_t* trx;
+
+ ut_ad(node && thr);
+ trx = thr_get_trx(thr);
+ ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+ node->rec_type = type;
+
+ node->table = dict_table_get_on_id(table_id, trx);
+
+ /* TODO: other fixes associated with DROP TABLE + rollback in the
+ same table by another user */
+
+ if (node->table == NULL) {
+ /* Table was dropped */
+ return;
+ }
+
+ if (node->table->ibd_file_missing) {
+ /* We skip undo operations to missing .ibd files */
+ node->table = NULL;
+
+ return;
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+ node->heap);
+
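+	/* Build the update vector which, when applied, restores the
+	previous version of the clustered index record */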
+ trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+ roll_ptr, info_bits, trx,
+ node->heap, &(node->update));
+ node->new_roll_ptr = roll_ptr;
+ node->new_trx_id = trx_id;
+ node->cmpl_info = cmpl_info;
+}
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_undo_mod(
+/*=========*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+
+ ut_ad(node && thr);
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+
+ row_undo_mod_parse_undo_rec(node, thr);
+
+ if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+ /* It is already undone, or will be undone by another query
+ thread, or table was dropped */
+
+ trx_undo_rec_release(node->trx, node->undo_no);
+ node->state = UNDO_NODE_FETCH_NEXT;
+
+ return(DB_SUCCESS);
+ }
+
+ node->index = dict_table_get_next_index(
+ dict_table_get_first_index(node->table));
+
+ if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+ err = row_undo_mod_upd_exist_sec(node, thr);
+
+ } else if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+ err = row_undo_mod_del_mark_sec(node, thr);
+ } else {
+ ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+ err = row_undo_mod_upd_del_sec(node, thr);
+ }
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ err = row_undo_mod_clust(node, thr);
+
+ return(err);
+}
diff --git a/storage/xtradb/row/row0undo.c b/storage/xtradb/row/row0undo.c
new file mode 100644
index 00000000000..fd28a4f6520
--- /dev/null
+++ b/storage/xtradb/row/row0undo.c
@@ -0,0 +1,393 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.c
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row?
+Then the row id changes and so does the roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that row id is in ascending order.
+So, let's store the row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it updates neither the secondary index nor the clustered index
+ord field, then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
+
+/********************************************************************//**
+Creates a row undo node to a query graph.
+@return own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t* parent, /*!< in: parent node, i.e., a thr node */
+ mem_heap_t* heap) /*!< in: memory heap where created */
+{
+ undo_node_t* undo;
+
+ ut_ad(trx && parent && heap);
+
+ undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+
+ undo->common.type = QUE_NODE_UNDO;
+ undo->common.parent = parent;
+
+ undo->state = UNDO_NODE_FETCH_NEXT;
+ undo->trx = trx;
+
+ btr_pcur_init(&(undo->pcur));
+
+ undo->heap = mem_heap_create(256);
+
+ return(undo);
+}
+
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+ undo_node_t* node) /*!< in: row undo node */
+{
+ dict_index_t* clust_index;
+ ibool found;
+ mtr_t mtr;
+ ibool ret;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ mtr_start(&mtr);
+
+ clust_index = dict_table_get_first_index(node->table);
+
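+	/* Position the persistent cursor on the clustered index record
+	using the row reference stored in the undo record */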
+ found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+ node->table, node->ref, &mtr);
+
+ rec = btr_pcur_get_rec(&(node->pcur));
+
+ offsets = rec_get_offsets(rec, clust_index, offsets,
+ ULINT_UNDEFINED, &heap);
+
+ if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+ row_get_rec_roll_ptr(rec, clust_index,
+ offsets))) {
+
+ /* We must remove the reservation on the undo log record
+ BEFORE releasing the latch on the clustered index page: this
+ is to make sure that some thread will eventually undo the
+ modification corresponding to node->roll_ptr. */
+
+ /* fputs("--------------------undoing a previous version\n",
+ stderr); */
+
+ ret = FALSE;
+ } else {
+ row_ext_t** ext;
+
+ if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+ /* In DYNAMIC or COMPRESSED format, there is
+ no prefix of externally stored columns in the
+ clustered index record. Build a cache of
+ column prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored
+ column. No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+ offsets, NULL, ext, node->heap);
+ if (node->update) {
+ node->undo_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->undo_row, &node->undo_ext,
+ clust_index, node->update, node->heap);
+ } else {
+ node->undo_row = NULL;
+ node->undo_ext = NULL;
+ }
+
+ btr_pcur_store_position(&(node->pcur), &mtr);
+
+ ret = TRUE;
+ }
+
+ btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(ret);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_undo(
+/*=====*/
+ undo_node_t* node, /*!< in: row undo node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+ trx_t* trx;
+ roll_ptr_t roll_ptr;
+ ibool locked_data_dict;
+
+ ut_ad(node && thr);
+
+ trx = node->trx;
+
+ if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+ node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+ trx->roll_limit,
+ &roll_ptr,
+ node->heap);
+ if (!node->undo_rec) {
+ /* Rollback completed for this query thread */
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(DB_SUCCESS);
+ }
+
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
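+		/* The roll ptr tells whether the record to undo was a
+		fresh insert or a modification */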
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+
+ } else if (node->state == UNDO_NODE_PREV_VERS) {
+
+ /* Undo should be done to the same clustered index record
+ again in this same rollback, restoring the previous version */
+
+ roll_ptr = node->new_roll_ptr;
+
+ node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+ node->heap);
+ node->roll_ptr = roll_ptr;
+ node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ node->state = UNDO_NODE_INSERT;
+ } else {
+ node->state = UNDO_NODE_MODIFY;
+ }
+ }
+
+ /* Prevent DROP TABLE etc. while we are rolling back this row.
+ If we are doing a TABLE CREATE or some other dictionary operation,
+ then we already have dict_operation_lock locked in x-mode. Do not
+ try to lock again, because that would cause a hang. */
+
+ locked_data_dict = (trx->dict_operation_lock_mode == 0);
+
+ if (locked_data_dict) {
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ if (node->state == UNDO_NODE_INSERT) {
+
+ err = row_undo_ins(node);
+
+ node->state = UNDO_NODE_FETCH_NEXT;
+ } else {
+ ut_ad(node->state == UNDO_NODE_MODIFY);
+ err = row_undo_mod(node, thr);
+ }
+
+ if (locked_data_dict) {
+
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ /* Do some cleanup */
+ btr_pcur_close(&(node->pcur));
+
+ mem_heap_empty(node->heap);
+
+ thr->run_node = node;
+
+ return(err);
+}
+
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err;
+ undo_node_t* node;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ srv_activity_count++;
+
+ trx = thr_get_trx(thr);
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+ err = row_undo(node, thr);
+
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ /* SQL error detected */
+
+ fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
+ (ulong) err);
+
+ if (err == DB_OUT_OF_FILE_SPACE) {
+ fprintf(stderr,
+ "InnoDB: Error 13 means out of tablespace.\n"
+ "InnoDB: Consider increasing"
+ " your tablespace.\n");
+
+ exit(1);
+ }
+
+ ut_error;
+
+ return(NULL);
+ }
+
+ return(thr);
+}
diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.c
new file mode 100644
index 00000000000..04c3139fcc7
--- /dev/null
+++ b/storage/xtradb/row/row0upd.c
@@ -0,0 +1,2203 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.c
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+ -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+ Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged, the index records will be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log beforehand for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module, make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: old value of index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n); /*!< in: how many first fields to check */
+
+
+/*********************************************************************//**
+Checks if index currently is mentioned as a referenced index in a foreign
+key constraint.
+
+NOTE that since we do not hold dict_operation_lock when leaving the
+function, it may be that the referencing table has been dropped when
+we leave this function: this function is only for heuristic use!
+
+@return TRUE if referenced */
+static
+ibool
+row_upd_index_is_referenced(
+/*========================*/
+ dict_index_t* index, /*!< in: index */
+ trx_t* trx) /*!< in: transaction */
+{
+ dict_table_t* table = index->table;
+ dict_foreign_t* foreign;
+ ibool froze_data_dict = FALSE;
+ ibool is_referenced = FALSE;
+
+ if (!UT_LIST_GET_FIRST(table->referenced_list)) {
+
+ return(FALSE);
+ }
+
+ if (trx->dict_operation_lock_mode == 0) {
+ row_mysql_freeze_data_dictionary(trx);
+ froze_data_dict = TRUE;
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ if (foreign->referenced_index == index) {
+
+ is_referenced = TRUE;
+ goto func_exit;
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+func_exit:
+ if (froze_data_dict) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ return(is_referenced);
+}
+
+/*********************************************************************//**
+Checks if possible foreign key constraints hold after a delete of the record
+under pcur.
+
+NOTE that this function will temporarily commit mtr and lose the
+pcur position!
+
+@return DB_SUCCESS or an error code */
+static
+ulint
+row_upd_check_references_constraints(
+/*=================================*/
+ upd_node_t* node, /*!< in: row update node */
+ btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the
+ cursor position is lost in this function! */
+ dict_table_t* table, /*!< in: table in question */
+ dict_index_t* index, /*!< in: index of the cursor */
+ ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_foreign_t* foreign;
+ mem_heap_t* heap;
+ dtuple_t* entry;
+ trx_t* trx;
+ const rec_t* rec;
+ ulint n_ext;
+ ulint err;
+ ibool got_s_lock = FALSE;
+
+ if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx = thr_get_trx(thr);
+
+ rec = btr_pcur_get_rec(pcur);
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ heap = mem_heap_create(500);
+
+ entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+ &n_ext, heap);
+
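+	/* Commit the mtr to release page latches: the foreign key
+	check below may wait for locks, and the cursor position on
+	pcur is lost here, as noted in the function comment */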
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ if (trx->dict_operation_lock_mode == 0) {
+ got_s_lock = TRUE;
+
+ row_mysql_freeze_data_dictionary(trx);
+ }
+
+ foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+ while (foreign) {
+ /* Note that we may have an update which updates the index
+ record, but does NOT update the first fields which are
+ referenced in a foreign key constraint. Then the update does
+ NOT break the constraint. */
+
+ if (foreign->referenced_index == index
+ && (node->is_delete
+ || row_upd_changes_first_fields_binary(
+ entry, index, node->update,
+ foreign->n_fields))) {
+
+ if (foreign->foreign_table == NULL) {
+ dict_table_get(foreign->foreign_table_name,
+ FALSE);
+ }
+
+ if (foreign->foreign_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ (foreign->foreign_table
+ ->n_foreign_key_checks_running)++;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ /* NOTE that if the thread ends up waiting for a lock
+ we will release dict_operation_lock temporarily!
+ But the counter on the table protects 'foreign' from
+ being dropped while the check is running. */
+
+ err = row_ins_check_foreign_constraint(
+ FALSE, foreign, table, entry, thr);
+
+ if (foreign->foreign_table) {
+ mutex_enter(&(dict_sys->mutex));
+
+ ut_a(foreign->foreign_table
+ ->n_foreign_key_checks_running > 0);
+
+ (foreign->foreign_table
+ ->n_foreign_key_checks_running)--;
+
+ mutex_exit(&(dict_sys->mutex));
+ }
+
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+ }
+
+ foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+ }
+
+ err = DB_SUCCESS;
+
+func_exit:
+ if (got_s_lock) {
+ row_mysql_unfreeze_data_dictionary(trx);
+ }
+
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ upd_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(upd_node_t));
+ node->common.type = QUE_NODE_UPDATE;
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ node->in_mysql_interface = FALSE;
+
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ node->index = NULL;
+ node->update = NULL;
+
+ node->foreign = NULL;
+ node->cascade_heap = NULL;
+ node->cascade_node = NULL;
+
+ node->select = NULL;
+
+ node->heap = mem_heap_create(128);
+ node->magic_n = UPD_NODE_MAGIC_N;
+
+ node->cmpl_info = 0;
+
+ return(node);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+ rec_t* rec, /*!< in/out: record */
+ page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ ulint pos, /*!< in: TRX_ID position in rec */
+ trx_id_t trx_id, /*!< in: transaction id */
+ roll_ptr_t roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_trx_id_and_roll_ptr(
+ page_zip, rec, offsets, pos, trx_id, roll_ptr);
+ } else {
+ byte* field;
+ ulint len;
+
+ field = rec_get_nth_field(rec, offsets, pos, &len);
+ ut_ad(len == DATA_TRX_ID_LEN);
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+ trx_write_trx_id(field, trx_id);
+ trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+ const dtuple_t* entry, /*!< in: index entry, where the memory buffers
+ for sys fields are already allocated:
+ the function just copies the new values to
+ them */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+ dulint val) /*!< in: value to write */
+{
+ dfield_t* dfield;
+ byte* field;
+ ulint pos;
+
+ ut_ad(dict_index_is_clust(index));
+
+ pos = dict_index_get_sys_col_pos(index, type);
+
+ dfield = dtuple_get_nth_field(entry, pos);
+ field = dfield_get_data(dfield);
+
+ if (type == DATA_TRX_ID) {
+ trx_write_trx_id(field, val);
+ } else {
+ ut_ad(type == DATA_ROLL_PTR);
+ trx_write_roll_ptr(field, val);
+ }
+}
+
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+ dict_index_t* index, /*!< in: index */
+ const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update) /*!< in: update vector */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint old_len;
+ ulint new_len;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(NULL, index, offsets));
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+ new_len = dfield_get_len(new_val);
+
+ if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
+ /* A bug fixed on Dec 31st, 2004: we looked at the
+ SQL NULL size from the wrong field! We may backport
+ this fix also to 4.0. The merge to 5.0 will be made
+ manually immediately after we commit this to 4.1. */
+
+ new_len = dict_col_get_sql_null_size(
+ dict_index_get_nth_col(index,
+ upd_field->field_no),
+ 0);
+ }
+
+ old_len = rec_offs_nth_size(offsets, upd_field->field_no);
+
+ if (rec_offs_comp(offsets)
+ && rec_offs_nth_sql_null(offsets,
+ upd_field->field_no)) {
+ /* Note that in the compact table format, for a
+ variable length field, an SQL NULL will use zero
+ bytes in the offset array at the start of the physical
+ record, but a zero-length value (empty string) will
+ use one byte! Thus, we cannot use update-in-place
+ if we update an SQL NULL varchar to an empty string! */
+
+ old_len = UNIV_SQL_NULL;
+ }
+
+ if (dfield_is_ext(new_val) || old_len != new_len
+ || rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the record
+given. No field size changes are allowed. */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+ rec_t* rec, /*!< in/out: record where replaced */
+ dict_index_t* index, /*!< in: the index the record belongs to */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ const upd_t* update, /*!< in: update vector */
+ page_zip_des_t* page_zip)/*!< in: compressed page with enough space
+ available, or NULL */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint n_fields;
+ ulint i;
+
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (rec_offs_comp(offsets)) {
+ rec_set_info_bits_new(rec, update->info_bits);
+ } else {
+ rec_set_info_bits_old(rec, update->info_bits);
+ }
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+ ut_ad(!dfield_is_ext(new_val) ==
+ !rec_offs_nth_extern(offsets, upd_field->field_no));
+
+ rec_set_nth_field(rec, offsets, upd_field->field_no,
+ dfield_get_data(new_val),
+ dfield_get_len(new_val));
+ }
+
+ if (UNIV_LIKELY_NULL(page_zip)) {
+ page_zip_write_rec(page_zip, rec, index, offsets, 0);
+ }
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+ dict_index_t* index, /*!< in: clustered index */
+ trx_t* trx, /*!< in: transaction */
+ roll_ptr_t roll_ptr,/*!< in: roll ptr of the undo log record */
+ byte* log_ptr,/*!< pointer to a buffer of size > 20 opened
+ in mlog */
+ mtr_t* mtr __attribute__((unused))) /*!< in: mtr */
+{
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr);
+
+ log_ptr += mach_write_compressed(log_ptr,
+ dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID));
+
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+
+ log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+ return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data of system field values.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ ulint* pos, /*!< out: TRX_ID position in record */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr)/*!< out: roll ptr */
+{
+ ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+ return(NULL);
+ }
+
+ *roll_ptr = trx_read_roll_ptr(ptr);
+ ptr += DATA_ROLL_PTR_LEN;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id);
+
+ return(ptr);
+}
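[Editorial note, not part of the patch] A sketch of how a recovery-time caller might consume the parsed system-column values. The helper name and surrounding handling are hypothetical; only the row_upd_parse_sys_vals() call and its truncation contract (NULL when the log data is incomplete) come from the function above.

#include "row0upd.h"

/* Hypothetical replay helper (illustration only). */
static byte*
replay_sys_vals(
	byte*	ptr,	/* in: log record body */
	byte*	end_ptr)/* in: end of the log record body */
{
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {
		/* The log record is truncated: the caller must retry
		after more log has been read. */
		return(NULL);
	}

	/* Here the caller would write trx_id and roll_ptr into the
	record, starting at field position pos (DATA_TRX_ID). */

	return(ptr);
}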
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+ const upd_t* update, /*!< in: update vector */
+ byte* log_ptr,/*!< in: pointer to mlog buffer: must
+ contain at least MLOG_BUF_MARGIN bytes
+ of free space; the buffer is closed
+ within this function */
+ mtr_t* mtr) /*!< in: mtr into whose log to write */
+{
+ const upd_field_t* upd_field;
+ const dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ byte* buf_end;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+ mach_write_to_1(log_ptr, update->info_bits);
+ log_ptr++;
+ log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+ for (i = 0; i < n_fields; i++) {
+
+#if MLOG_BUF_MARGIN <= 30
+# error "MLOG_BUF_MARGIN <= 30"
+#endif
+
+ if (log_ptr + 30 > buf_end) {
+ mlog_close(mtr, log_ptr);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ new_val = &(upd_field->new_val);
+
+ len = dfield_get_len(new_val);
+
+ log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+ log_ptr += mach_write_compressed(log_ptr, len);
+
+ if (len != UNIV_SQL_NULL) {
+ if (log_ptr + len < buf_end) {
+ memcpy(log_ptr, dfield_get_data(new_val), len);
+
+ log_ptr += len;
+ } else {
+ mlog_close(mtr, log_ptr);
+
+ mlog_catenate_string(mtr,
+ dfield_get_data(new_val),
+ len);
+
+ log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+ buf_end = log_ptr + MLOG_BUF_MARGIN;
+ }
+ }
+ }
+
+ mlog_close(mtr, log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ mem_heap_t* heap, /*!< in: memory heap where update vector is
+ built */
+ upd_t** update_out)/*!< out: update vector */
+{
+ upd_t* update;
+ upd_field_t* upd_field;
+ dfield_t* new_val;
+ ulint len;
+ ulint n_fields;
+ ulint info_bits;
+ ulint i;
+
+ if (end_ptr < ptr + 1) {
+
+ return(NULL);
+ }
+
+ info_bits = mach_read_from_1(ptr);
+ ptr++;
+ ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ update = upd_create(n_fields, heap);
+ update->info_bits = info_bits;
+
+ for (i = 0; i < n_fields; i++) {
+ ulint field_no;
+ upd_field = upd_get_nth_field(update, i);
+ new_val = &(upd_field->new_val);
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &field_no);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ upd_field->field_no = field_no;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (len != UNIV_SQL_NULL) {
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ dfield_set_data(new_val,
+ mem_heap_dup(heap, ptr, len), len);
+ ptr += len;
+ } else {
+ dfield_set_null(new_val);
+ }
+ }
+
+ *update_out = update;
+
+ return(ptr);
+}
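[Editorial note, not part of the patch] The parser pairs naturally with row_upd_rec_in_place() above: the update vector rebuilt from the log is applied without any field size changes. A hypothetical redo-apply sketch, assuming the caller already holds the page latch and has computed offsets for the record.

#include "row0upd.h"

/* Hypothetical replay helper (illustration only). */
static byte*
replay_update_in_place(
	byte*		ptr,	/* in: log record body */
	byte*		end_ptr,/* in: end of the log record body */
	rec_t*		rec,	/* in/out: record to update, or NULL if
				the page is not being applied */
	dict_index_t*	index,	/* in: index of the record */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip)/* in: compressed page, or NULL */
{
	mem_heap_t*	heap	= mem_heap_create(256);
	upd_t*		update;

	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

	if (ptr != NULL && rec != NULL) {
		/* The log record was written for an in-place update,
		so no field size changes are expected here. */
		row_upd_rec_in_place(rec, index, offsets, update, page_zip);
	}

	mem_heap_free(heap);

	return(ptr);
}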
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+ dict_index_t* index, /*!< in: index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: secondary index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint i;
+ ulint offsets_[REC_OFFS_SMALL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a secondary index */
+ ut_a(!dict_index_is_clust(index));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE that it may be that len != dfield_get_len(dfield) if we
+ are updating in a character set and collation where strings of
+ different length can be equal in an alphabetical comparison,
+ and also in the case where we have a column prefix index
+ and the last characters in the index field are spaces; the
+ latter case probably caused the assertion failures reported at
+ row0upd.c line 713 in versions 4.0.14 - 4.0.16. */
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (!dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ n_diff++;
+ }
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
+
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* entry, /*!< in: entry to insert */
+ const rec_t* rec, /*!< in: clustered index record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap) /*!< in: memory heap from which allocated */
+{
+ upd_field_t* upd_field;
+ const dfield_t* dfield;
+ const byte* data;
+ ulint len;
+ upd_t* update;
+ ulint n_diff;
+ ulint roll_ptr_pos;
+ ulint trx_id_pos;
+ ulint i;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ /* This function is used only for a clustered index */
+ ut_a(dict_index_is_clust(index));
+
+ update = upd_create(dtuple_get_n_fields(entry), heap);
+
+ n_diff = 0;
+
+ roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+ trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ for (i = 0; i < dtuple_get_n_fields(entry); i++) {
+
+ data = rec_get_nth_field(rec, offsets, i, &len);
+
+ dfield = dtuple_get_nth_field(entry, i);
+
+ /* NOTE: we compare the fields as binary strings!
+ (No collation) */
+
+ if (i == trx_id_pos || i == roll_ptr_pos) {
+
+ goto skip_compare;
+ }
+
+ if (UNIV_UNLIKELY(!dfield_is_ext(dfield)
+ != !rec_offs_nth_extern(offsets, i))
+ || !dfield_data_is_binary_equal(dfield, len, data)) {
+
+ upd_field = upd_get_nth_field(update, n_diff);
+
+ dfield_copy(&(upd_field->new_val), dfield);
+
+ upd_field_set_field_no(upd_field, i, index, trx);
+
+ n_diff++;
+ }
+skip_compare:
+ ;
+ }
+
+ update->n_fields = n_diff;
+
+ return(update);
+}
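[Editorial note, not part of the patch] A sketch of the typical calling pattern: build the difference vector between a new clustered index entry and the existing record, then skip the update entirely when nothing changed. The wrapper name is hypothetical; the call matches the signature above.

#include "row0upd.h"

/* Hypothetical helper (illustration only). */
static upd_t*
build_clust_update(
	dict_index_t*	clust_index,	/* in: clustered index */
	const dtuple_t*	entry,		/* in: new clustered index entry */
	const rec_t*	rec,		/* in: existing clustered index
					record */
	trx_t*		trx,		/* in: transaction */
	mem_heap_t*	heap)		/* in: memory heap */
{
	upd_t*	update = row_upd_build_difference_binary(
		clust_index, entry, rec, trx, heap);

	if (upd_get_n_fields(update) == 0) {
		/* The entry and the record are binary-identical apart
		from DB_TRX_ID and DB_ROLL_PTR: nothing to update. */
		return(NULL);
	}

	return(update);
}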
+
+/***********************************************************//**
+Fetch a prefix of an externally stored column. This is similar
+to row_ext_lookup(), but the row_ext_t holds the old values
+of the column and must not be poisoned with the new values.
+@return BLOB prefix */
+static
+byte*
+row_upd_ext_fetch(
+/*==============*/
+ const byte* data, /*!< in: 'internally' stored part of the
+ field containing also the reference to
+ the external part */
+ ulint local_len, /*!< in: length of data, in bytes */
+ ulint zip_size, /*!< in: nonzero=compressed BLOB
+ page size, zero for uncompressed
+ BLOBs */
+ ulint* len, /*!< in: length of prefix to fetch;
+ out: fetched length of the prefix */
+ mem_heap_t* heap) /*!< in: heap where to allocate */
+{
+ byte* buf = mem_heap_alloc(heap, *len);
+
+ *len = btr_copy_externally_stored_field_prefix(buf, *len,
+ zip_size,
+ data, local_len);
+ /* We should never update records containing a half-deleted BLOB. */
+ ut_a(*len);
+
+ return(buf);
+}
+
+/***********************************************************//**
+Replaces the new column value stored in the update vector in
+the given index entry field. */
+static
+void
+row_upd_index_replace_new_col_val(
+/*==============================*/
+ dfield_t* dfield, /*!< in/out: data field
+ of the index entry */
+ const dict_field_t* field, /*!< in: index field */
+ const dict_col_t* col, /*!< in: field->col */
+ const upd_field_t* uf, /*!< in: update field */
+ mem_heap_t* heap, /*!< in: memory heap for allocating
+ and copying the new value */
+ ulint zip_size)/*!< in: compressed page
+ size of the table, or 0 */
+{
+ ulint len;
+ const byte* data;
+
+ dfield_copy_data(dfield, &uf->new_val);
+
+ if (dfield_is_null(dfield)) {
+ return;
+ }
+
+ len = dfield_get_len(dfield);
+ data = dfield_get_data(dfield);
+
+ if (field->prefix_len > 0) {
+ ibool fetch_ext = dfield_is_ext(dfield)
+ && len < (ulint) field->prefix_len
+ + BTR_EXTERN_FIELD_REF_SIZE;
+
+ if (fetch_ext) {
+ ulint l = len;
+
+ len = field->prefix_len;
+
+ data = row_upd_ext_fetch(data, l, zip_size,
+ &len, heap);
+ }
+
+ len = dtype_get_at_most_n_mbchars(col->prtype,
+ col->mbminlen, col->mbmaxlen,
+ field->prefix_len, len,
+ (const char*) data);
+
+ dfield_set_data(dfield, data, len);
+
+ if (!fetch_ext) {
+ dfield_dup(dfield, heap);
+ }
+
+ return;
+ }
+
+ switch (uf->orig_len) {
+ byte* buf;
+ case BTR_EXTERN_FIELD_REF_SIZE:
+ /* Restore the original locally stored
+ part of the column. In the undo log,
+ InnoDB writes a longer prefix of externally
+ stored columns, so that column prefixes
+ in secondary indexes can be reconstructed. */
+ dfield_set_data(dfield,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ dfield_set_ext(dfield);
+ /* fall through */
+ case 0:
+ dfield_dup(dfield, heap);
+ break;
+ default:
+ /* Reconstruct the original locally
+ stored part of the column. The data
+ will have to be copied. */
+ ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+ buf = mem_heap_alloc(heap, uf->orig_len);
+ /* Copy the locally stored prefix. */
+ memcpy(buf, data,
+ uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+ /* Copy the BLOB pointer. */
+ memcpy(buf + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+ data + len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+
+ dfield_set_data(dfield, buf, uf->orig_len);
+ dfield_set_ext(dfield);
+ break;
+ }
+}
+
+/***********************************************************//**
+Writes the new column values stored in the update vector into the given
+index entry. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the index so
+ that the field number in an upd_field is the
+ index position */
+ ibool order_only,
+ /*!< in: if TRUE, limit the replacement to
+ ordering fields of index; note that this
+ does not work for non-clustered indexes. */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+{
+ ulint i;
+ ulint n_fields;
+ const ulint zip_size = dict_table_zip_size(index->table);
+
+ ut_ad(index);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ if (order_only) {
+ n_fields = dict_index_get_n_unique(index);
+ } else {
+ n_fields = dict_index_get_n_fields(index);
+ }
+
+ for (i = 0; i < n_fields; i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ uf = upd_get_field_by_field_no(update, i);
+
+ if (uf) {
+ row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size);
+ }
+ }
+}
+
+/***********************************************************//**
+Writes the new column values stored in the update vector into the given
+index entry. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+ dtuple_t* entry, /*!< in/out: index entry where replaced;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ dict_index_t* index, /*!< in: index; NOTE that this may also be a
+ non-clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ CLUSTERED index so that the field number in
+ an upd_field is the clustered index position */
+ mem_heap_t* heap) /*!< in: memory heap for allocating and
+ copying the new values */
+{
+ ulint i;
+ const dict_index_t* clust_index
+ = dict_table_get_first_index(index->table);
+ const ulint zip_size
+ = dict_table_zip_size(index->table);
+
+ dtuple_set_info_bits(entry, update->info_bits);
+
+ for (i = 0; i < dict_index_get_n_fields(index); i++) {
+ const dict_field_t* field;
+ const dict_col_t* col;
+ const upd_field_t* uf;
+
+ field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(field);
+ uf = upd_get_field_by_field_no(
+ update, dict_col_get_clust_pos(col, clust_index));
+
+ if (uf) {
+ row_upd_index_replace_new_col_val(
+ dtuple_get_nth_field(entry, i),
+ field, col, uf, heap, zip_size);
+ }
+ }
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+ dtuple_t* row, /*!< in/out: row where replaced,
+ indexed by col_no;
+ the clustered index record must be
+ covered by a lock or a page latch to
+ prevent deletion (rollback or purge) */
+ row_ext_t** ext, /*!< out, own: NULL, or externally
+ stored column prefixes */
+ const dict_index_t* index, /*!< in: clustered index */
+ const upd_t* update, /*!< in: an update vector built for the
+ clustered index */
+ mem_heap_t* heap) /*!< in: memory heap */
+{
+ ulint col_no;
+ ulint i;
+ ulint n_cols;
+ ulint n_ext_cols;
+ ulint* ext_cols;
+ const dict_table_t* table;
+
+ ut_ad(row);
+ ut_ad(ext);
+ ut_ad(index);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(update);
+ ut_ad(heap);
+
+ n_cols = dtuple_get_n_fields(row);
+ table = index->table;
+ ut_ad(n_cols == dict_table_get_n_cols(table));
+
+ ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols);
+ n_ext_cols = 0;
+
+ dtuple_set_info_bits(row, update->info_bits);
+
+ for (col_no = 0; col_no < n_cols; col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+ const ulint clust_pos
+ = dict_col_get_clust_pos(col, index);
+ dfield_t* dfield;
+
+ if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) {
+
+ continue;
+ }
+
+ dfield = dtuple_get_nth_field(row, col_no);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, i);
+
+ if (upd_field->field_no != clust_pos) {
+
+ continue;
+ }
+
+ dfield_copy_data(dfield, &upd_field->new_val);
+ break;
+ }
+
+ if (dfield_is_ext(dfield) && col->ord_part) {
+ ext_cols[n_ext_cols++] = col_no;
+ }
+ }
+
+ if (n_ext_cols) {
+ *ext = row_ext_create(n_ext_cols, ext_cols, row,
+ dict_table_zip_size(table), heap);
+ } else {
+ *ext = NULL;
+ }
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+ const dtuple_t* row, /*!< in: old value of row, or NULL if the
+ row and the data values in update are not
+ known when this function is called, e.g., at
+ compile time */
+ dict_index_t* index, /*!< in: index of the record */
+ const upd_t* update) /*!< in: update vector for the row; NOTE: the
+ field numbers in this MUST be clustered index
+ positions! */
+{
+ ulint n_unique;
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+
+ n_unique = dict_index_get_n_unique(index);
+ n_upd_fields = upd_get_n_fields(update);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n_unique; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+ ulint col_no;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+ col_no = dict_col_get_no(col);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ const upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ /* Note that if the index field is a column prefix
+ then it may be that row does not contain an externally
+ stored part of the column value, and we cannot compare
+ the data values */
+
+ if (col_pos == upd_field->field_no
+ && (row == NULL
+ || ind_field->prefix_len > 0
+ || !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(row, col_no),
+ &(upd_field->new_val)))) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+ const dict_table_t* table, /*!< in: table */
+ const upd_t* update) /*!< in: update vector for the row */
+{
+ upd_field_t* upd_field;
+ dict_index_t* index;
+ ulint i;
+
+ index = dict_table_get_first_index(table);
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ upd_field = upd_get_nth_field(update, i);
+
+ if (dict_field_get_col(dict_index_get_nth_field(
+ index, upd_field->field_no))
+ ->ord_part) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+ dtuple_t* entry, /*!< in: index entry */
+ dict_index_t* index, /*!< in: index of entry */
+ const upd_t* update, /*!< in: update vector for the row */
+ ulint n) /*!< in: how many first fields to check */
+{
+ ulint n_upd_fields;
+ ulint i, j;
+ dict_index_t* clust_index;
+
+ ut_ad(update && index);
+ ut_ad(n <= dict_index_get_n_fields(index));
+
+ n_upd_fields = upd_get_n_fields(update);
+ clust_index = dict_table_get_first_index(index->table);
+
+ for (i = 0; i < n; i++) {
+
+ const dict_field_t* ind_field;
+ const dict_col_t* col;
+ ulint col_pos;
+
+ ind_field = dict_index_get_nth_field(index, i);
+ col = dict_field_get_col(ind_field);
+ col_pos = dict_col_get_clust_pos(col, clust_index);
+
+ ut_a(ind_field->prefix_len == 0);
+
+ for (j = 0; j < n_upd_fields; j++) {
+
+ upd_field_t* upd_field
+ = upd_get_nth_field(update, j);
+
+ if (col_pos == upd_field->field_no
+ && !dfield_datas_are_binary_equal(
+ dtuple_get_nth_field(entry, i),
+ &(upd_field->new_val))) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/*********************************************************************//**
+Copies the column values from a record. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+ rec_t* rec, /*!< in: record in a clustered index */
+ const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
+ sym_node_t* column) /*!< in: first column in a column list, or
+ NULL */
+{
+ byte* data;
+ ulint len;
+
+ while (column) {
+ data = rec_get_nth_field(rec, offsets,
+ column->field_nos[SYM_CLUST_FIELD_NO],
+ &len);
+ eval_node_copy_and_alloc_val(column, data, len);
+
+ column = UT_LIST_GET_NEXT(col_var_list, column);
+ }
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+ upd_t* update) /*!< in/out: update vector */
+{
+ que_node_t* exp;
+ upd_field_t* upd_field;
+ ulint n_fields;
+ ulint i;
+
+ n_fields = upd_get_n_fields(update);
+
+ for (i = 0; i < n_fields; i++) {
+ upd_field = upd_get_nth_field(update, i);
+
+ exp = upd_field->exp;
+
+ eval_exp(exp);
+
+ dfield_copy_data(&(upd_field->new_val), que_node_get_val(exp));
+ }
+}
+
+/***********************************************************//**
+Stores to the heap the row on which the node->pcur is positioned. */
+static
+void
+row_upd_store_row(
+/*==============*/
+ upd_node_t* node) /*!< in: row update node */
+{
+ dict_index_t* clust_index;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ row_ext_t** ext;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ const ulint* offsets;
+ rec_offs_init(offsets_);
+
+ ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+ if (node->row != NULL) {
+ mem_heap_empty(node->heap);
+ }
+
+ clust_index = dict_table_get_first_index(node->table);
+
+ rec = btr_pcur_get_rec(node->pcur);
+
+ offsets = rec_get_offsets(rec, clust_index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+ /* In DYNAMIC or COMPRESSED format, there is no prefix
+ of externally stored columns in the clustered index
+ record. Build a cache of column prefixes. */
+ ext = &node->ext;
+ } else {
+ /* REDUNDANT and COMPACT formats store a local
+ 768-byte prefix of each externally stored column.
+ No cache is needed. */
+ ext = NULL;
+ node->ext = NULL;
+ }
+
+ node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+ NULL, ext, node->heap);
+ if (node->is_delete) {
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ } else {
+ node->upd_row = dtuple_copy(node->row, node->heap);
+ row_upd_replace(node->upd_row, &node->upd_ext,
+ clust_index, node->update, node->heap);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+}
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ibool check_ref;
+ ibool found;
+ dict_index_t* index;
+ dtuple_t* entry;
+ btr_pcur_t pcur;
+ btr_cur_t* btr_cur;
+ mem_heap_t* heap;
+ rec_t* rec;
+ ulint err = DB_SUCCESS;
+ mtr_t mtr;
+ trx_t* trx = thr_get_trx(thr);
+
+ index = node->index;
+
+ check_ref = row_upd_index_is_referenced(index, trx);
+
+ heap = mem_heap_create(1024);
+
+ /* Build old index entry */
+ entry = row_build_index_entry(node->row, node->ext, index, heap);
+ ut_a(entry);
+
+ mtr_start(&mtr);
+
+ found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+ &mtr);
+ btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+ rec = btr_cur_get_rec(btr_cur);
+
+ if (UNIV_UNLIKELY(!found)) {
+ fputs("InnoDB: error in sec index entry update in\n"
+ "InnoDB: ", stderr);
+ dict_index_name_print(stderr, trx, index);
+ fputs("\n"
+ "InnoDB: tuple ", stderr);
+ dtuple_print(stderr, entry);
+ fputs("\n"
+ "InnoDB: record ", stderr);
+ rec_print(stderr, rec, index);
+ putc('\n', stderr);
+
+ trx_print(stderr, trx, 0);
+
+ fputs("\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n", stderr);
+ } else {
+ /* Delete mark the old index record; it can already be
+ delete marked if we return after a lock wait in
+ row_ins_index_entry below */
+
+ if (!rec_get_deleted_flag(rec,
+ dict_table_is_comp(index->table))) {
+ err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
+ thr, &mtr);
+ if (err == DB_SUCCESS && check_ref) {
+
+ ulint* offsets = rec_get_offsets(
+ rec, index, NULL,
+ ULINT_UNDEFINED, &heap);
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, &pcur, index->table,
+ index, offsets, thr, &mtr);
+ }
+ }
+ }
+
+ btr_pcur_close(&pcur);
+ mtr_commit(&mtr);
+
+ if (node->is_delete || err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+
+ /* Build a new index entry */
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
+ /* Insert new index entry */
+ err = row_ins_index_entry(index, entry, 0, TRUE, thr);
+
+func_exit:
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the secondary index record if it is changed in the row update or
+deletes it if this is a delete.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_sec_step(
+/*=============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+ || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+ ut_ad(!dict_index_is_clust(node->index));
+
+ if (node->state == UPD_NODE_UPDATE_ALL_SEC
+ || row_upd_changes_ord_field_binary(node->row, node->index,
+ node->update)) {
+ return(row_upd_sec_index_entry(node, thr));
+ }
+
+ return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index of the record */
+ que_thr_t* thr, /*!< in: query thread */
+ ibool check_ref,/*!< in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ mem_heap_t* heap = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ trx_t* trx;
+ dict_table_t* table;
+ dtuple_t* entry;
+ ulint err;
+ ibool change_ownership = FALSE;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+
+ trx = thr_get_trx(thr);
+ table = node->table;
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+ rec_t* rec;
+ dict_index_t* index;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ return(err);
+ }
+
+ /* Mark as not-owned the externally stored fields which the new
+ row inherits from the delete marked record: purge should not
+ free those externally stored fields even if the delete marked
+ record is removed from the index tree, or updated. */
+
+ rec = btr_cur_get_rec(btr_cur);
+ index = dict_table_get_first_index(table);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+ change_ownership = btr_cur_mark_extern_inherited_fields(
+ btr_cur_get_page_zip(btr_cur), rec, index, offsets,
+ node->update, mtr);
+ if (check_ref) {
+ /* NOTE that the following call loses
+ the position of pcur ! */
+ err = row_upd_check_references_constraints(
+ node, pcur, table, index, offsets, thr, mtr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+ }
+ }
+
+ mtr_commit(mtr);
+
+ if (!heap) {
+ heap = mem_heap_create(500);
+ }
+ node->state = UPD_NODE_INSERT_CLUSTERED;
+
+ entry = row_build_index_entry(node->upd_row, node->upd_ext,
+ index, heap);
+ ut_a(entry);
+
+ row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+ if (change_ownership) {
+ /* If we return from a lock wait, for example, we may have
+ extern fields marked as not-owned in entry (marked in the
+ if-branch above). We must unmark them, take the ownership
+ back. */
+
+ btr_cur_unmark_dtuple_extern_fields(entry);
+
+ /* We must mark non-updated extern fields in entry as
+ inherited, so that a possible rollback will not free them. */
+
+ btr_cur_mark_dtuple_inherited_extern(entry, node->update);
+ }
+
+ err = row_ins_index_entry(index, entry,
+ node->upd_ext ? node->upd_ext->n_ext : 0,
+ TRUE, thr);
+ mem_heap_free(heap);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates a clustered index record of a row when the ordering fields do
+not change.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ que_thr_t* thr, /*!< in: query thread */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ mem_heap_t* heap = NULL;
+ big_rec_t* big_rec = NULL;
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ /* Try optimistic updating of the record, keeping changes within
+ the page; we do not check locks because we assume the x-lock on the
+ record to update */
+
+ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+ err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ } else {
+ err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+ btr_cur, node->update,
+ node->cmpl_info, thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ if (UNIV_LIKELY(err == DB_SUCCESS)) {
+
+ return(DB_SUCCESS);
+ }
+
+ if (buf_LRU_buf_pool_running_out()) {
+
+ return(DB_LOCK_TABLE_FULL);
+ }
+ /* We may have to modify the tree structure: do a pessimistic descent
+ down the index tree */
+
+ mtr_start(mtr);
+
+ /* NOTE: this transaction has an s-lock or x-lock on the record and
+ therefore other transactions cannot modify the record when we have no
+ latch on the page. In addition, we assume that other query threads of
+ the same transaction do not modify the record in the meantime.
+ Therefore we can assert that the restoration of the cursor succeeds. */
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+ ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+ dict_table_is_comp(index->table)));
+
+ err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+ &heap, &big_rec, node->update,
+ node->cmpl_info, thr, mtr);
+ mtr_commit(mtr);
+
+ if (err == DB_SUCCESS && big_rec) {
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ rec_t* rec;
+ rec_offs_init(offsets_);
+
+ mtr_start(mtr);
+
+ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+ rec = btr_cur_get_rec(btr_cur);
+ err = btr_store_big_rec_extern_fields(
+ index, btr_cur_get_block(btr_cur), rec,
+ rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap),
+ big_rec, mtr);
+ mtr_commit(mtr);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (big_rec) {
+ dtuple_big_rec_free(big_rec);
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Delete marks a clustered index record.
+@return DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+ upd_node_t* node, /*!< in: row update node */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in/out: rec_get_offsets() for the
+ record under the cursor */
+ que_thr_t* thr, /*!< in: query thread */
+ ibool check_ref,/*!< in: TRUE if index may be referenced in
+ a foreign key constraint */
+ mtr_t* mtr) /*!< in: mtr; gets committed here */
+{
+ btr_pcur_t* pcur;
+ btr_cur_t* btr_cur;
+ ulint err;
+
+ ut_ad(node);
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(node->is_delete);
+
+ pcur = node->pcur;
+ btr_cur = btr_pcur_get_btr_cur(pcur);
+
+ /* Store row because we have to build also the secondary index
+ entries */
+
+ row_upd_store_row(node);
+
+ /* Mark the clustered index record deleted; we do not have to check
+ locks, because we assume that we have an x-lock on the record */
+
+ err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+ btr_cur, TRUE, thr, mtr);
+ if (err == DB_SUCCESS && check_ref) {
+ /* NOTE that the following call loses the position of pcur ! */
+
+ err = row_upd_check_references_constraints(node,
+ pcur, index->table,
+ index, offsets,
+ thr, mtr);
+ }
+
+ mtr_commit(mtr);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the clustered index record.
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
+in case of a lock wait, else error code */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ dict_index_t* index;
+ btr_pcur_t* pcur;
+ ibool success;
+ ibool check_ref;
+ ulint err;
+ mtr_t* mtr;
+ mtr_t mtr_buf;
+ rec_t* rec;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ rec_offs_init(offsets_);
+
+ index = dict_table_get_first_index(node->table);
+
+ check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
+
+ pcur = node->pcur;
+
+ /* We have to restore the cursor to its position */
+ mtr = &mtr_buf;
+
+ mtr_start(mtr);
+
+ /* If the restoration does not succeed, then the same
+ transaction has deleted the record on which the cursor was,
+ and that is an SQL error. If the restoration succeeds, it may
+ still be that the same transaction has successively deleted
+ and inserted a record with the same ordering fields, but in
+ that case we know that the transaction has at least an
+ implicit x-lock on the record. */
+
+ ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+ if (!success) {
+ err = DB_RECORD_NOT_FOUND;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+
+ /* If this is a row in SYS_INDEXES table of the data dictionary,
+ then we have to free the file segments of the index tree associated
+ with the index */
+
+ if (node->is_delete
+ && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+ dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+ mtr_commit(mtr);
+
+ mtr_start(mtr);
+
+ success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+ mtr);
+ if (!success) {
+ err = DB_ERROR;
+
+ mtr_commit(mtr);
+
+ return(err);
+ }
+ }
+
+ rec = btr_pcur_get_rec(pcur);
+ offsets = rec_get_offsets(rec, index, offsets_,
+ ULINT_UNDEFINED, &heap);
+
+ if (!node->has_clust_rec_x_lock) {
+ err = lock_clust_rec_modify_check_and_lock(
+ 0, btr_pcur_get_block(pcur),
+ rec, index, offsets, thr);
+ if (err != DB_SUCCESS) {
+ mtr_commit(mtr);
+ goto exit_func;
+ }
+ }
+
+ /* NOTE: the following function calls will also commit mtr */
+
+ if (node->is_delete) {
+ err = row_upd_del_mark_clust_rec(node, index, offsets,
+ thr, check_ref, mtr);
+ if (err == DB_SUCCESS) {
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ node->index = dict_table_get_next_index(index);
+ }
+exit_func:
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(err);
+ }
+
+ /* If the update is made for MySQL, we already have the update vector
+ ready, else we have to do some evaluation: */
+
+ if (UNIV_UNLIKELY(!node->in_mysql_interface)) {
+ /* Copy the necessary columns from clust_rec and calculate the
+ new values to set */
+ row_upd_copy_columns(rec, offsets,
+ UT_LIST_GET_FIRST(node->columns));
+ row_upd_eval_new_vals(node->update);
+ }
+
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+
+ if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+ err = row_upd_clust_rec(node, index, thr, mtr);
+ return(err);
+ }
+
+ row_upd_store_row(node);
+
+ if (row_upd_changes_ord_field_binary(node->row, index, node->update)) {
+
+ /* Update causes an ordering field (ordering fields within
+ the B-tree) of the clustered index record to change: perform
+ the update by delete marking and inserting.
+
+ TODO! What to do about the 'Halloween problem', where an update
+ moves the record forward in the index so that it is updated
+ again when the cursor arrives there? Solution: the read
+ operation must check the undo record's undo number when
+ choosing records to update. MySQL currently solves the
+ problem externally! */
+
+ err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
+ mtr);
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_ALL_SEC;
+ } else {
+ err = row_upd_clust_rec(node, index, thr, mtr);
+
+ if (err != DB_SUCCESS) {
+
+ return(err);
+ }
+
+ node->state = UPD_NODE_UPDATE_SOME_SEC;
+ }
+
+ node->index = dict_table_get_next_index(index);
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates the affected index records of a row. When the control is transferred
+to this node, we assume that we have a persistent cursor which was on a
+record, and the position of the cursor is stored in the cursor.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd(
+/*====*/
+ upd_node_t* node, /*!< in: row update node */
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ulint err = DB_SUCCESS;
+
+ ut_ad(node && thr);
+
+ if (UNIV_LIKELY(node->in_mysql_interface)) {
+
+ /* We do not get the cmpl_info value from the MySQL
+ interpreter: we must calculate it on the fly: */
+
+ if (node->is_delete
+ || row_upd_changes_some_index_ord_field_binary(
+ node->table, node->update)) {
+ node->cmpl_info = 0;
+ } else {
+ node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+ }
+ }
+
+ if (node->state == UPD_NODE_UPDATE_CLUSTERED
+ || node->state == UPD_NODE_INSERT_CLUSTERED) {
+
+ log_free_check();
+ err = row_upd_clust_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+ }
+
+ if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+
+ goto function_exit;
+ }
+
+ while (node->index != NULL) {
+
+ log_free_check();
+ err = row_upd_sec_step(node, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto function_exit;
+ }
+
+ node->index = dict_table_get_next_index(node->index);
+ }
+
+function_exit:
+ if (err == DB_SUCCESS) {
+ /* Do some cleanup */
+
+ if (node->row != NULL) {
+ node->row = NULL;
+ node->ext = NULL;
+ node->upd_row = NULL;
+ node->upd_ext = NULL;
+ mem_heap_empty(node->heap);
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+ }
+
+ return(err);
+}
+
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs.
+@return query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ upd_node_t* node;
+ sel_node_t* sel_node;
+ que_node_t* parent;
+ ulint err = DB_SUCCESS;
+ trx_t* trx;
+
+ ut_ad(thr);
+
+ trx = thr_get_trx(thr);
+
+ trx_start_if_not_started(trx);
+
+ node = thr->run_node;
+
+ sel_node = node->select;
+
+ parent = que_node_get_parent(node);
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+ if (thr->prev_node == parent) {
+ node->state = UPD_NODE_SET_IX_LOCK;
+ }
+
+ if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+ if (!node->has_clust_rec_x_lock) {
+ /* It may be that the current session has not yet
+ started its transaction, or it has been committed: */
+
+ err = lock_table(0, node->table, LOCK_IX, thr);
+
+ if (err != DB_SUCCESS) {
+
+ goto error_handling;
+ }
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ if (node->searched_update) {
+ /* Reset the cursor */
+ sel_node->state = SEL_NODE_OPEN;
+
+ /* Fetch a row to update */
+
+ thr->run_node = sel_node;
+
+ return(thr);
+ }
+ }
+
+ /* sel_node is NULL if we are in the MySQL interface */
+
+ if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+ if (!node->searched_update) {
+ /* An explicit cursor should be positioned on a row
+ to update */
+
+ ut_error;
+
+ err = DB_ERROR;
+
+ goto error_handling;
+ }
+
+ ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+ /* No more rows to update, or the select node performed the
+ updates directly in-place */
+
+ thr->run_node = parent;
+
+ return(thr);
+ }
+
+ /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+ err = row_upd(node, thr);
+
+error_handling:
+ trx->error_state = err;
+
+ if (err != DB_SUCCESS) {
+ return(NULL);
+ }
+
+ /* DO THE TRIGGER ACTIONS HERE */
+
+ if (node->searched_update) {
+ /* Fetch next row to update */
+
+ thr->run_node = sel_node;
+ } else {
+ /* It was an explicit cursor update */
+
+ thr->run_node = parent;
+ }
+
+ node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+ return(thr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/row/row0vers.c b/storage/xtradb/row/row0vers.c
new file mode 100644
index 00000000000..a4fbb5289aa
--- /dev/null
+++ b/storage/xtradb/row/row0vers.c
@@ -0,0 +1,741 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0vers.c
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "lock0lock.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function!
+@return NULL if committed, else the active transaction */
+UNIV_INTERN
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+ const rec_t* rec, /*!< in: record in a secondary index */
+ dict_index_t* index, /*!< in: the secondary index */
+ const ulint* offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+ dict_index_t* clust_index;
+ rec_t* clust_rec;
+ ulint* clust_offsets;
+ rec_t* version;
+ trx_id_t trx_id;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ dtuple_t* row;
+ dtuple_t* entry = NULL; /* assignment to eliminate compiler
+ warning */
+ trx_t* trx;
+ ulint rec_del;
+ ulint err;
+ mtr_t mtr;
+ ulint comp;
+
+ ut_ad(mutex_own(&kernel_mutex));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Search for the clustered index record: this is a time-consuming
+ operation: therefore we release the kernel mutex; also, the release
+ is required by the latching order convention. The latch on the
+ clustered index locks the top of the stack of versions. We also
+ reserve purge_latch to lock the bottom of the version stack. */
+
+ clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+ &clust_index, &mtr);
+ if (!clust_rec) {
+ /* In a rare case it is possible that no clust rec is found
+ for a secondary index record: if in row0umod.c
+ row_undo_mod_remove_clust_low() we have already removed the
+ clust rec, while purge is still cleaning and removing
+ secondary index records associated with earlier versions of
+ the clustered index record. In that case there cannot be
+ any implicit lock on the secondary index record, because
+ an active transaction which has modified the secondary index
+ record has also modified the clustered index record. And in
+ a rollback we always undo the modifications to secondary index
+ records before the clustered index record. */
+
+ mutex_enter(&kernel_mutex);
+ mtr_commit(&mtr);
+
+ return(NULL);
+ }
+
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+ trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+
+ mtr_s_lock(&(purge_sys->latch), &mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ trx = NULL;
+ if (!trx_is_active(trx_id)) {
+ /* The transaction that modified or inserted clust_rec is no
+ longer active: no implicit lock on rec */
+ goto exit_func;
+ }
+
+ if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
+ clust_offsets, TRUE)) {
+ /* Corruption noticed: try to avoid a crash by returning */
+ goto exit_func;
+ }
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(index->table == clust_index->table);
+ ut_ad(!!comp == dict_table_is_comp(index->table));
+ ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+ /* We check whether some earlier version of the clustered index record,
+ modified by the trx_id transaction, would require rec to be in a
+ different state (delete marked or unmarked, or to have different field
+ values, or not to exist). If there is such a version, then rec was
+ modified by the trx_id transaction, and it has an implicit x-lock on
+ rec. Note that if clust_rec itself would require rec to be in a
+ different state, then the trx_id transaction has not yet had time to
+ modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+ rec_del = rec_get_deleted_flag(rec, comp);
+ trx = NULL;
+
+ version = clust_rec;
+
+ for (;;) {
+ rec_t* prev_version;
+ ulint vers_del;
+ row_ext_t* ext;
+ trx_id_t prev_trx_id;
+
+ mutex_exit(&kernel_mutex);
+
+ /* While we retrieve an earlier version of clust_rec, we
+ release the kernel mutex, because it may take time to access
+ the disk. After the release, we have to check if the trx_id
+ transaction is still active. We keep the semaphore in mtr on
+ the clust_rec page, so that no other transaction can update
+ it and get an implicit x-lock on rec. */
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (prev_version == NULL) {
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no
+ implicit x-lock */
+
+ break;
+ }
+
+ /* If the transaction is still active,
+ clust_rec must be a fresh insert, because no
+ previous version was found. */
+ ut_ad(err == DB_SUCCESS);
+
+ /* It was a freshly inserted version: there is an
+ implicit x-lock on rec */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ vers_del = rec_get_deleted_flag(prev_version, comp);
+ prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+ clust_offsets);
+
+ /* If trx_id and prev_trx_id differ and prev_version is
+ delete-marked, then the prev_trx_id transaction must have
+ already committed for the trx_id transaction to be able to
+ modify the row. Therefore, prev_trx_id cannot hold any
+ implicit lock. */
+ if (vers_del && 0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+
+ mutex_enter(&kernel_mutex);
+ break;
+ }
+
+ /* The stack of versions is locked by mtr. Thus, it
+ is safe to fetch the prefixes for externally stored
+ columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+ clust_offsets, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+ /* entry may be NULL if a record was inserted in place
+ of a deleted record, and the BLOB pointers of the new
+ record were not initialized yet. But in that case,
+ prev_version should be NULL. */
+ ut_a(entry);
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx_is_active(trx_id)) {
+ /* Transaction no longer active: no implicit x-lock */
+
+ break;
+ }
+
+ /* If we get here, we know that the trx_id transaction is
+ still active and it has modified prev_version. Let us check
+ if prev_version would require rec to be in a different
+ state. */
+
+ /* The previous version of clust_rec must be
+ accessible, because the transaction is still active
+ and clust_rec was not a fresh insert. */
+ ut_ad(err == DB_SUCCESS);
+
+ /* We check whether entry and rec compare equal in the
+ alphabetical ordering */
+ if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+ /* The delete marks of rec and prev_version should be
+ equal for rec to be in the state required by
+ prev_version */
+
+ if (rec_del != vers_del) {
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ /* It is possible that the row was updated so that the
+ secondary index record remained the same in the
+ alphabetical ordering, but the field values still
+ changed. For example, 'abc' -> 'ABC'. Check that as well. */
+
+ dtuple_set_types_binary(entry,
+ dtuple_get_n_fields(entry));
+ if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+ } else if (!rec_del) {
+ /* The delete mark should be set in rec for it to be
+ in the state required by prev_version */
+
+ trx = trx_get_on_id(trx_id);
+
+ break;
+ }
+
+ if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+ /* The chain of versions modified by the trx_id transaction
+ ends at prev_version: no implicit x-lock */
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+exit_func:
+ mtr_commit(&mtr);
+ mem_heap_free(heap);
+
+ return(trx);
+}
+
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+ trx_id_t trx_id, /*!< in: transaction id in the version */
+ mtr_t* mtr) /*!< in: mtr holding the latch on the
+ clustered index record; it will also
+ hold the latch on purge_view */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ if (trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* A purge operation is not yet allowed to remove this
+ delete marked record */
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
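[Editorial note, not part of the patch] A sketch of the purge-side decision this predicate supports: a delete-marked version may be physically removed only when no update undo log of its transaction must still exist for older read views. The wrapper name is hypothetical; the latch requirements are those stated in the comments above.

#include "row0vers.h"

/* Hypothetical helper (illustration only). */
static ibool
purge_may_remove_del_marked(
	trx_id_t	trx_id,	/* in: transaction id in the version */
	mtr_t*		mtr)	/* in: mtr holding the latch on the
				clustered index record */
{
	return(!row_vers_must_preserve_del_marked(trx_id, mtr));
}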
+
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any non-delete-marked version of the record where the trx
+id >= purge view, and the secondary index entry and ientry compare equal
+in the alphabetical ordering; exactly in this case we return TRUE.
+@return TRUE if the earlier version should have the entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+ ibool also_curr,/*!< in: TRUE if also rec is included in the
+ versions to search; otherwise only versions
+ prior to it are searched */
+ const rec_t* rec, /*!< in: record in the clustered index; the
+ caller must have a latch on the page */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec; it will
+ also hold the latch on purge_view */
+ dict_index_t* index, /*!< in: the secondary index */
+ const dtuple_t* ientry) /*!< in: the secondary index entry */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ dict_index_t* clust_index;
+ ulint* clust_offsets;
+ mem_heap_t* heap;
+ mem_heap_t* heap2;
+ const dtuple_t* row;
+ const dtuple_t* entry;
+ ulint err;
+ ulint comp;
+
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ mtr_s_lock(&(purge_sys->latch), mtr);
+
+ clust_index = dict_table_get_first_index(index->table);
+
+ comp = page_rec_is_comp(rec);
+ ut_ad(!dict_table_is_comp(index->table) == !comp);
+ heap = mem_heap_create(1024);
+ clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+ ULINT_UNDEFINED, &heap);
+
+ if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ rec, clust_offsets, NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset BLOB
+ pointers. This must be a freshly inserted record. If
+ this is called from
+ row_purge_remove_sec_if_poss_low(), the thread will
+ hold latches on the clustered index and the secondary
+ index. Because the insert works in three steps:
+
+ (1) insert the record to clustered index
+ (2) store the BLOBs and update BLOB pointers
+ (3) insert records to secondary indexes
+
+ the purge thread can safely ignore freshly inserted
+ records and delete the secondary index record. The
+ thread that inserted the new record will be inserting
+ the secondary index records. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields, because the row may be in the middle of being
+ modified so that the clustered index record has already
+ been updated to a different binary value in a char
+ field, while the collation still considers the old and
+ new values equal! */
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = rec;
+
+ for (;;) {
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+ err = trx_undo_prev_version_build(rec, mtr, version,
+ clust_index, clust_offsets,
+ heap, &prev_version);
+ mem_heap_free(heap2); /* free version and clust_offsets */
+
+ if (err != DB_SUCCESS || !prev_version) {
+ /* Versions end here */
+
+ mem_heap_free(heap);
+
+ return(FALSE);
+ }
+
+ clust_offsets = rec_get_offsets(prev_version, clust_index,
+ NULL, ULINT_UNDEFINED, &heap);
+
+ if (!rec_get_deleted_flag(prev_version, comp)) {
+ row_ext_t* ext;
+
+ /* The stack of versions is locked by mtr.
+ Thus, it is safe to fetch the prefixes for
+ externally stored columns. */
+ row = row_build(ROW_COPY_POINTERS, clust_index,
+ prev_version, clust_offsets,
+ NULL, &ext, heap);
+ entry = row_build_index_entry(row, ext, index, heap);
+
+ /* If entry == NULL, the record contains unset
+ BLOB pointers. This must be a freshly
+ inserted record that we can safely ignore.
+ For the justification, see the comments after
+ the previous row_build_index_entry() call. */
+
+ /* NOTE that we cannot do the comparison as binary
+ fields because the secondary index record may have
+ already been updated to a different binary value in
+ a char field, but the collation identifies the old
+ and new value anyway! */
+
+ if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+ mem_heap_free(heap);
+
+ return(TRUE);
+ }
+ }
+
+ version = prev_version;
+ }
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ read_view_t* view, /*!< in: the consistent read view */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ rec_t** old_vers)/*!< out, own: old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ rec_t* prev_version;
+ trx_id_t trx_id;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+ ut_ad(!read_view_sees_trx_id(view, trx_id));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ version = rec;
+
+ for (;;) {
+ mem_heap_t* heap2 = heap;
+ trx_undo_rec_t* undo_rec;
+ roll_ptr_t roll_ptr;
+ undo_no_t undo_no;
+ heap = mem_heap_create(1024);
+
+ /* If we have a high-granularity consistent read view and the
+ creating transaction of the view is the same as trx_id in
+ the record, we see this record only in the case when the
+ undo_no of the record is < the undo_no in the view. */
+
+ if (view->type == VIEW_HIGH_GRANULARITY
+ && ut_dulint_cmp(view->creator_trx_id, trx_id) == 0) {
+
+ roll_ptr = row_get_rec_roll_ptr(version, index,
+ *offsets);
+ undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+ mem_heap_empty(heap);
+
+ if (ut_dulint_cmp(view->undo_no, undo_no) > 0) {
+ /* The view already sees this version: we can
+ copy it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap,
+ rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index,
+ *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+ }
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (err != DB_SUCCESS) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ *offsets = rec_get_offsets(prev_version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+
+ trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+ if (read_view_sees_trx_id(view, trx_id)) {
+
+ /* The view already sees this version: we can copy
+ it to in_heap and return */
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, prev_version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ }/* for (;;) */
+
+ mem_heap_free(heap);
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}
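row_vers_build_for_consistent_read() keeps stepping to older versions until read_view_sees_trx_id() accepts one. As a mental model only (a simplified standalone sketch; the struct and function names below are invented for illustration and are not the read0read.c implementation), a read view can be pictured as a snapshot of the transactions that were still active when the view was created:

	#include <stdbool.h>
	#include <stddef.h>

	typedef unsigned long long trx_id_t;

	/* Simplified model of a consistent read view. */
	typedef struct {
		trx_id_t	up_limit_id;	/* every trx with id < this was no
						longer active when the view was made */
		trx_id_t	low_limit_id;	/* every trx with id >= this started
						after the view was made */
		const trx_id_t*	active_ids;	/* ids active at view creation */
		size_t		n_active;
	} my_read_view_t;

	/* Returns true if changes made by trx_id are visible to this view. */
	static bool
	my_view_sees(const my_read_view_t* view, trx_id_t trx_id)
	{
		size_t	i;

		if (trx_id < view->up_limit_id) {
			return(true);		/* committed before the view */
		}
		if (trx_id >= view->low_limit_id) {
			return(false);		/* started after the view */
		}
		for (i = 0; i < view->n_active; i++) {
			if (view->active_ids[i] == trx_id) {
				return(false);	/* still active when view was made */
			}
		}
		return(true);			/* committed, just not below up_limit */
	}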
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
+@return DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+ const rec_t* rec, /*!< in: record in a clustered index; the
+ caller must have a latch on the page; this
+ latch locks the top of the stack of versions
+ of this record */
+ mtr_t* mtr, /*!< in: mtr holding the latch on rec */
+ dict_index_t* index, /*!< in: the clustered index */
+ ulint** offsets,/*!< in/out: offsets returned by
+ rec_get_offsets(rec, index) */
+ mem_heap_t** offset_heap,/*!< in/out: memory heap from which
+ the offsets are allocated */
+ mem_heap_t* in_heap,/*!< in: memory heap from which the memory for
+ *old_vers is allocated; memory for possible
+ intermediate versions is allocated and freed
+ locally within the function */
+ const rec_t** old_vers)/*!< out: rec, old version, or NULL if the
+ record does not exist in the view, that is,
+ it was freshly inserted afterwards */
+{
+ const rec_t* version;
+ mem_heap_t* heap = NULL;
+ byte* buf;
+ ulint err;
+ trx_id_t rec_trx_id = ut_dulint_zero;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+ || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(rec_offs_validate(rec, index, *offsets));
+
+ rw_lock_s_lock(&(purge_sys->latch));
+ /* The S-latch on purge_sys prevents the purge view from
+ changing. Thus, if we have an uncommitted transaction at
+ this point, then purge cannot remove its undo log even if
+ the transaction could commit now. */
+
+ version = rec;
+
+ for (;;) {
+ trx_t* version_trx;
+ mem_heap_t* heap2;
+ rec_t* prev_version;
+ trx_id_t version_trx_id;
+
+ version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+ if (rec == version) {
+ rec_trx_id = version_trx_id;
+ }
+
+ mutex_enter(&kernel_mutex);
+ version_trx = trx_get_on_id(version_trx_id);
+ mutex_exit(&kernel_mutex);
+
+ if (!version_trx
+ || version_trx->conc_state == TRX_NOT_STARTED
+ || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+
+ /* We found a version that belongs to a
+ committed transaction: return it. */
+
+ if (rec == version) {
+ *old_vers = rec;
+ err = DB_SUCCESS;
+ break;
+ }
+
+ /* We assume that a rolled-back transaction stays in
+ TRX_ACTIVE state until all the changes have been
+ rolled back and the transaction is removed from
+ the global list of transactions. */
+
+ if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) {
+ /* The transaction was committed while
+ we searched for earlier versions.
+ Return the current version as a
+ semi-consistent read. */
+
+ version = rec;
+ *offsets = rec_get_offsets(version,
+ index, *offsets,
+ ULINT_UNDEFINED,
+ offset_heap);
+ }
+
+ buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+ *old_vers = rec_copy(buf, version, *offsets);
+ rec_offs_make_valid(*old_vers, index, *offsets);
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ heap2 = heap;
+ heap = mem_heap_create(1024);
+
+ err = trx_undo_prev_version_build(rec, mtr, version, index,
+ *offsets, heap,
+ &prev_version);
+ if (heap2) {
+ mem_heap_free(heap2); /* free version */
+ }
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ break;
+ }
+
+ if (prev_version == NULL) {
+ /* It was a freshly inserted version */
+ *old_vers = NULL;
+ err = DB_SUCCESS;
+
+ break;
+ }
+
+ version = prev_version;
+ *offsets = rec_get_offsets(version, index, *offsets,
+ ULINT_UNDEFINED, offset_heap);
+ }/* for (;;) */
+
+ if (heap) {
+ mem_heap_free(heap);
+ }
+ rw_lock_s_unlock(&(purge_sys->latch));
+
+ return(err);
+}
diff --git a/storage/xtradb/scripts/install_innodb_plugins.sql b/storage/xtradb/scripts/install_innodb_plugins.sql
new file mode 100644
index 00000000000..5a555a652f7
--- /dev/null
+++ b/storage/xtradb/scripts/install_innodb_plugins.sql
@@ -0,0 +1,17 @@
+-- execute these to install InnoDB if it is built as a dynamic plugin
+INSTALL PLUGIN innodb SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_trx SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_locks SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_lock_waits SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmp SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmp_reset SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmpmem SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmpmem_reset SONAME 'ha_innodb.so';
+INSTALL PLUGIN XTRADB_ENHANCEMENTS SONAME 'ha_innodb.so';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES SONAME 'ha_innodb.so';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_BLOB SONAME 'ha_innodb.so';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_INDEX SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_rseg SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_table_stats SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_index_stats SONAME 'ha_innodb.so';
+INSTALL PLUGIN xtradb_admin_command SONAME 'ha_innodb.so';
diff --git a/storage/xtradb/scripts/install_innodb_plugins_win.sql b/storage/xtradb/scripts/install_innodb_plugins_win.sql
new file mode 100644
index 00000000000..7cda3335694
--- /dev/null
+++ b/storage/xtradb/scripts/install_innodb_plugins_win.sql
@@ -0,0 +1,17 @@
+-- execute these to install InnoDB if it is built as a dynamic plugin
+INSTALL PLUGIN innodb SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_trx SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_locks SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_lock_waits SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmp SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmp_reset SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmpmem SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmpmem_reset SONAME 'ha_innodb.dll';
+INSTALL PLUGIN XTRADB_ENHANCEMENTS SONAME 'ha_innodb.dll';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES SONAME 'ha_innodb.dll';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_BLOB SONAME 'ha_innodb.dll';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_INDEX SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_rseg SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_table_stats SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_index_stats SONAME 'ha_innodb.dll';
+INSTALL PLUGIN xtradb_admin_command SONAME 'ha_innodb.dll';
diff --git a/storage/xtradb/srv/srv0que.c b/storage/xtradb/srv/srv0que.c
new file mode 100644
index 00000000000..fc50a86a55c
--- /dev/null
+++ b/storage/xtradb/srv/srv0que.c
@@ -0,0 +1,49 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0que.c
+Server query execution
+
+Created 6/5/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0que.h"
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "os0thread.h"
+#include "usr0sess.h"
+#include "que0que.h"
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ ut_ad(thr);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);
+
+ srv_release_threads(SRV_WORKER, 1);
+}
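A minimal caller sketch for the function above, under the precondition its assertion states (kernel_mutex held); the real enqueuing sites are elsewhere in the server code and are not part of this hunk:

	que_thr_t*	thr;	/* query thread to hand to a worker,
				obtained elsewhere */

	mutex_enter(&kernel_mutex);
	srv_que_task_enqueue_low(thr);	/* append to srv_sys->tasks and wake
					one suspended SRV_WORKER thread */
	mutex_exit(&kernel_mutex);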
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
new file mode 100644
index 00000000000..43799aab196
--- /dev/null
+++ b/storage/xtradb/srv/srv0srv.c
@@ -0,0 +1,3440 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0srv.c
+The database server main program
+
+NOTE: SQL Server 7 uses something which the documentation
+calls user mode scheduled threads (UMS threads). One such
+thread is usually allocated per processor. Win32
+documentation does not know any UMS threads, which suggests
+that the concept is internal to SQL Server 7. It may mean that
+SQL Server 7 does all the scheduling of threads itself, even
+in i/o waits. We should maybe modify InnoDB to use the same
+technique, because thread switches within NT may be too slow.
+
+SQL Server 7 also mentions fibers, which are cooperatively
+scheduled threads. They can boost performance by 5 %,
+according to the Delaney and Soukup's book.
+
+Windows 2000 will have something called thread pooling
+(see msdn website), which we could possibly use.
+
+Another possibility could be to use some very fast user space
+thread library. This might confuse NT though.
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "ut0ut.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "thr0loc.h"
+#include "que0que.h"
+#include "srv0que.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "srv0start.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h"
+#include "trx0i_s.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+
+/* prototypes for new functions added to ha_innodb.cc */
+ibool innobase_get_slow_log(void);
+
+/* This is set to TRUE if the MySQL user has set it in MySQL; currently
+affects only FOREIGN KEY definition parsing */
+UNIV_INTERN ibool srv_lower_case_table_names = FALSE;
+
+/* The following counter is incremented whenever there is some user activity
+in the server */
+UNIV_INTERN ulint srv_activity_count = 0;
+
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600;
+
+/* How long data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lag of the purge thread. */
+UNIV_INTERN ulint srv_dml_needed_delay = 0;
+
+UNIV_INTERN ibool srv_lock_timeout_active = FALSE;
+UNIV_INTERN ibool srv_monitor_active = FALSE;
+UNIV_INTERN ibool srv_error_monitor_active = FALSE;
+
+UNIV_INTERN const char* srv_main_thread_op_info = "";
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+UNIV_INTERN const char srv_mysql50_table_name_prefix[9] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+UNIV_INTERN char* srv_data_home = NULL;
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN char* srv_arch_dir = NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** store each table created by a user in its own file; data
+dictionary tables are in the system tablespace 0 */
+UNIV_INTERN my_bool srv_file_per_table;
+/** The file format to use on new *.ibd files. */
+UNIV_INTERN ulint srv_file_format = 0;
+/** Whether to check file format during startup. A value of
+DICT_TF_FORMAT_MAX + 1 means no checking, i.e. FALSE. The default is to
+set it to the highest format we support. */
+UNIV_INTERN ulint srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
+
+#if DICT_TF_FORMAT_51
+# error "DICT_TF_FORMAT_51 must be 0!"
+#endif
+/** Place locks on records only, i.e. do not use next-key locking except
+for duplicate key checking and foreign key checking */
+UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE;
+
+UNIV_INTERN ulint srv_n_data_files = 0;
+UNIV_INTERN char** srv_data_file_names = NULL;
+/* size in database pages */
+UNIV_INTERN ulint* srv_data_file_sizes = NULL;
+
+UNIV_INTERN char* srv_doublewrite_file = NULL;
+
+UNIV_INTERN ibool srv_extra_undoslots = FALSE;
+
+UNIV_INTERN ibool srv_recovery_stats = FALSE;
+
+UNIV_INTERN ulint srv_use_purge_thread = 0;
+
+/* if TRUE, then we auto-extend the last data file */
+UNIV_INTERN ibool srv_auto_extend_last_data_file = FALSE;
+/* if != 0, this tells the max size auto-extending may increase the
+last data file size */
+UNIV_INTERN ulint srv_last_file_size_max = 0;
+/* If the last data file is auto-extended, we add this
+many pages to it at a time */
+UNIV_INTERN ulong srv_auto_extend_increment = 8;
+UNIV_INTERN ulint* srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc. This protects
+the user from forgetting the 'newraw' keyword to my.cnf */
+
+UNIV_INTERN ibool srv_created_new_raw = FALSE;
+
+UNIV_INTERN char** srv_log_group_home_dirs = NULL;
+
+UNIV_INTERN ulint srv_n_log_groups = ULINT_MAX;
+UNIV_INTERN ulint srv_n_log_files = ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
+UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
+
+/* Try to flush dirty pages so as to avoid IO bursts at
+the checkpoints. */
+UNIV_INTERN char srv_adaptive_flushing = TRUE;
+
+UNIV_INTERN ulong srv_show_locks_held = 10;
+UNIV_INTERN ulong srv_show_verbose_locks = 0;
+
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT 20
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT)
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+UNIV_INTERN const byte* srv_latin1_ordering;
+
+/* use os/external memory allocator */
+UNIV_INTERN my_bool srv_use_sys_malloc = TRUE;
+/* requested size in kilobytes */
+UNIV_INTERN ulint srv_buf_pool_size = ULINT_MAX;
+/* previously requested size */
+UNIV_INTERN ulint srv_buf_pool_old_size;
+/* current size in kilobytes */
+UNIV_INTERN ulint srv_buf_pool_curr_size = 0;
+/* size in bytes */
+UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX;
+UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX;
+
+/* key value for shm */
+UNIV_INTERN uint srv_buffer_pool_shm_key = 0;
+UNIV_INTERN ibool srv_buffer_pool_shm_is_reused = FALSE;
+UNIV_INTERN ibool srv_buffer_pool_shm_checksum = TRUE;
+
+/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
+instead. */
+UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX;
+UNIV_INTERN ulint srv_n_read_io_threads = ULINT_MAX;
+UNIV_INTERN ulint srv_n_write_io_threads = ULINT_MAX;
+
+/* The universal page size of the database */
+UNIV_INTERN ulint srv_page_size_shift = 0;
+UNIV_INTERN ulint srv_page_size = 0;
+
+/* User settable value of the number of pages that must be present
+in the buffer cache and accessed sequentially for InnoDB to trigger a
+readahead request. */
+UNIV_INTERN ulong srv_read_ahead_threshold = 56;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN ibool srv_log_archive_on = FALSE;
+UNIV_INTERN ibool srv_archive_recovery = 0;
+UNIV_INTERN ib_uint64_t srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* This parameter is used to throttle the number of insert buffers that are
+merged in a batch. By increasing this parameter on a faster disk you can
+possibly reduce the number of I/O operations performed to complete the
+merge operation. The value of this parameter is used as is by the
+background loop when the system is idle (low load); on a busy system
+the parameter is scaled down by a factor of 4. This is to avoid putting
+a heavier load on the I/O subsystem. */
+
+UNIV_INTERN ulong srv_insert_buffer_batch_size = 20;
+
+UNIV_INTERN char* srv_file_flush_method_str = NULL;
+UNIV_INTERN ulint srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+UNIV_INTERN ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+UNIV_INTERN ulint srv_max_n_open_files = 300;
+
+/* Number of IO operations per second the server can do */
+UNIV_INTERN ulong srv_io_capacity = 200;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75;
+
+/* variable counts amount of data read in total (in bytes) */
+UNIV_INTERN ulint srv_data_read = 0;
+
+/* here we count the amount of data written in total (in bytes) */
+UNIV_INTERN ulint srv_data_written = 0;
+
+/* the number of the log write requests done */
+UNIV_INTERN ulint srv_log_write_requests = 0;
+
+/* the number of physical writes to the log performed */
+UNIV_INTERN ulint srv_log_writes = 0;
+
+/* amount of data written to the log files in bytes */
+UNIV_INTERN ulint srv_os_log_written = 0;
+
+/* amount of writes being done to the log files */
+UNIV_INTERN ulint srv_os_log_pending_writes = 0;
+
+/* we increase this counter when we don't have enough space in the
+log buffer and have to flush it */
+UNIV_INTERN ulint srv_log_waits = 0;
+
+/* this variable counts the number of times the doublewrite buffer
+was flushed */
+UNIV_INTERN ulint srv_dblwr_writes = 0;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+UNIV_INTERN ulint srv_dblwr_pages_written = 0;
+
+/* in this variable we store the number of write requests issued */
+UNIV_INTERN ulint srv_buf_pool_write_requests = 0;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+UNIV_INTERN ulint srv_buf_pool_wait_free = 0;
+
+/* variable to count the number of pages that were written from buffer
+pool to the disk */
+UNIV_INTERN ulint srv_buf_pool_flushed = 0;
+
+/** Number of buffer pool reads that led to the
+reading of a disk page */
+UNIV_INTERN ulint srv_buf_pool_reads = 0;
+
+/** Time in seconds between automatic buffer pool dumps */
+UNIV_INTERN uint srv_auto_lru_dump = 0;
+
+/* structure to pass status variables to MySQL */
+UNIV_INTERN export_struc export_vars;
+
+/* If the following is != 0 we do not allow inserts etc. This protects
+the user from forgetting the innodb_force_recovery keyword to my.cnf */
+
+UNIV_INTERN ulint srv_force_recovery = 0;
+/*-----------------------*/
+/* We are prepared for a situation in which we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+UNIV_INTERN ulint srv_max_n_threads = 0;
+
+/* The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. MySQL creates a thread for each user session, and
+semaphore contention and convoy problems can occur without this restriction.
+Value 10 should be good if there are fewer than 4 processors + 4 disks in the
+computer. Bigger computers need bigger values. Value 0 will disable the
+concurrency check. */
+
+UNIV_INTERN ibool srv_thread_concurrency_timer_based = FALSE;
+UNIV_INTERN ulong srv_thread_concurrency = 0;
+
+/* this mutex protects srv_conc data structures */
+UNIV_INTERN os_fast_mutex_t srv_conc_mutex;
+/* number of transactions that have declared_to_be_inside_innodb set.
+It used to be a non-error for this value to drop below zero temporarily.
+This is no longer true. We'll, however, keep the lint datatype to add
+assertions to catch any corner cases that we may have missed. */
+UNIV_INTERN lint srv_conc_n_threads = 0;
+/* number of OS threads waiting in the FIFO for a permission to enter
+InnoDB */
+UNIV_INTERN ulint srv_conc_n_waiting_threads = 0;
+
+typedef struct srv_conc_slot_struct srv_conc_slot_t;
+struct srv_conc_slot_struct{
+ os_event_t event; /*!< event to wait */
+ ibool reserved; /*!< TRUE if slot
+ reserved */
+ ibool wait_ended; /*!< TRUE when another
+ thread has already set
+ the event and the
+ thread in this slot is
+ free to proceed; but
+ reserved may still be
+ TRUE at that point */
+ UT_LIST_NODE_T(srv_conc_slot_t) srv_conc_queue; /*!< queue node */
+};
+
+/* queue of threads waiting to get in */
+UNIV_INTERN UT_LIST_BASE_NODE_T(srv_conc_slot_t) srv_conc_queue;
+/* array of wait slots */
+UNIV_INTERN srv_conc_slot_t* srv_conc_slots;
+
+/* Number of times a thread is allowed to enter InnoDB within the same
+SQL query after it has once got the ticket at srv_conc_enter_innodb */
+#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter
+#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay
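The two macros above are aliases for the tunables defined further down in this file (srv_n_free_tickets_to_enter, default 500, and srv_thread_sleep_delay, default 10000 microseconds). How a ticket is spent can be read directly from srv_conc_enter_innodb() below; in condensed form:

	if (trx->n_tickets_to_enter_innodb > 0) {
		/* this trx already passed the concurrency check earlier in
		the same SQL statement: spend one free ticket and skip the
		FIFO queue and srv_conc_mutex entirely */
		trx->n_tickets_to_enter_innodb--;
		return;
	}
	/* otherwise fall through to the queue / sleep logic */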
+/*-----------------------*/
+/* If the following is set to 1 then we do not run purge and insert buffer
+merge to completion before shutdown. If it is set to 2, do not even flush the
+buffer pool to data files at the shutdown: we effectively 'crash'
+InnoDB (but lose no committed transactions). */
+UNIV_INTERN ulint srv_fast_shutdown = 0;
+
+/* Generate an innodb_status.<pid> file */
+UNIV_INTERN ibool srv_innodb_status = FALSE;
+
+/* When estimating the number of different key values in an index, sample
+this many index pages */
+UNIV_INTERN unsigned long long srv_stats_sample_pages = 8;
+UNIV_INTERN ulong srv_stats_method = 0;
+UNIV_INTERN ulong srv_stats_auto_update = 1;
+UNIV_INTERN ulint srv_stats_update_need_lock = 1;
+UNIV_INTERN ibool srv_use_sys_stats_table = FALSE;
+
+UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE;
+UNIV_INTERN ibool srv_use_checksums = TRUE;
+UNIV_INTERN ibool srv_fast_checksum = FALSE;
+
+UNIV_INTERN ibool srv_set_thread_priorities = TRUE;
+UNIV_INTERN int srv_query_thread_priority = 0;
+
+UNIV_INTERN ulong srv_replication_delay = 0;
+
+UNIV_INTERN long long srv_ibuf_max_size = 0;
+UNIV_INTERN ulong srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulong srv_ibuf_accel_rate = 100;
+#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) pct / 10000.0)))
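A quick worked example of the macro above, using the defaults defined in this file (srv_io_capacity = 200, srv_ibuf_accel_rate = 100):

	/* PCT_IBUF_IO(5) = (ulint) (200 * 100 * (5 / 10000.0)) = 10,
	i.e. a "5%" request translates into 5% of the configured I/O
	capacity, scaled up or down by srv_ibuf_accel_rate. */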
+
+UNIV_INTERN ulint srv_checkpoint_age_target = 0;
+UNIV_INTERN ulong srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
+
+UNIV_INTERN ulong srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulong srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
+UNIV_INTERN ulong srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
+
+UNIV_INTERN ulong srv_expand_import = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulint srv_pass_corrupt_table = 0; /* 0:disable 1:enable */
+
+UNIV_INTERN ulong srv_extra_rsegments = 0; /* extra rseg for users */
+UNIV_INTERN ulong srv_dict_size_limit = 0;
+/*-------------------------------------------*/
+UNIV_INTERN ulong srv_n_spin_wait_rounds = 30;
+UNIV_INTERN ulong srv_n_free_tickets_to_enter = 500;
+UNIV_INTERN ulong srv_thread_sleep_delay = 10000;
+UNIV_INTERN ulong srv_spin_wait_delay = 6;
+UNIV_INTERN ibool srv_priority_boost = TRUE;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool srv_print_thread_releases = FALSE;
+UNIV_INTERN ibool srv_print_lock_waits = FALSE;
+UNIV_INTERN ibool srv_print_buf_io = FALSE;
+UNIV_INTERN ibool srv_print_log_io = FALSE;
+UNIV_INTERN ibool srv_print_latch_waits = FALSE;
+#endif /* UNIV_DEBUG */
+
+UNIV_INTERN ulint srv_n_rows_inserted = 0;
+UNIV_INTERN ulint srv_n_rows_updated = 0;
+UNIV_INTERN ulint srv_n_rows_deleted = 0;
+UNIV_INTERN ulint srv_n_rows_read = 0;
+
+static ulint srv_n_rows_inserted_old = 0;
+static ulint srv_n_rows_updated_old = 0;
+static ulint srv_n_rows_deleted_old = 0;
+static ulint srv_n_rows_read_old = 0;
+UNIV_INTERN ulint srv_n_lock_deadlock_count = 0;
+UNIV_INTERN ulint srv_n_lock_wait_count = 0;
+UNIV_INTERN ulint srv_n_lock_wait_current_count = 0;
+UNIV_INTERN ib_int64_t srv_n_lock_wait_time = 0;
+UNIV_INTERN ulint srv_n_lock_max_wait_time = 0;
+
+
+/*
+ Set the following to 0 if you want InnoDB to write messages on
+ stderr on startup/shutdown
+*/
+UNIV_INTERN ibool srv_print_verbose_log = TRUE;
+UNIV_INTERN ibool srv_print_innodb_monitor = FALSE;
+UNIV_INTERN ibool srv_print_innodb_lock_monitor = FALSE;
+UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE;
+UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE;
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+
+UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
+UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+UNIV_INTERN time_t srv_last_monitor_time;
+
+UNIV_INTERN mutex_t srv_innodb_monitor_mutex;
+
+/* Mutex for locking srv_monitor_file */
+UNIV_INTERN mutex_t srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+UNIV_INTERN FILE* srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+UNIV_INTERN mutex_t srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+UNIV_INTERN FILE* srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+UNIV_INTERN mutex_t srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+UNIV_INTERN FILE* srv_misc_tmpfile;
+
+UNIV_INTERN ulint srv_main_thread_process_no = 0;
+UNIV_INTERN ulint srv_main_thread_id = 0;
+
+/* The following count work done by srv_master_thread. */
+
+/* Iterations by the 'once per second' loop. */
+static ulint srv_main_1_second_loops = 0;
+/* Calls to sleep by the 'once per second' loop. */
+static ulint srv_main_sleeps = 0;
+/* Iterations by the 'once per 10 seconds' loop. */
+static ulint srv_main_10_second_loops = 0;
+/* Iterations of the loop bounded by the 'background_loop' label. */
+static ulint srv_main_background_loops = 0;
+/* Iterations of the loop bounded by the 'flush_loop' label. */
+static ulint srv_main_flush_loops = 0;
+/* Log writes involving flush. */
+static ulint srv_log_writes_and_flush = 0;
+
+/* This is only ever touched by the master thread. It records the
+time when the last flush of the log files happened. The master
+thread ensures that we flush the log files at least once per
+second. */
+static time_t srv_last_log_flush_time;
+
+/* The master thread performs various tasks based on the current
+state of IO activity and the level of IO utilization in past
+intervals. The following macros define thresholds for these conditions. */
+#define SRV_PEND_IO_THRESHOLD (PCT_IO(3))
+#define SRV_RECENT_IO_ACTIVITY (PCT_IO(5))
+#define SRV_PAST_IO_ACTIVITY (PCT_IO(200))
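For orientation, a worked example of the three thresholds above. PCT_IO() itself is defined outside this hunk, so its exact form here is an assumption; it is taken to scale srv_io_capacity by the given percentage:

	/* Assuming PCT_IO(p) ~= (ulint) (srv_io_capacity * p / 100.0)
	and the default srv_io_capacity = 200:
		SRV_PEND_IO_THRESHOLD	->   6 pending i/os
		SRV_RECENT_IO_ACTIVITY	->  10 i/os
		SRV_PAST_IO_ACTIVITY	-> 400 i/os */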
+
+/*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+ =========================================
+
+There is the following analogy between this database
+server and an operating system kernel:
+
+DB concept equivalent OS concept
+---------- ---------------------
+transaction -- process;
+
+query thread -- thread;
+
+lock -- semaphore;
+
+transaction set to
+the rollback state -- kill signal delivered to a process;
+
+kernel -- kernel;
+
+query thread execution:
+(a) without kernel mutex
+reserved -- process executing in user mode;
+(b) with kernel mutex reserved
+ -- process executing in kernel mode;
+
+The server is controlled by a master thread which runs at
+a priority higher than normal, that is, higher than user threads.
+It sleeps most of the time, and wakes up, say, every 300 milliseconds,
+to check whether there is anything happening in the server which
+requires intervention of the master thread. Such situations may be,
+for example, when flushing of dirty blocks is needed in the buffer
+pool or old versions of database rows have to be cleaned away.
+
+The threads which we call user threads serve the queries of
+the clients and input from the console of the server.
+They run at normal priority. The server may have several
+communications endpoints. A dedicated set of user threads waits
+at each of these endpoints ready to receive a client request.
+Each request is taken by a single user thread, which then starts
+processing and, when the result is ready, sends it to the client
+and returns to wait at the same endpoint the thread started from.
+
+So, we do not have dedicated communication threads listening at
+the endpoints and dealing out the jobs to dedicated worker threads.
+Our architecture saves one thread switch per request, compared
+to the solution with dedicated communication threads,
+which amounts to 15 microseconds on a 100 MHz Pentium
+running NT. If the client
+is communicating over a network, this saving is negligible, but
+if the client resides in the same machine, maybe in an SMP machine
+on a different processor from the server thread, the saving
+can be important as the threads can communicate over shared
+memory with an overhead of a few microseconds.
+
+We may later implement a dedicated communication thread solution
+for those endpoints which communicate over a network.
+
+Our solution with user threads has two problems: for each endpoint
+there has to be a number of listening threads. If there are many
+communication endpoints, it may be difficult to set the right number
+of concurrent threads in the system, as many of the threads
+may always be waiting at less busy endpoints. Another problem
+is queuing of the messages, as the server internally does not
+offer any queue for jobs.
+
+Another group of user threads is intended for splitting the
+queries and processing them in parallel. Let us call these
+parallel communication threads. These threads are waiting for
+parallelized tasks, suspended on event semaphores.
+
+A single user thread waits for input from the console,
+like a command to shut down the database.
+
+Utility threads are a different group of threads which take
+care of the buffer pool flushing and other, mainly background,
+operations in the server.
+Some of these utility threads always run at a lower than normal
+priority, so that they are always in the background. Some of them
+may dynamically boost their priority by the pri_adjust function,
+even to higher than normal priority, if their task becomes urgent.
+The running of utilities is controlled by high- and low-water marks
+of urgency. The urgency may be measured by the number of dirty blocks
+in the buffer pool, in the case of the flush thread, for example.
+When the high-water mark is exceeded, a utility starts running until
+the urgency drops under the low-water mark. Then the utility thread
+suspends itself to wait for an event. The master thread is
+responsible for signaling this event when the utility thread is
+again needed.
+
+For each individual type of utility, some threads always remain
+at lower than normal priority. This is because pri_adjust is implemented
+so that the threads at normal or higher priority control their
+share of running time by calling sleep. Thus, if the load of the
+system suddenly drops, these threads cannot necessarily utilize
+the system fully. The background priority threads make up for this,
+starting to run when the load drops.
+
+When there is no activity in the system, the master thread also
+suspends itself to wait for an event, making
+the server totally silent. The responsibility for signaling this
+event lies with the user thread which next receives a message
+from a client.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to
+keep record of which thread owns which resource and
+in the above case boost the priority of the background thread
+so that it will be scheduled and it can release the resource.
+This solution is called priority inheritance in real-time programming.
+A drawback of this solution is that the overhead of acquiring a mutex
+increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
+the thread has to call os_thread_get_curr_id.
+This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
+pair. Note that the thread
+cannot store the information in the resource, say mutex, itself,
+because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it is stored afterwards,
+the information is outdated for the time of one machine instruction,
+at least. (To be precise, the information could be stored to
+lock_word in mutex if the machine supports atomic swap.)
+
+The above solution with priority inheritance may become actual in the
+future, but at the moment we plan to implement a more coarse solution,
+which could be called a global priority inheritance. If a thread
+has to wait for a long time, say 300 milliseconds, for a resource,
+we just guess that it may be waiting for a resource owned by a background
+thread, and boost the priority of all runnable background threads
+to the normal level. The background threads then themselves adjust
+their fixed priority back to background after releasing all resources
+they had (or, at some fixed points in their program code).
+
+What is the performance of the global priority inheritance solution?
+We may weigh the length of the wait time, 300 milliseconds, during
+which the system processes some other thread,
+against the cost of boosting the priority of each runnable background
+thread, rescheduling it, and lowering the priority again.
+On a 100 MHz Pentium + NT this overhead may be on the order of 100
+microseconds per thread. So, if the number of runnable background
+threads is not very big, say < 100, the cost is tolerable.
+Utility threads probably will access resources used by
+user threads not very often, so collisions of user threads
+to preempted utility threads should not happen very often.
+
+The thread table contains
+information on the current status of each thread existing in the system,
+and also the event semaphores used in suspending the master thread
+and utility and parallel communication threads when they have nothing to do.
+The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation.
+
+The thread table is also used in the global priority inheritance
+scheme. This brings in one additional complication: threads accessing
+the thread table must have at least normal fixed priority,
+because the priority inheritance solution does not work if a background
+thread is preempted while possessing the mutex protecting the thread table.
+So, if a thread accesses the thread table, its priority has to be
+boosted at least to normal. This priority requirement can be seen as
+similar to the privileged mode used when processing kernel calls in
+traditional Unix. */
+
+/* Thread slot in the thread table */
+struct srv_slot_struct{
+ os_thread_id_t id; /*!< thread id */
+ os_thread_t handle; /*!< thread handle */
+ unsigned type:3; /*!< thread type: user, utility etc. */
+ unsigned in_use:1; /*!< TRUE if this slot is in use */
+ unsigned suspended:1; /*!< TRUE if the thread is waiting
+ for the event of this slot */
+ ib_time_t suspend_time; /*!< time when the thread was
+ suspended */
+ os_event_t event; /*!< event used in suspending the
+ thread when it has nothing to do */
+ que_thr_t* thr; /*!< suspended query thread (only
+ used for MySQL threads) */
+};
+
+/* Table for MySQL threads where they will be suspended to wait for locks */
+UNIV_INTERN srv_slot_t* srv_mysql_table = NULL;
+
+UNIV_INTERN os_event_t srv_lock_timeout_thread_event;
+
+UNIV_INTERN os_event_t srv_purge_thread_event;
+
+UNIV_INTERN srv_sys_t* srv_sys = NULL;
+
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte srv_pad1[64];
+/* mutex protecting the server, trx structs, query threads, and lock table */
+UNIV_INTERN mutex_t* kernel_mutex_temp;
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte srv_pad2[64];
+
+#if 0
+/* The following three values measure the urgency of the jobs of
+buffer, version, and insert threads. They may vary from 0 - 1000.
+The server mutex protects all these variables. The low-water values
+tell that the server can acquiesce the utility when the value
+drops below this low-water mark. */
+
+static ulint srv_meter[SRV_MASTER + 1];
+static ulint srv_meter_low_water[SRV_MASTER + 1];
+static ulint srv_meter_high_water[SRV_MASTER + 1];
+static ulint srv_meter_high_water2[SRV_MASTER + 1];
+static ulint srv_meter_foreground[SRV_MASTER + 1];
+#endif
+
+/* The following values give info about the activity going on in
+the database. They are protected by the server mutex. The arrays
+are indexed by the type of the thread. */
+
+UNIV_INTERN ulint srv_n_threads_active[SRV_MASTER + 1];
+UNIV_INTERN ulint srv_n_threads[SRV_MASTER + 1];
+
+/***********************************************************************
+Prints counters for work done by srv_master_thread. */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+ FILE *file) /* in: output stream */
+{
+ fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
+ "%lu 10_second, %lu background, %lu flush\n",
+ srv_main_1_second_loops, srv_main_sleeps,
+ srv_main_10_second_loops, srv_main_background_loops,
+ srv_main_flush_loops);
+ fprintf(file, "srv_master_thread log flush and writes: %lu\n",
+ srv_log_writes_and_flush);
+}
+
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+ ulint i, /*!< in: the 'segment' of the i/o thread */
+ const char* str) /*!< in: constant char string describing the
+ state */
+{
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+
+ srv_io_thread_op_info[i] = str;
+}
+
+/*********************************************************************//**
+Accessor function to get pointer to n'th slot in the server thread
+table.
+@return pointer to the slot */
+static
+srv_slot_t*
+srv_table_get_nth_slot(
+/*===================*/
+ ulint index) /*!< in: index of the slot */
+{
+ ut_a(index < OS_THREAD_MAX_N);
+
+ return(srv_sys->threads + index);
+}
+
+/*********************************************************************//**
+Gets the number of threads in the system.
+@return sum of srv_n_threads[] */
+UNIV_INTERN
+ulint
+srv_get_n_threads(void)
+/*===================*/
+{
+ ulint i;
+ ulint n_threads = 0;
+
+ mutex_enter(&kernel_mutex);
+
+ for (i = SRV_COM; i < SRV_MASTER + 1; i++) {
+
+ n_threads += srv_n_threads[i];
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(n_threads);
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current thread. Also creates the
+thread local storage struct for the current thread. NOTE! The server mutex
+has to be reserved by the caller!
+@return reserved slot index */
+static
+ulint
+srv_table_reserve_slot(
+/*===================*/
+ enum srv_thread_type type) /*!< in: type of the thread */
+{
+ srv_slot_t* slot;
+ ulint i;
+
+ ut_a(type > 0);
+ ut_a(type <= SRV_MASTER);
+
+ i = 0;
+ slot = srv_table_get_nth_slot(i);
+
+ while (slot->in_use) {
+ i++;
+ slot = srv_table_get_nth_slot(i);
+ }
+
+ ut_a(slot->in_use == FALSE);
+
+ slot->in_use = TRUE;
+ slot->suspended = FALSE;
+ slot->type = type;
+ slot->id = os_thread_get_curr_id();
+ slot->handle = os_thread_get_curr();
+
+ thr_local_create();
+
+ thr_local_set_slot_no(os_thread_get_curr_id(), i);
+
+ return(i);
+}
+
+/*********************************************************************//**
+Suspends the calling thread to wait for the event in its thread slot.
+NOTE! The server mutex has to be reserved by the caller!
+@return event for the calling thread to wait */
+static
+os_event_t
+srv_suspend_thread(void)
+/*====================*/
+{
+ srv_slot_t* slot;
+ os_event_t event;
+ ulint slot_no;
+ enum srv_thread_type type;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ slot_no = thr_local_get_slot_no(os_thread_get_curr_id());
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Suspending thread %lu to slot %lu\n",
+ (ulong) os_thread_get_curr_id(), (ulong) slot_no);
+ }
+
+ slot = srv_table_get_nth_slot(slot_no);
+
+ type = slot->type;
+
+ ut_ad(type >= SRV_WORKER);
+ ut_ad(type <= SRV_MASTER);
+
+ event = slot->event;
+
+ slot->suspended = TRUE;
+
+ ut_ad(srv_n_threads_active[type] > 0);
+
+ srv_n_threads_active[type]--;
+
+ os_event_reset(event);
+
+ return(event);
+}
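The function above only reserves the slot and resets the event; the caller is expected to release kernel_mutex and then block on the returned event until srv_release_threads() (below) sets it. A sketch of that protocol (illustrative; the real callers live later in srv0srv.c, outside this hunk):

	os_event_t	event;

	mutex_enter(&kernel_mutex);
	event = srv_suspend_thread();	/* marks the slot suspended, resets event */
	mutex_exit(&kernel_mutex);

	os_event_wait(event);		/* sleep until srv_release_threads() sets it */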
+
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+ enum srv_thread_type type, /*!< in: thread type */
+ ulint n) /*!< in: number of threads to release */
+{
+ srv_slot_t* slot;
+ ulint i;
+ ulint count = 0;
+
+ ut_ad(type >= SRV_WORKER);
+ ut_ad(type <= SRV_MASTER);
+ ut_ad(n > 0);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_table_get_nth_slot(i);
+
+ if (slot->in_use && slot->type == type && slot->suspended) {
+
+ slot->suspended = FALSE;
+
+ srv_n_threads_active[type]++;
+
+ os_event_set(slot->event);
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Releasing thread %lu type %lu"
+ " from slot %lu\n",
+ (ulong) slot->id, (ulong) type,
+ (ulong) i);
+ }
+
+ count++;
+
+ if (count == n) {
+ break;
+ }
+ }
+ }
+
+ return(count);
+}
+
+/*********************************************************************//**
+Returns the calling thread type.
+@return SRV_COM, ... */
+UNIV_INTERN
+enum srv_thread_type
+srv_get_thread_type(void)
+/*=====================*/
+{
+ ulint slot_no;
+ srv_slot_t* slot;
+ enum srv_thread_type type;
+
+ mutex_enter(&kernel_mutex);
+
+ slot_no = thr_local_get_slot_no(os_thread_get_curr_id());
+
+ slot = srv_table_get_nth_slot(slot_no);
+
+ type = slot->type;
+
+ ut_ad(type >= SRV_WORKER);
+ ut_ad(type <= SRV_MASTER);
+
+ mutex_exit(&kernel_mutex);
+
+ return(type);
+}
+
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void)
+/*==========*/
+{
+ srv_conc_slot_t* conc_slot;
+ srv_slot_t* slot;
+ ulint i;
+
+ srv_sys = mem_alloc(sizeof(srv_sys_t));
+
+ kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
+ mutex_create(&kernel_mutex, SYNC_KERNEL);
+
+ mutex_create(&srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+
+ srv_sys->threads = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_table_get_nth_slot(i);
+ slot->in_use = FALSE;
+ slot->type=0; /* Avoid purify errors */
+ slot->event = os_event_create(NULL);
+ ut_a(slot->event);
+ }
+
+ srv_mysql_table = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_mysql_table + i;
+ slot->in_use = FALSE;
+ slot->type = 0;
+ slot->event = os_event_create(NULL);
+ ut_a(slot->event);
+ }
+
+ srv_lock_timeout_thread_event = os_event_create(NULL);
+ srv_purge_thread_event = os_event_create(NULL);
+
+ for (i = 0; i < SRV_MASTER + 1; i++) {
+ srv_n_threads_active[i] = 0;
+ srv_n_threads[i] = 0;
+#if 0
+ srv_meter[i] = 30;
+ srv_meter_low_water[i] = 50;
+ srv_meter_high_water[i] = 100;
+ srv_meter_high_water2[i] = 200;
+ srv_meter_foreground[i] = 250;
+#endif
+ }
+
+ UT_LIST_INIT(srv_sys->tasks);
+
+ /* Create dummy indexes for infimum and supremum records */
+
+ dict_ind_init();
+
+ /* Init the server concurrency restriction data structures */
+
+ os_fast_mutex_init(&srv_conc_mutex);
+
+ UT_LIST_INIT(srv_conc_queue);
+
+ srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ conc_slot = srv_conc_slots + i;
+ conc_slot->reserved = FALSE;
+ conc_slot->event = os_event_create(NULL);
+ ut_a(conc_slot->event);
+ }
+
+ /* Initialize some INFORMATION SCHEMA internal structures */
+ trx_i_s_cache_init(trx_i_s_cache);
+}
+
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void)
+/*==========*/
+{
+ os_fast_mutex_free(&srv_conc_mutex);
+ mem_free(srv_conc_slots);
+ srv_conc_slots = NULL;
+
+ mem_free(srv_sys->threads);
+ mem_free(srv_sys);
+ srv_sys = NULL;
+
+ mem_free(kernel_mutex_temp);
+ kernel_mutex_temp = NULL;
+ mem_free(srv_mysql_table);
+ srv_mysql_table = NULL;
+
+ trx_i_s_cache_free(trx_i_s_cache);
+}
+
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void)
+/*==================*/
+{
+ ut_mem_init();
+ /* Reset the system variables in the recovery module. */
+ recv_sys_var_init();
+ os_sync_init();
+ sync_init();
+ mem_init(srv_mem_pool_size);
+ thr_local_init();
+}
+
+/*======================= InnoDB Server FIFO queue =======================*/
+
+/* Maximum allowable purge history length. <=0 means 'infinite'. */
+UNIV_INTERN ulong srv_max_purge_lag = 0;
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+
+#ifdef HAVE_ATOMIC_BUILTINS
+static void
+enter_innodb_with_tickets(trx_t* trx)
+{
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+ return;
+}
+
+static void
+srv_conc_enter_innodb_timer_based(trx_t* trx)
+{
+ lint conc_n_threads;
+ ibool has_yielded = FALSE;
+ ulint has_slept = 0;
+
+ if (trx->declared_to_be_inside_innodb) {
+ ut_print_timestamp(stderr);
+ fputs(
+" InnoDB: Error: trying to declare trx to enter InnoDB, but\n"
+"InnoDB: it already is declared.\n", stderr);
+ trx_print(stderr, trx, 0);
+ putc('\n', stderr);
+ }
+retry:
+ if (srv_conc_n_threads < (lint) srv_thread_concurrency) {
+ conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
+ if (conc_n_threads <= (lint) srv_thread_concurrency) {
+ enter_innodb_with_tickets(trx);
+ return;
+ }
+ os_atomic_increment_lint(&srv_conc_n_threads, -1);
+ }
+ if (!has_yielded)
+ {
+ has_yielded = TRUE;
+ os_thread_yield();
+ goto retry;
+ }
+ if (trx->has_search_latch
+ || NULL != UT_LIST_GET_FIRST(trx->trx_locks)) {
+
+ conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
+ enter_innodb_with_tickets(trx);
+ return;
+ }
+ if (has_slept < 2)
+ {
+ trx->op_info = "sleeping before entering InnoDB";
+ os_thread_sleep(10000);
+ trx->op_info = "";
+ has_slept++;
+ }
+ conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
+ enter_innodb_with_tickets(trx);
+ return;
+}
+
+static void
+srv_conc_exit_innodb_timer_based(trx_t* trx)
+{
+ os_atomic_increment_lint(&srv_conc_n_threads, -1);
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+ return;
+}
+#endif
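The timer-based path above admits threads without ever taking srv_conc_mutex: it optimistically increments the shared counter and decrements it again if the limit was overshot. A small standalone analogue of that admission pattern using C11 atomics (illustrative only, not InnoDB code; os_atomic_increment_lint plays the role of atomic_fetch_add/atomic_fetch_sub here):

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_long	n_inside;		/* cf. srv_conc_n_threads     */
	static long		concurrency = 8;	/* cf. srv_thread_concurrency */

	/* Optimistic admission: increment first, undo if we overshot. */
	static bool
	try_enter(void)
	{
		if (atomic_load(&n_inside) < concurrency) {
			long	n = atomic_fetch_add(&n_inside, 1) + 1;

			if (n <= concurrency) {
				return(true);			/* admitted */
			}
			atomic_fetch_sub(&n_inside, 1);		/* overshot: back off */
		}
		return(false);		/* caller yields or sleeps, then retries */
	}

	static void
	leave(void)
	{
		atomic_fetch_sub(&n_inside, 1);
	}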
+
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+ ibool has_slept = FALSE;
+ srv_conc_slot_t* slot = NULL;
+ ulint i;
+ ib_uint64_t start_time = 0L;
+ ib_uint64_t finish_time = 0L;
+ ulint sec;
+ ulint ms;
+
+ if (trx->mysql_thd != NULL
+ && thd_is_replication_slave_thread(trx->mysql_thd)) {
+
+ UT_WAIT_FOR(srv_conc_n_threads
+ < (lint)srv_thread_concurrency,
+ srv_replication_delay * 1000);
+
+ return;
+ }
+
+ /* If trx has 'free tickets' to enter the engine left, then use one
+ such ticket */
+
+ if (trx->n_tickets_to_enter_innodb > 0) {
+ trx->n_tickets_to_enter_innodb--;
+
+ return;
+ }
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ if (srv_thread_concurrency_timer_based) {
+ srv_conc_enter_innodb_timer_based(trx);
+ return;
+ }
+#endif
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+retry:
+ if (trx->declared_to_be_inside_innodb) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: trying to declare trx"
+ " to enter InnoDB, but\n"
+ "InnoDB: it already is declared.\n", stderr);
+ trx_print(stderr, trx, 0);
+ putc('\n', stderr);
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ ut_ad(srv_conc_n_threads >= 0);
+
+ if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
+
+ srv_conc_n_threads++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ /* If the transaction is not holding resources, let it sleep
+ for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */
+
+ if (!has_slept && !trx->has_search_latch
+ && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {
+
+ has_slept = TRUE; /* We let it sleep only once to avoid
+ starvation */
+
+ srv_conc_n_waiting_threads++;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ trx->op_info = "sleeping before joining InnoDB queue";
+
+ /* Peter Zaitsev suggested that we take the sleep away
+ altogether. But the sleep may be good in pathological
+ situations of lots of thread switches. Simply put some
+ threads aside for a while to reduce the number of thread
+ switches. */
+ if (SRV_THREAD_SLEEP_DELAY > 0) {
+ os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
+ trx->innodb_que_wait_timer += SRV_THREAD_SLEEP_DELAY;
+ }
+
+ trx->op_info = "";
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_waiting_threads--;
+
+ goto retry;
+ }
+
+ /* Too many threads inside: put the current thread to a queue */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+ slot = srv_conc_slots + i;
+
+ if (!slot->reserved) {
+
+ break;
+ }
+ }
+
+ if (i == OS_THREAD_MAX_N) {
+ /* Could not find a free wait slot, we must let the
+ thread enter */
+
+ srv_conc_n_threads++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ return;
+ }
+
+ /* Release the search system latch, if this thread holds one */
+ if (trx->has_search_latch) {
+ trx_search_latch_release_if_reserved(trx);
+ }
+
+ /* Add to the queue */
+ slot->reserved = TRUE;
+ slot->wait_ended = FALSE;
+
+ UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
+
+ os_event_reset(slot->event);
+
+ srv_conc_n_waiting_threads++;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ /* Go to wait for the event; when a thread leaves InnoDB it will
+ release this thread */
+
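+ /* If the slow query log is enabled and this transaction
+ collects per-query statistics, time the wait in the queue so
+ it can be accumulated into trx->innodb_que_wait_timer. */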
+ if (innobase_get_slow_log() && trx->take_stats) {
+ ut_usectime(&sec, &ms);
+ start_time = (ib_uint64_t)sec * 1000000 + ms;
+ } else {
+ start_time = 0;
+ }
+
+ trx->op_info = "waiting in InnoDB queue";
+
+ os_event_wait(slot->event);
+
+ trx->op_info = "";
+
+ if (innobase_get_slow_log() && trx->take_stats && start_time) {
+ ut_usectime(&sec, &ms);
+ finish_time = (ib_uint64_t)sec * 1000000 + ms;
+ trx->innodb_que_wait_timer += (ulint)(finish_time - start_time);
+ }
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_waiting_threads--;
+
+ /* NOTE that the thread which released this thread already
+ incremented the thread counter on behalf of this thread */
+
+ slot->reserved = FALSE;
+
+ UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
+
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+}
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+ if (UNIV_LIKELY(!srv_thread_concurrency)) {
+
+ return;
+ }
+
+ ut_ad(srv_conc_n_threads >= 0);
+#ifdef HAVE_ATOMIC_BUILTINS
+ if (srv_thread_concurrency_timer_based) {
+ os_atomic_increment_lint(&srv_conc_n_threads, 1);
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = 1;
+ return;
+ }
+#endif
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_threads++;
+ trx->declared_to_be_inside_innodb = TRUE;
+ trx->n_tickets_to_enter_innodb = 1;
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+}
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+ srv_conc_slot_t* slot = NULL;
+
+ if (trx->mysql_thd != NULL
+ && thd_is_replication_slave_thread(trx->mysql_thd)) {
+
+ return;
+ }
+
+ if (trx->declared_to_be_inside_innodb == FALSE) {
+
+ return;
+ }
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ if (srv_thread_concurrency_timer_based) {
+ srv_conc_exit_innodb_timer_based(trx);
+ return;
+ }
+#endif
+
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ ut_ad(srv_conc_n_threads > 0);
+ srv_conc_n_threads--;
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
+ /* Look for a slot where a thread is waiting and no other
+ thread has yet released the thread */
+
+ slot = UT_LIST_GET_FIRST(srv_conc_queue);
+
+ while (slot && slot->wait_ended == TRUE) {
+ slot = UT_LIST_GET_NEXT(srv_conc_queue, slot);
+ }
+
+ if (slot != NULL) {
+ slot->wait_ended = TRUE;
+
+ /* We increment the count on behalf of the released
+ thread */
+
+ srv_conc_n_threads++;
+ }
+ }
+
+ os_fast_mutex_unlock(&srv_conc_mutex);
+
+ if (slot != NULL) {
+ os_event_set(slot->event);
+ }
+}
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB. */
+UNIV_INTERN
+void
+srv_conc_exit_innodb(
+/*=================*/
+ trx_t* trx) /*!< in: transaction object associated with the
+ thread */
+{
+ if (trx->n_tickets_to_enter_innodb > 0) {
+ /* We will pretend the thread is still inside InnoDB though it
+ now leaves the InnoDB engine. In this way we save
+ a lot of semaphore operations. srv_conc_force_exit_innodb is
+ used to declare the thread definitely outside InnoDB. It
+ should be called when there is a lock wait or an SQL statement
+ ends. */
+
+ return;
+ }
+
+ srv_conc_force_exit_innodb(trx);
+}
+
+/*========================================================================*/
+
+/*********************************************************************//**
+Normalizes init parameter values to use units we use inside InnoDB.
+@return DB_SUCCESS or error code */
+static
+ulint
+srv_normalize_init_values(void)
+/*===========================*/
+{
+ ulint n;
+ ulint i;
+
+ n = srv_n_data_files;
+
+ for (i = 0; i < n; i++) {
+ srv_data_file_sizes[i] = srv_data_file_sizes[i]
+ * ((1024 * 1024) / UNIV_PAGE_SIZE);
+ }
+
+ srv_last_file_size_max = srv_last_file_size_max
+ * ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+ srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
+
+ srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
+
+ srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Boots the InnoDB server.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+srv_boot(void)
+/*==========*/
+{
+ ulint err;
+
+ /* Transform the init parameter values given by MySQL to
+ use units we use inside InnoDB: */
+
+ err = srv_normalize_init_values();
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Initialize synchronization primitives, memory management, and thread
+ local storage */
+
+ srv_general_init();
+
+ /* Initialize this module */
+
+ srv_init();
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Reserves a slot in the thread table for the current MySQL OS thread.
+NOTE! The kernel mutex has to be reserved by the caller!
+@return reserved slot */
+static
+srv_slot_t*
+srv_table_reserve_slot_for_mysql(void)
+/*==================================*/
+{
+ srv_slot_t* slot;
+ ulint i;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ i = 0;
+ slot = srv_mysql_table + i;
+
+ while (slot->in_use) {
+ i++;
+
+ if (i >= OS_THREAD_MAX_N) {
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: There appear to be %lu MySQL"
+ " threads currently waiting\n"
+ "InnoDB: inside InnoDB, which is the"
+ " upper limit. Cannot continue operation.\n"
+ "InnoDB: We intentionally generate"
+ " a seg fault to print a stack trace\n"
+ "InnoDB: on Linux. But first we print"
+ " a list of waiting threads.\n", (ulong) i);
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ fprintf(stderr,
+ "Slot %lu: thread id %lu, type %lu,"
+ " in use %lu, susp %lu, time %lu\n",
+ (ulong) i,
+ (ulong) os_thread_pf(slot->id),
+ (ulong) slot->type,
+ (ulong) slot->in_use,
+ (ulong) slot->suspended,
+ (ulong) difftime(ut_time(),
+ slot->suspend_time));
+ }
+
+ ut_error;
+ }
+
+ slot = srv_mysql_table + i;
+ }
+
+ ut_a(slot->in_use == FALSE);
+
+ slot->in_use = TRUE;
+ slot->id = os_thread_get_curr_id();
+ slot->handle = os_thread_get_curr();
+
+ return(slot);
+}
+
+/***************************************************************//**
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait, the trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+srv_suspend_mysql_thread(
+/*=====================*/
+ que_thr_t* thr) /*!< in: query thread associated with the MySQL
+ OS thread */
+{
+ srv_slot_t* slot;
+ os_event_t event;
+ double wait_time;
+ trx_t* trx;
+ ulint had_dict_lock;
+ ibool was_declared_inside_innodb = FALSE;
+ ib_int64_t start_time = 0;
+ ib_int64_t finish_time;
+ ulint diff_time;
+ ulint sec;
+ ulint ms;
+ ulong lock_wait_timeout;
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ trx = thr_get_trx(thr);
+
+ os_event_set(srv_lock_timeout_thread_event);
+
+ mutex_enter(&kernel_mutex);
+
+ trx->error_state = DB_SUCCESS;
+
+ if (thr->state == QUE_THR_RUNNING) {
+
+ ut_ad(thr->is_active == TRUE);
+
+ /* The lock has already been released or this transaction
+ was chosen as a deadlock victim: no need to suspend */
+
+ if (trx->was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+
+ ut_ad(thr->is_active == FALSE);
+
+ slot = srv_table_reserve_slot_for_mysql();
+
+ event = slot->event;
+
+ slot->thr = thr;
+
+ os_event_reset(event);
+
+ slot->suspend_time = ut_time();
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ srv_n_lock_wait_count++;
+ srv_n_lock_wait_current_count++;
+
+ if (ut_usectime(&sec, &ms) == -1) {
+ start_time = -1;
+ } else {
+ start_time = (ib_int64_t) sec * 1000000 + ms;
+ }
+ }
+ /* Wake the lock timeout monitor thread, if it is suspended */
+
+ os_event_set(srv_lock_timeout_thread_event);
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->declared_to_be_inside_innodb) {
+
+ was_declared_inside_innodb = TRUE;
+
+ /* We must declare this OS thread to exit InnoDB, since
+ another thread possibly holding a lock which this thread
+ waits for must be allowed to enter, sooner or later */
+
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ had_dict_lock = trx->dict_operation_lock_mode;
+
+ switch (had_dict_lock) {
+ case RW_S_LATCH:
+ /* Release foreign key check latch */
+ row_mysql_unfreeze_data_dictionary(trx);
+ break;
+ case RW_X_LATCH:
+ /* Release fast index creation latch */
+ row_mysql_unlock_data_dictionary(trx);
+ break;
+ }
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ /* Suspend this thread and wait for the event. */
+
+ os_event_wait(event);
+
+ /* After resuming, reacquire the data dictionary latch if
+ necessary. */
+
+ switch (had_dict_lock) {
+ case RW_S_LATCH:
+ row_mysql_freeze_data_dictionary(trx);
+ break;
+ case RW_X_LATCH:
+ row_mysql_lock_data_dictionary(trx);
+ break;
+ }
+
+ if (was_declared_inside_innodb) {
+
+ /* Return back inside InnoDB */
+
+ srv_conc_force_enter_innodb(trx);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ /* Release the slot for others to use */
+
+ slot->in_use = FALSE;
+
+ wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+ if (thr->lock_state == QUE_THR_LOCK_ROW) {
+ if (ut_usectime(&sec, &ms) == -1) {
+ finish_time = -1;
+ } else {
+ finish_time = (ib_int64_t) sec * 1000000 + ms;
+ }
+
+ diff_time = (ulint) (finish_time - start_time);
+
+ srv_n_lock_wait_current_count--;
+ srv_n_lock_wait_time = srv_n_lock_wait_time + diff_time;
+ if (diff_time > srv_n_lock_max_wait_time &&
+ /* only update the variable if we successfully
+ retrieved the start and finish times. See Bug#36819. */
+ start_time != -1 && finish_time != -1) {
+ srv_n_lock_max_wait_time = diff_time;
+ }
+ }
+
+ if (trx->was_chosen_as_deadlock_victim) {
+
+ trx->error_state = DB_DEADLOCK;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ /* InnoDB system transactions (such as the purge, and
+ incomplete transactions that are being rolled back after crash
+ recovery) will use the global value of
+ innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+ lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);
+
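+ /* A lock_wait_timeout of 100000000 seconds or more is
+ treated as infinite: the timeout check below is skipped. */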
+ if (lock_wait_timeout < 100000000
+ && wait_time > (double) lock_wait_timeout) {
+
+ trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+ }
+
+ if (trx_is_interrupted(trx)) {
+
+ trx->error_state = DB_INTERRUPTED;
+ }
+}
+
+/********************************************************************//**
+Releases a MySQL OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+ que_thr_t* thr) /*!< in: query thread associated with the
+ MySQL OS thread */
+{
+ srv_slot_t* slot;
+ ulint i;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ if (slot->in_use && slot->thr == thr) {
+ /* Found */
+
+ os_event_set(slot->event);
+
+ return;
+ }
+ }
+
+ /* not found */
+}
+
+/******************************************************************//**
+Refreshes the values used to calculate per-second averages. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ srv_last_monitor_time = time(NULL);
+
+ os_aio_refresh_stats();
+
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ log_refresh_stats();
+
+ buf_refresh_io_stats();
+
+ srv_n_rows_inserted_old = srv_n_rows_inserted;
+ srv_n_rows_updated_old = srv_n_rows_updated;
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+ srv_n_rows_read_old = srv_n_rows_read;
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+ FILE* file, /*!< in: output stream */
+ ibool nowait, /*!< in: whether to wait for kernel mutex */
+ ulint* trx_start, /*!< out: file position of the start of
+ the list of active transactions */
+ ulint* trx_end) /*!< out: file position of the end of
+ the list of active transactions */
+{
+ double time_elapsed;
+ time_t current_time;
+ ulint n_reserved;
+ ibool ret;
+
+ ulint btr_search_sys_subtotal;
+ ulint lock_sys_subtotal;
+ ulint recv_sys_subtotal;
+
+ ulint i;
+ trx_t* trx;
+
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+
+ /* We add 0.001 seconds to time_elapsed to prevent division
+ by zero if two users happen to call SHOW INNODB STATUS at the same
+ time */
+
+ time_elapsed = difftime(current_time, srv_last_monitor_time)
+ + 0.001;
+
+ srv_last_monitor_time = time(NULL);
+
+ fputs("\n=====================================\n", file);
+
+ ut_print_timestamp(file);
+ fprintf(file,
+ " INNODB MONITOR OUTPUT\n"
+ "=====================================\n"
+ "Per second averages calculated from the last %lu seconds\n",
+ (ulong)time_elapsed);
+
+ fputs("-----------------\n"
+ "BACKGROUND THREAD\n"
+ "-----------------\n", file);
+ srv_print_master_thread_info(file);
+
+ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
+ sync_print(file);
+
+ /* Conceptually, srv_innodb_monitor_mutex has a very high latching
+ order level in sync0sync.h, while dict_foreign_err_mutex has a very
+ low level 135. Therefore we can reserve the latter mutex here without
+ a danger of a deadlock of threads. */
+
+ mutex_enter(&dict_foreign_err_mutex);
+
+ if (ftell(dict_foreign_err_file) != 0L) {
+ fputs("------------------------\n"
+ "LATEST FOREIGN KEY ERROR\n"
+ "------------------------\n", file);
+ ut_copy_file(file, dict_foreign_err_file);
+ }
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+ os_aio_print(file);
+
+ fputs("-------------------------------------\n"
+ "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+ "-------------------------------------\n", file);
+ ibuf_print(file);
+
+ ha_print_info(file, btr_search_sys->hash_index);
+
+ fprintf(file,
+ "%.2f hash searches/s, %.2f non-hash searches/s\n",
+ (btr_cur_n_sea - btr_cur_n_sea_old)
+ / time_elapsed,
+ (btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+ / time_elapsed);
+ btr_cur_n_sea_old = btr_cur_n_sea;
+ btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+ fputs("---\n"
+ "LOG\n"
+ "---\n", file);
+ log_print(file);
+
+ fputs("----------------------\n"
+ "BUFFER POOL AND MEMORY\n"
+ "----------------------\n", file);
+ fprintf(file,
+ "Total memory allocated " ULINTPF
+ "; in additional pool allocated " ULINTPF "\n",
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+ /* Calculate reserved memory */
+ if (btr_search_sys && btr_search_sys->hash_index->heap) {
+ btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap);
+ } else {
+ btr_search_sys_subtotal = 0;
+ for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) {
+ btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]);
+ }
+ }
+
+ lock_sys_subtotal = 0;
+ if (trx_sys) {
+ mutex_enter(&kernel_mutex);
+ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+ while (trx) {
+ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+ }
+ mutex_exit(&kernel_mutex);
+ }
+
+ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
+ ? mem_heap_get_size(recv_sys->heap) : 0);
+
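+ /* Each internal hash table below is reported as the sum of a
+ constant factor (the hash array itself) and a variable factor
+ (the heaps or nodes currently attached to it). */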
+ fprintf(file,
+ "Internal hash tables (constant factor + variable factor)\n"
+ " Adaptive hash index %lu \t(%lu + %lu)\n"
+ " Page hash %lu\n"
+ " Dictionary cache %lu \t(%lu + %lu)\n"
+ " File system %lu \t(%lu + %lu)\n"
+ " Lock system %lu \t(%lu + %lu)\n"
+ " Recovery system %lu \t(%lu + %lu)\n"
+ " Threads %lu \t(%lu + %lu)\n",
+
+ (ulong) (btr_search_sys
+ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0)
+ + btr_search_sys_subtotal,
+ (ulong) (btr_search_sys
+ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0),
+ (ulong) btr_search_sys_subtotal,
+
+ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)),
+
+ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+ + dict_sys->table_id_hash->n_cells
+ ) * sizeof(hash_cell_t)
+ + dict_sys->size) : 0),
+ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+ + dict_sys->table_id_hash->n_cells
+ ) * sizeof(hash_cell_t)) : 0),
+ (ulong) (dict_sys ? (dict_sys->size) : 0),
+
+ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
+ + fil_system_hash_nodes()),
+ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
+ (ulong) fil_system_hash_nodes(),
+
+ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
+ + lock_sys_subtotal),
+ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
+ (ulong) lock_sys_subtotal,
+
+ (ulong) (((recv_sys && recv_sys->addr_hash)
+ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
+ + recv_sys_subtotal),
+ (ulong) ((recv_sys && recv_sys->addr_hash)
+ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
+ (ulong) recv_sys_subtotal,
+
+ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)
+ + thr_local_hash_nodes()),
+ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)),
+ (ulong) thr_local_hash_nodes());
+
+ fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
+ dict_sys->size);
+
+ buf_print_io(file);
+
+ fputs("--------------\n"
+ "ROW OPERATIONS\n"
+ "--------------\n", file);
+ fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
+ (long) srv_conc_n_threads,
+ (ulong) srv_conc_n_waiting_threads);
+
+ fprintf(file, "%lu read views open inside InnoDB\n",
+ UT_LIST_GET_LEN(trx_sys->view_list));
+
+ n_reserved = fil_space_get_n_reserved_extents(0);
+ if (n_reserved > 0) {
+ fprintf(file,
+ "%lu tablespace extents now reserved for"
+ " B-tree split operations\n",
+ (ulong) n_reserved);
+ }
+
+#ifdef UNIV_LINUX
+ fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
+ (ulong) srv_main_thread_process_no,
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
+#else
+ fprintf(file, "Main thread id %lu, state: %s\n",
+ (ulong) srv_main_thread_id,
+ srv_main_thread_op_info);
+#endif
+ fprintf(file,
+ "Number of rows inserted " ULINTPF
+ ", updated " ULINTPF ", deleted " ULINTPF
+ ", read " ULINTPF "\n",
+ srv_n_rows_inserted,
+ srv_n_rows_updated,
+ srv_n_rows_deleted,
+ srv_n_rows_read);
+ fprintf(file,
+ "%.2f inserts/s, %.2f updates/s,"
+ " %.2f deletes/s, %.2f reads/s\n",
+ (srv_n_rows_inserted - srv_n_rows_inserted_old)
+ / time_elapsed,
+ (srv_n_rows_updated - srv_n_rows_updated_old)
+ / time_elapsed,
+ (srv_n_rows_deleted - srv_n_rows_deleted_old)
+ / time_elapsed,
+ (srv_n_rows_read - srv_n_rows_read_old)
+ / time_elapsed);
+
+ srv_n_rows_inserted_old = srv_n_rows_inserted;
+ srv_n_rows_updated_old = srv_n_rows_updated;
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+ srv_n_rows_read_old = srv_n_rows_read;
+
+ /* Only if lock_print_info_summary proceeds correctly do
+ we call lock_print_info_all_transactions below to print
+ all the lock information. */
+ ret = lock_print_info_summary(file, nowait);
+
+ if (ret) {
+ if (trx_start) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_start = ULINT_UNDEFINED;
+ } else {
+ *trx_start = (ulint) t;
+ }
+ }
+ lock_print_info_all_transactions(file);
+ if (trx_end) {
+ long t = ftell(file);
+ if (t < 0) {
+ *trx_end = ULINT_UNDEFINED;
+ } else {
+ *trx_end = (ulint) t;
+ }
+ }
+ }
+
+ fputs("----------------------------\n"
+ "END OF INNODB MONITOR OUTPUT\n"
+ "============================\n", file);
+ mutex_exit(&srv_innodb_monitor_mutex);
+ fflush(file);
+
+ return(ret);
+}
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void)
+/*==========================*/
+{
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ export_vars.innodb_data_pending_reads
+ = os_n_pending_reads;
+ export_vars.innodb_data_pending_writes
+ = os_n_pending_writes;
+ export_vars.innodb_data_pending_fsyncs
+ = fil_n_pending_log_flushes
+ + fil_n_pending_tablespace_flushes;
+ export_vars.innodb_data_fsyncs = os_n_fsyncs;
+ export_vars.innodb_data_read = srv_data_read;
+ export_vars.innodb_data_reads = os_n_file_reads;
+ export_vars.innodb_data_writes = os_n_file_writes;
+ export_vars.innodb_data_written = srv_data_written;
+ export_vars.innodb_dict_tables = (dict_sys ? UT_LIST_GET_LEN(dict_sys->table_LRU) : 0);
+ export_vars.innodb_buffer_pool_read_requests = buf_pool->stat.n_page_gets;
+ export_vars.innodb_buffer_pool_write_requests
+ = srv_buf_pool_write_requests;
+ export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
+ export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
+ export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
+ export_vars.innodb_buffer_pool_read_ahead
+ = buf_pool->stat.n_ra_pages_read;
+ export_vars.innodb_buffer_pool_read_ahead_evicted
+ = buf_pool->stat.n_ra_pages_evicted;
+ export_vars.innodb_buffer_pool_pages_data
+ = UT_LIST_GET_LEN(buf_pool->LRU);
+ export_vars.innodb_buffer_pool_pages_dirty
+ = UT_LIST_GET_LEN(buf_pool->flush_list);
+ export_vars.innodb_buffer_pool_pages_free
+ = UT_LIST_GET_LEN(buf_pool->free);
+ export_vars.innodb_deadlocks
+ = srv_n_lock_deadlock_count;
+#ifdef UNIV_DEBUG
+ export_vars.innodb_buffer_pool_pages_latched
+ = buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+ export_vars.innodb_buffer_pool_pages_total = buf_pool->curr_size;
+
+ export_vars.innodb_buffer_pool_pages_misc = buf_pool->curr_size
+ - UT_LIST_GET_LEN(buf_pool->LRU)
+ - UT_LIST_GET_LEN(buf_pool->free);
+#ifdef HAVE_ATOMIC_BUILTINS
+ export_vars.innodb_have_atomic_builtins = 1;
+#else
+ export_vars.innodb_have_atomic_builtins = 0;
+#endif
+ export_vars.innodb_page_size = UNIV_PAGE_SIZE;
+ export_vars.innodb_log_waits = srv_log_waits;
+ export_vars.innodb_os_log_written = srv_os_log_written;
+ export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
+ export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
+ export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes;
+ export_vars.innodb_log_write_requests = srv_log_write_requests;
+ export_vars.innodb_log_writes = srv_log_writes;
+ export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
+ export_vars.innodb_dblwr_writes = srv_dblwr_writes;
+ export_vars.innodb_pages_created = buf_pool->stat.n_pages_created;
+ export_vars.innodb_pages_read = buf_pool->stat.n_pages_read;
+ export_vars.innodb_pages_written = buf_pool->stat.n_pages_written;
+ export_vars.innodb_row_lock_waits = srv_n_lock_wait_count;
+ export_vars.innodb_row_lock_current_waits
+ = srv_n_lock_wait_current_count;
+ export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000;
+ if (srv_n_lock_wait_count > 0) {
+ export_vars.innodb_row_lock_time_avg = (ulint)
+ (srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count);
+ } else {
+ export_vars.innodb_row_lock_time_avg = 0;
+ }
+ export_vars.innodb_row_lock_time_max
+ = srv_n_lock_max_wait_time / 1000;
+ export_vars.innodb_rows_read = srv_n_rows_read;
+ export_vars.innodb_rows_inserted = srv_n_rows_inserted;
+ export_vars.innodb_rows_updated = srv_n_rows_updated;
+ export_vars.innodb_rows_deleted = srv_n_rows_deleted;
+
+ mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_monitor_thread(
+/*===============*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ double time_elapsed;
+ time_t current_time;
+ time_t last_table_monitor_time;
+ time_t last_tablespace_monitor_time;
+ time_t last_monitor_time;
+ ulint mutex_skipped;
+ ibool last_srv_print_monitor;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Lock timeout thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ UT_NOT_USED(arg);
+ srv_last_monitor_time = time(NULL);
+ last_table_monitor_time = time(NULL);
+ last_tablespace_monitor_time = time(NULL);
+ last_monitor_time = time(NULL);
+ mutex_skipped = 0;
+ last_srv_print_monitor = srv_print_innodb_monitor;
+loop:
+ srv_monitor_active = TRUE;
+
+ /* Wake up every 5 seconds to see if we need to print
+ monitor information. */
+
+ os_thread_sleep(5000000);
+
+ current_time = time(NULL);
+
+ time_elapsed = difftime(current_time, last_monitor_time);
+
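+ /* Although this thread wakes up every 5 seconds, the monitor
+ output is produced at most once per 15 seconds. */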
+ if (time_elapsed > 15) {
+ last_monitor_time = time(NULL);
+
+ if (srv_print_innodb_monitor) {
+ /* Reset the mutex_skipped counter every time
+ srv_print_innodb_monitor changes. This is to
+ ensure we will not be blocked by kernel_mutex
+ for short-duration information printing,
+ such as that requested by sync_array_print_long_waits() */
+ if (!last_srv_print_monitor) {
+ mutex_skipped = 0;
+ last_srv_print_monitor = TRUE;
+ }
+
+ if (!srv_printf_innodb_monitor(stderr,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ /* Reset the counter */
+ mutex_skipped = 0;
+ }
+ } else {
+ last_srv_print_monitor = FALSE;
+ }
+
+
+ if (srv_innodb_status) {
+ mutex_enter(&srv_monitor_file_mutex);
+ rewind(srv_monitor_file);
+ if (!srv_printf_innodb_monitor(srv_monitor_file,
+ MUTEX_NOWAIT(mutex_skipped),
+ NULL, NULL)) {
+ mutex_skipped++;
+ } else {
+ mutex_skipped = 0;
+ }
+
+ os_file_set_eof(srv_monitor_file);
+ mutex_exit(&srv_monitor_file_mutex);
+ }
+
+ if (srv_print_innodb_tablespace_monitor
+ && difftime(current_time,
+ last_tablespace_monitor_time) > 60) {
+ last_tablespace_monitor_time = time(NULL);
+
+ fputs("========================"
+ "========================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+ "========================"
+ "========================\n",
+ stderr);
+
+ fsp_print(0);
+ fputs("Validating tablespace\n", stderr);
+ fsp_validate(0);
+ fputs("Validation ok\n"
+ "---------------------------------------\n"
+ "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+ "=======================================\n",
+ stderr);
+ }
+
+ if (srv_print_innodb_table_monitor
+ && difftime(current_time, last_table_monitor_time) > 60) {
+
+ last_table_monitor_time = time(NULL);
+
+ fputs("===========================================\n",
+ stderr);
+
+ ut_print_timestamp(stderr);
+
+ fputs(" INNODB TABLE MONITOR OUTPUT\n"
+ "===========================================\n",
+ stderr);
+ dict_print();
+
+ fputs("-----------------------------------\n"
+ "END OF INNODB TABLE MONITOR OUTPUT\n"
+ "==================================\n",
+ stderr);
+ }
+ }
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (srv_print_innodb_monitor
+ || srv_print_innodb_lock_monitor
+ || srv_print_innodb_tablespace_monitor
+ || srv_print_innodb_table_monitor) {
+ goto loop;
+ }
+
+ srv_monitor_active = FALSE;
+
+ goto loop;
+
+exit_func:
+ srv_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_thread(
+/*====================*/
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by
+ os_thread_create */
+{
+ srv_slot_t* slot;
+ ibool some_waits;
+ double wait_time;
+ ulint i;
+
+loop:
+ /* When someone is waiting for a lock, we wake up every second
+ and check if a timeout has passed for a lock wait */
+
+ os_thread_sleep(1000000);
+
+ srv_lock_timeout_active = TRUE;
+
+ mutex_enter(&kernel_mutex);
+
+ some_waits = FALSE;
+
+ /* Check all slots to see if a thread is waiting there, and
+ whether it has exceeded the time limit */
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = srv_mysql_table + i;
+
+ if (slot->in_use) {
+ trx_t* trx;
+ ulong lock_wait_timeout;
+
+ some_waits = TRUE;
+
+ wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+ trx = thr_get_trx(slot->thr);
+ lock_wait_timeout = thd_lock_wait_timeout(
+ trx->mysql_thd);
+
+ if (trx_is_interrupted(trx)
+ || (lock_wait_timeout < 100000000
+ && (wait_time > (double) lock_wait_timeout
+ || wait_time < 0))) {
+
+ /* Timeout exceeded or a wrap-around in system
+ time counter: cancel the lock request queued
+ by the transaction and release possible
+ other transactions waiting behind; it is
+ possible that the lock has already been
+ granted: in that case do nothing */
+
+ if (trx->wait_lock) {
+ lock_cancel_waiting_and_release(
+ trx->wait_lock);
+ }
+ }
+ }
+ }
+
+ os_event_reset(srv_lock_timeout_thread_event);
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ if (some_waits) {
+ goto loop;
+ }
+
+ srv_lock_timeout_active = FALSE;
+
+#if 0
+ /* The following synchronisation is disabled, since
+ the InnoDB monitor output is to be updated every 15 seconds. */
+ os_event_wait(srv_lock_timeout_thread_event);
+#endif
+ goto loop;
+
+exit_func:
+ srv_lock_timeout_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_error_monitor_thread(
+/*=====================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ /* number of successive fatal timeouts observed */
+ ulint fatal_cnt = 0;
+ ib_uint64_t old_lsn;
+ ib_uint64_t new_lsn;
+
+ old_lsn = srv_start_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Error monitor thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+loop:
+ srv_error_monitor_active = TRUE;
+
+ /* Try to track a strange bug reported by Harald Fuchs and others,
+ where the lsn seems to decrease at times */
+
+ new_lsn = log_get_lsn();
+
+ if (new_lsn < old_lsn) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: old log sequence number %llu"
+ " was greater\n"
+ "InnoDB: than the new log sequence number %llu!\n"
+ "InnoDB: Please submit a bug report"
+ " to http://bugs.mysql.com\n",
+ old_lsn, new_lsn);
+ }
+
+ old_lsn = new_lsn;
+
+ if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+ /* We refresh the InnoDB Monitor values so that averages are
+ printed from at most the last 60 seconds */
+
+ srv_refresh_innodb_monitor_stats();
+ }
+
+ /* Update the statistics collected for deciding LRU
+ eviction policy. */
+ buf_LRU_stat_update();
+
+ /* Update the statistics collected for flush rate policy. */
+ buf_flush_stat_update();
+
+ /* In case mutex_exit is not a memory barrier, it is
+ theoretically possible some threads are left waiting though
+ the semaphore is already released. Wake up those threads: */
+
+ sync_arr_wake_threads_if_sema_free();
+
+ if (sync_array_print_long_waits()) {
+ fatal_cnt++;
+ if (fatal_cnt > 10) {
+
+ fprintf(stderr,
+ "InnoDB: Error: semaphore wait has lasted"
+ " > %lu seconds\n"
+ "InnoDB: We intentionally crash the server,"
+ " because it appears to be hung.\n",
+ (ulong) srv_fatal_semaphore_wait_threshold);
+
+ ut_error;
+ }
+ } else {
+ fatal_cnt = 0;
+ }
+
+ /* Flush stderr so that a database user gets the output
+ to possible MySQL error file */
+
+ fflush(stderr);
+
+ os_thread_sleep(1000000);
+
+ if (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP) {
+
+ goto loop;
+ }
+
+ srv_error_monitor_active = FALSE;
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which restores the buffer pool from a dump file on startup and does
+periodic buffer pool dumps.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_LRU_dump_restore_thread(
+/*====================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ uint auto_lru_dump;
+ time_t last_dump_time;
+ time_t time_elapsed;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "LRU dump/restore thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+ if (srv_auto_lru_dump)
+ buf_LRU_file_restore();
+
+ last_dump_time = time(NULL);
+
+loop:
+ os_thread_sleep(5000000);
+
+ if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+ goto exit_func;
+ }
+
+ time_elapsed = time(NULL) - last_dump_time;
+ auto_lru_dump = srv_auto_lru_dump;
+ if (auto_lru_dump > 0 && (time_t) auto_lru_dump < time_elapsed) {
+ last_dump_time = time(NULL);
+ buf_LRU_file_dump();
+ }
+
+ goto loop;
+exit_func:
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*******************************************************************//**
+Tells the InnoDB server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+ srv_activity_count++;
+
+ if (srv_n_threads_active[SRV_MASTER] == 0) {
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_MASTER, 1);
+
+ mutex_exit(&kernel_mutex);
+ }
+}
+
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+ srv_activity_count++;
+
+ mutex_enter(&kernel_mutex);
+
+ srv_release_threads(SRV_MASTER, 1);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/**********************************************************************
+The master thread is tasked with ensuring that a flush of the log file
+happens once every second in the background. This is to ensure that not
+more than one second of transactions is lost in case of a crash when
+innodb_flush_log_at_trx_commit != 1 */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+ time_t current_time = time(NULL);
+
+ srv_main_thread_op_info = "flushing log";
+ if (difftime(current_time, srv_last_log_flush_time) >= 1) {
+ log_buffer_sync_in_background(TRUE);
+ srv_last_log_flush_time = current_time;
+ srv_log_writes_and_flush++;
+ }
+}
+
+/*********************************************************************//**
+The master thread controlling the server.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ os_event_t event;
+ ulint old_activity_count;
+ ulint n_pages_purged = 0;
+ ulint n_bytes_merged;
+ ulint n_pages_flushed;
+ ulint n_bytes_archived;
+ ulint n_tables_to_drop;
+ ulint n_ios;
+ ulint n_ios_old;
+ ulint n_ios_very_old;
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
+
+ ib_uint64_t lsn_old;
+
+ ib_uint64_t oldest_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+
+ mutex_enter(&kernel_mutex);
+
+ srv_table_reserve_slot(SRV_MASTER);
+ srv_n_threads_active[SRV_MASTER]++;
+
+ mutex_exit(&kernel_mutex);
+
+ mutex_enter(&(log_sys->mutex));
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+loop:
+ /*****************************************************************/
+ /* ---- When there is database activity by users, we cycle in this
+ loop */
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ n_ios_very_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+ + buf_pool->stat.n_pages_written;
+ mutex_enter(&kernel_mutex);
+
+ /* Store the user activity counter at the start of this loop */
+ old_activity_count = srv_activity_count;
+
+ mutex_exit(&kernel_mutex);
+
+ if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+
+ goto suspend_thread;
+ }
+
+ /* ---- We run the following loop approximately once per second
+ when there is database activity */
+
+ srv_last_log_flush_time = time(NULL);
+ skip_sleep = FALSE;
+
+ for (i = 0; i < 10; i++) {
+ n_ios_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+ + buf_pool->stat.n_pages_written;
+ srv_main_thread_op_info = "sleeping";
+ srv_main_1_second_loops++;
+
+ if (!skip_sleep) {
+
+ os_thread_sleep(1000000);
+ srv_main_sleeps++;
+
+ /*
+ mutex_enter(&(log_sys->mutex));
+ oldest_lsn = buf_pool_get_oldest_modification();
+ ib_uint64_t lsn = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+
+ if(oldest_lsn)
+ fprintf(stderr,
+ "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
+ (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
+ lsn - lsn_old);
+ */
+ }
+
+ skip_sleep = FALSE;
+
+ /* ALTER TABLE in MySQL requires on Unix that the table handler
+ can drop tables lazily after there are no longer any SELECT
+ queries on them. */
+
+ srv_main_thread_op_info = "doing background drop tables";
+
+ row_drop_tables_for_mysql_in_background();
+
+ srv_main_thread_op_info = "";
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+ goto background_loop;
+ }
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+
+ srv_main_thread_op_info = "making checkpoint";
+ log_free_check();
+
+ /* If i/os during one second sleep were less than 5% of
+ capacity, we assume that there is free disk i/o capacity
+ available, and it makes sense to do an insert buffer merge. */
+
+ n_pend_ios = buf_get_n_pending_ios()
+ + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+ + buf_pool->stat.n_pages_written;
+ if (n_pend_ios < SRV_PEND_IO_THRESHOLD
+ && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
+ srv_main_thread_op_info = "doing insert buffer merge";
+ ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+ }
+
+ if (UNIV_UNLIKELY(buf_get_modified_ratio_pct()
+ > srv_max_buf_pool_modified_pct)) {
+
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+ srv_main_thread_op_info =
+ "flushing buffer pool pages";
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+ PCT_IO(100),
+ IB_ULONGLONG_MAX);
+
+ /* If we had to do the flush, it may have taken
+ even more than 1 second, and also, there may be more
+ to flush. Do not sleep 1 second during the next
+ iteration of this loop. */
+
+ skip_sleep = TRUE;
+
+ mutex_enter(&(log_sys->mutex));
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+ } else if (srv_adaptive_flushing) {
+
+ /* Try to keep the rate of flushing of dirty
+ pages such that redo log generation does not
+ produce bursts of IO at checkpoint time. */
+ ulint n_flush = buf_flush_get_desired_flush_rate();
+
+ if (n_flush) {
+ srv_main_thread_op_info =
+ "flushing buffer pool pages";
+ n_flush = ut_min(PCT_IO(100), n_flush);
+ n_pages_flushed =
+ buf_flush_batch(
+ BUF_FLUSH_LIST,
+ n_flush,
+ IB_ULONGLONG_MAX);
+
+ if (n_flush == PCT_IO(100)) {
+ skip_sleep = TRUE;
+ }
+ }
+
+ mutex_enter(&(log_sys->mutex));
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+ } else if (srv_adaptive_checkpoint == 1) {
+ /* the adaptive_flushing option takes precedence over the adaptive_checkpoint option, for now */
+
+ /* Try to keep the modified age from exceeding the
+ max_checkpoint_age * 7/8 line */
+
+ mutex_enter(&(log_sys->mutex));
+ lsn_old = log_sys->lsn;
+ oldest_lsn = buf_pool_get_oldest_modification();
+ if (oldest_lsn == 0) {
+
+ mutex_exit(&(log_sys->mutex));
+
+ } else {
+ if ((log_sys->lsn - oldest_lsn)
+ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
+ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
+ /* We should not flush from here. */
+ mutex_exit(&(log_sys->mutex));
+ } else if ((log_sys->lsn - oldest_lsn)
+ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) {
+
+ /* 2nd defence line (max_checkpoint_age * 3/4) */
+
+ mutex_exit(&(log_sys->mutex));
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ IB_ULONGLONG_MAX);
+ skip_sleep = TRUE;
+ } else if ((log_sys->lsn - oldest_lsn)
+ > (log_sys->max_checkpoint_age)/2 ) {
+
+ /* 1st defence line (max_checkpoint_age * 1/2) */
+
+ mutex_exit(&(log_sys->mutex));
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+ IB_ULONGLONG_MAX);
+ skip_sleep = TRUE;
+ } else {
+ mutex_exit(&(log_sys->mutex));
+ }
+ }
+ } else if (srv_adaptive_checkpoint == 2) {
+
+ /* Try to keep the modified age from exceeding the
+ max_checkpoint_age * 7/8 line */
+
+ mutex_enter(&(log_sys->mutex));
+
+ oldest_lsn = buf_pool_get_oldest_modification();
+ if (oldest_lsn == 0) {
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+
+ } else {
+ if ((log_sys->lsn - oldest_lsn)
+ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
+ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
+ /* We should not flush from here. */
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+ } else if ((log_sys->lsn - oldest_lsn)
+ > (log_sys->max_checkpoint_age)/4 ) {
+
+ /* defence line (max_checkpoint_age * 1/4) */
+ ib_uint64_t lsn = log_sys->lsn;
+
+ ib_uint64_t level, bpl;
+ buf_page_t* bpage;
+
+ mutex_exit(&(log_sys->mutex));
+
+ mutex_enter(&flush_list_mutex);
+
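+ /* Walk the flush list and weight every dirty page by the
+ checkpoint age margin it still has; the resulting 'level' is
+ used below to estimate how many blocks per loop (bpl) must be
+ flushed to keep pace with the redo generated since the
+ previous iteration (lsn - lsn_old). */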
+ level = 0;
+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+ while (bpage != NULL) {
+ ib_uint64_t oldest_modification = bpage->oldest_modification;
+ if (oldest_modification != 0) {
+ level += log_sys->max_checkpoint_age
+ - (lsn - oldest_modification);
+ }
+ bpage = UT_LIST_GET_NEXT(flush_list, bpage);
+ }
+
+ if (level) {
+ bpl = ((ib_uint64_t) UT_LIST_GET_LEN(buf_pool->flush_list)
+ * UT_LIST_GET_LEN(buf_pool->flush_list)
+ * (lsn - lsn_old)) / level;
+ } else {
+ bpl = 0;
+ }
+
+ mutex_exit(&flush_list_mutex);
+
+ if (!srv_use_doublewrite_buf) {
+ /* flush is faster than when doublewrite */
+ bpl = (bpl * 7) / 8;
+ }
+
+ if (bpl) {
+retry_flush_batch:
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+ (ulint) bpl,
+ oldest_lsn + (lsn - lsn_old));
+ if (n_pages_flushed == ULINT_UNDEFINED) {
+ os_thread_sleep(5000);
+ goto retry_flush_batch;
+ }
+ }
+
+ lsn_old = lsn;
+ /*
+ fprintf(stderr,
+ "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
+ (lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
+ lsn - lsn_old, bpl);
+ */
+ } else {
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+ }
+ }
+
+ } else {
+ mutex_enter(&(log_sys->mutex));
+ lsn_old = log_sys->lsn;
+ mutex_exit(&(log_sys->mutex));
+ }
+
+ if (srv_activity_count == old_activity_count) {
+
+ /* There is no user activity at the moment, go to
+ the background loop */
+
+ goto background_loop;
+ }
+ }
+
+ /* ---- We perform the following code approximately once per
+ 10 seconds when there is database activity */
+
+#ifdef MEM_PERIODIC_CHECK
+ /* Check magic numbers of every allocated mem block once in 10
+ seconds */
+ mem_validate_all_blocks();
+#endif
+ /* If i/os during the 10 second period were less than 200% of
+ capacity, we assume that there is free disk i/o capacity
+ available, and it makes sense to flush srv_io_capacity pages.
+
+ Note that this is done regardless of the fraction of dirty
+ pages relative to the max requested by the user. The one second
+ loop above requests writes for that case. The writes done here
+ are not required, and may be disabled. */
+
+ n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+ + buf_pool->stat.n_pages_written;
+
+ srv_main_10_second_loops++;
+ if (n_pend_ios < SRV_PEND_IO_THRESHOLD
+ && (n_ios - n_ios_very_old < SRV_PAST_IO_ACTIVITY)) {
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ IB_ULONGLONG_MAX);
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+ }
+
+ /* We run a batch of insert buffer merge every 10 seconds,
+ even if the server is active */
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+ ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+
+ if (!srv_use_purge_thread) {
+ /* We run a full purge every 10 seconds, even if the server
+ is active */
+ do {
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+ goto background_loop;
+ }
+
+ srv_main_thread_op_info = "purging";
+ n_pages_purged = trx_purge();
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+
+ } while (n_pages_purged);
+ }
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+
+ /* Flush a few oldest pages to make a new checkpoint younger */
+
+ if (buf_get_modified_ratio_pct() > 70) {
+
+ /* If there are lots of modified pages in the buffer pool
+ (> 70 %), we assume we can afford reserving the disk(s) for
+ the time it requires to flush 100 pages */
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+ PCT_IO(100),
+ IB_ULONGLONG_MAX);
+ } else {
+ /* Otherwise, we only flush a small number of pages so that
+ we do not unnecessarily use much disk i/o capacity from
+ other work */
+
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+ PCT_IO(10),
+ IB_ULONGLONG_MAX);
+ }
+
+ srv_main_thread_op_info = "making checkpoint";
+
+ /* Make a new checkpoint about once in 10 seconds */
+
+ log_checkpoint(TRUE, FALSE);
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+
+ /* ---- When there is database activity, we jump from here back to
+ the start of loop */
+
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ /* If the database is quiet, we enter the background loop */
+
+ /*****************************************************************/
+background_loop:
+ /* ---- In this loop we run background operations when the server
+ is quiet from user activity. Also in the case of a shutdown, we
+ loop here, flushing the buffer pool to the data files. */
+
+ /* The server has been quiet for a while: start running background
+ operations */
+ srv_main_background_loops++;
+ srv_main_thread_op_info = "doing background drop tables";
+
+ n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+ if (n_tables_to_drop > 0) {
+ /* Do not monopolize the CPU even if there are tables waiting
+ in the background drop queue. (It is essentially a bug if
+ MySQL tries to drop a table while there are still open handles
+ to it and we had to put it to the background drop queue.) */
+
+ os_thread_sleep(100000);
+ }
+
+ if (!srv_use_purge_thread) {
+ srv_main_thread_op_info = "purging";
+
+ /* Run a full purge */
+ do {
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+ break;
+ }
+
+ srv_main_thread_op_info = "purging";
+ n_pages_purged = trx_purge();
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+
+ } while (n_pages_purged);
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ n_bytes_merged = 0;
+ } else {
+ /* This should do an amount of IO similar to the number of
+ dirty pages that will be flushed in the call to
+ buf_flush_batch below. Otherwise, the system favors
+ clean pages over cleanup throughput. */
+ n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
+ PCT_IBUF_IO(100));
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+flush_loop:
+ srv_main_thread_op_info = "flushing buffer pool pages";
+ srv_main_flush_loops++;
+ if (srv_fast_shutdown < 2) {
+ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+ PCT_IO(100),
+ IB_ULONGLONG_MAX);
+ } else {
+ /* In the fastest shutdown we do not flush the buffer pool
+ to data files: we set n_pages_flushed to 0 artificially. */
+
+ n_pages_flushed = 0;
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+
+ srv_main_thread_op_info = "waiting for buffer pool flush to end";
+ buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+
+ /* Flush logs if needed */
+ srv_sync_log_buffer_in_background();
+
+ srv_main_thread_op_info = "making checkpoint";
+
+ log_checkpoint(TRUE, FALSE);
+
+ if (buf_get_modified_ratio_pct() > srv_max_buf_pool_modified_pct) {
+
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+ goto flush_loop;
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+
+ mutex_enter(&kernel_mutex);
+ if (srv_activity_count != old_activity_count) {
+ mutex_exit(&kernel_mutex);
+ goto loop;
+ }
+ mutex_exit(&kernel_mutex);
+ /*
+ srv_main_thread_op_info = "archiving log (if log archive is on)";
+
+ log_archive_do(FALSE, &n_bytes_archived);
+ */
+ n_bytes_archived = 0;
+
+ /* Keep looping in the background loop if still work to do */
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ if (n_tables_to_drop + n_pages_flushed
+ + n_bytes_archived != 0) {
+
+ /* If we are doing a fast shutdown (= the default)
+ we do not do purge or insert buffer merge. But we
+ flush the buffer pool completely to disk.
+ In a 'very fast' shutdown we do not flush the buffer
+ pool to data files: we have set n_pages_flushed to
+ 0 artificially. */
+
+ goto background_loop;
+ }
+ } else if (n_tables_to_drop
+ + n_pages_purged + n_bytes_merged + n_pages_flushed
+ + n_bytes_archived != 0) {
+ /* In a 'slow' shutdown we run purge and the insert buffer
+ merge to completion */
+
+ goto background_loop;
+ }
+
+ /* There is no work for background operations either: suspend
+ master thread to wait for more server activity */
+
+suspend_thread:
+ srv_main_thread_op_info = "suspending";
+
+ mutex_enter(&kernel_mutex);
+
+ if (row_get_background_drop_list_len_low() > 0) {
+ mutex_exit(&kernel_mutex);
+
+ goto loop;
+ }
+
+ event = srv_suspend_thread();
+
+ mutex_exit(&kernel_mutex);
+
+ /* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
+ waits for database activity to die down when converting < 4.1.x
+ databases, and relies on this string being exactly as it is. InnoDB
+ manual also mentions this string in several places. */
+ srv_main_thread_op_info = "waiting for server activity";
+
+ os_event_wait(event);
+
+ if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+ /* This is only extra safety, the thread should exit
+ already when the event wait ends */
+
+ os_thread_exit(NULL);
+ }
+
+ /* When there is user activity, InnoDB will set the event and the
+ main thread goes back to loop. */
+
+ goto loop;
+
+ OS_THREAD_DUMMY_RETURN; /* Not reached, avoid compiler warning */
+}
+
+/*************************************************************************
+A thread devoted to purge, which takes over purge work from the
+master thread */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+ void* arg __attribute__((unused)))
+ /* in: a dummy parameter required by os_thread_create */
+{
+ ulint n_pages_purged;
+ ulint n_pages_purged_sum = 1; /* dummy */
+ ulint history_len;
+ ulint sleep_ms = 10000; /* initial: 10 sec. */
+ ibool can_be_last = FALSE;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Purge thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+ mutex_enter(&kernel_mutex);
+ srv_table_reserve_slot(SRV_PURGE);
+ srv_n_threads_active[SRV_PURGE]++;
+ mutex_exit(&kernel_mutex);
+
+loop:
+ if (srv_shutdown_state > 0) {
+ if (srv_fast_shutdown) {
+ /* someone else must wait for the workers to finish */
+ goto exit_func;
+ }
+
+ mutex_enter(&kernel_mutex);
+ if (srv_n_threads_active[SRV_PURGE_WORKER]) {
+ can_be_last = FALSE;
+ } else {
+ can_be_last = TRUE;
+ }
+ mutex_exit(&kernel_mutex);
+
+ sleep_ms = 10;
+ os_event_reset(srv_purge_thread_event);
+ }
+
+ os_event_wait_time(srv_purge_thread_event, sleep_ms * 1000);
+
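+ /* Adapt the polling interval to the purge backlog: the sleep
+ is shortened when the history list grows beyond 1000 entries
+ and stretched again further below when a pass purges nothing,
+ within the bounds of 10 ms and 10 seconds. */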
+ history_len = trx_sys->rseg_history_len;
+ if (history_len > 1000)
+ sleep_ms /= 10;
+ if (sleep_ms < 10)
+ sleep_ms = 10;
+
+ n_pages_purged_sum = 0;
+
+ do {
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ goto exit_func;
+ }
+ n_pages_purged = trx_purge();
+ n_pages_purged_sum += n_pages_purged;
+ } while (n_pages_purged);
+
+ if (srv_shutdown_state > 0 && can_be_last) {
+ /* the last trx_purge() is executed without workers */
+ goto exit_func;
+ }
+
+ if (n_pages_purged_sum) {
+ srv_active_wake_master_thread();
+ }
+
+ if (n_pages_purged_sum == 0)
+ sleep_ms *= 10;
+ if (sleep_ms > 10000)
+ sleep_ms = 10000;
+
+ goto loop;
+
+exit_func:
+ trx_purge_worker_wake(); /* may be unnecessary; done for safety only */
+
+ /* wake master thread to flush the pages */
+ srv_wake_master_thread();
+
+ mutex_enter(&kernel_mutex);
+ srv_n_threads_active[SRV_PURGE]--;
+ mutex_exit(&kernel_mutex);
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+
+/*************************************************************************
+A purge worker thread, which carries out purge work on behalf of the
+purge thread */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_worker_thread(
+/*====================*/
+ void* arg)
+{
+ ulint worker_id; /* index for array */
+
+ worker_id = *((ulint*)arg);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Purge worker thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ mutex_enter(&kernel_mutex);
+ srv_table_reserve_slot(SRV_PURGE_WORKER);
+ srv_n_threads_active[SRV_PURGE_WORKER]++;
+ mutex_exit(&kernel_mutex);
+
+loop:
+ /* purge worker threads only work when srv_shutdown_state == 0;
+ this is checked here for safety and exactness. */
+ if (srv_shutdown_state > 0) {
+ goto exit_func;
+ }
+
+ trx_purge_worker_wait();
+
+ if (srv_shutdown_state > 0) {
+ goto exit_func;
+ }
+
+ trx_purge_worker(worker_id);
+
+ goto loop;
+
+exit_func:
+ mutex_enter(&kernel_mutex);
+ srv_n_threads_active[SRV_PURGE_WORKER]--;
+ mutex_exit(&kernel_mutex);
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
new file mode 100644
index 00000000000..b36faf2d2d7
--- /dev/null
+++ b/storage/xtradb/srv/srv0start.c
@@ -0,0 +1,2268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file srv/srv0start.c
+Starts the InnoDB database server
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "mtr0mtr.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "rem0rec.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0proc.h"
+# include "sync0sync.h"
+# include "buf0flu.h"
+# include "buf0rea.h"
+# include "dict0boot.h"
+# include "dict0load.h"
+# include "que0que.h"
+# include "usr0sess.h"
+# include "lock0lock.h"
+# include "trx0roll.h"
+# include "trx0purge.h"
+# include "lock0lock.h"
+# include "pars0pars.h"
+# include "btr0sea.h"
+# include "rem0cmp.h"
+# include "dict0crea.h"
+# include "row0ins.h"
+# include "row0sel.h"
+# include "row0upd.h"
+# include "row0row.h"
+# include "row0mysql.h"
+# include "btr0pcur.h"
+# include "thr0loc.h"
+# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+# include "zlib.h" /* for ZLIB_VERSION */
+
+/** Log sequence number immediately after startup */
+UNIV_INTERN ib_uint64_t srv_start_lsn;
+/** Log sequence number at shutdown */
+UNIV_INTERN ib_uint64_t srv_shutdown_lsn;
+
+#ifdef HAVE_DARWIN_THREADS
+# include <sys/utsname.h>
+/** TRUE if the F_FULLFSYNC option is available */
+UNIV_INTERN ibool srv_have_fullfsync = FALSE;
+#endif
+
+/** TRUE if a raw partition is in use */
+UNIV_INTERN ibool srv_start_raw_disk_in_use = FALSE;
+
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+UNIV_INTERN ibool srv_startup_is_before_trx_rollback_phase = FALSE;
+/** TRUE if the server is being started */
+UNIV_INTERN ibool srv_is_being_started = FALSE;
+/** TRUE if the server was successfully started */
+UNIV_INTERN ibool srv_was_started = FALSE;
+/** TRUE if innobase_start_or_create_for_mysql() has been called */
+static ibool srv_start_has_been_called = FALSE;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+UNIV_INTERN enum srv_shutdown_state srv_shutdown_state = SRV_SHUTDOWN_NONE;
+
+/** Files comprising the system tablespace */
+static os_file_t files[1000];
+
+/** Mutex protecting the ios count */
+static mutex_t ios_mutex;
+/** Count of I/O operations in io_handler_thread() */
+static ulint ios;
+
+/** io_handler_thread parameters for thread identification */
+static ulint n[SRV_MAX_N_IO_THREADS + 7 + 64];
+/** io_handler_thread identifiers */
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 7 + 64];
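+/* The extra slots beyond the I/O handler threads hold the master, lock
+timeout, error monitor, monitor, LRU dump/restore and purge threads (the
+"+ 7"), plus what appears to be room for up to 64 purge worker threads. */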
+
+/** We use this mutex to test the return value of pthread_mutex_trylock
+ on successful locking. HP-UX does NOT return 0, though Linux et al do. */
+static os_fast_mutex_t srv_os_test_mutex;
+
+/** Name of srv_monitor_file */
+static char* srv_monitor_file_name;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Maximum number of pending I/O requests per I/O handler thread */
+#define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
+#define SRV_MAX_N_PENDING_SYNC_IOS 100
+
+
+/*********************************************************************//**
+Convert a numeric string that optionally ends in G or M, to a number
+containing megabytes.
+@return next character in string */
+static
+char*
+srv_parse_megabytes(
+/*================*/
+ char* str, /*!< in: string containing a quantity in bytes */
+ ulint* megs) /*!< out: the number in megabytes */
+{
+ char* endp;
+ ulint size;
+
+ size = strtoul(str, &endp, 10);
+
+ str = endp;
+
+ switch (*str) {
+ case 'G': case 'g':
+ size *= 1024;
+ /* fall through */
+ case 'M': case 'm':
+ str++;
+ break;
+ default:
+ size /= 1024 * 1024;
+ break;
+ }
+
+ *megs = size;
+ return(str);
+}
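+/* Examples for srv_parse_megabytes(): "2G" yields 2048 (megabytes),
+"512M" yields 512, and a plain byte count such as "10485760" yields 10. */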
+
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+ char* str) /*!< in/out: the data file path string */
+{
+ char* input_str;
+ char* path;
+ ulint size;
+ ulint i = 0;
+
+ srv_auto_extend_last_data_file = FALSE;
+ srv_last_file_size_max = 0;
+ srv_data_file_names = NULL;
+ srv_data_file_sizes = NULL;
+ srv_data_file_is_raw_partition = NULL;
+
+ input_str = str;
+
+ /* First calculate the number of data files and check syntax:
+ path:size[M | G];path:size[M | G]... . Note that a Windows path may
+ contain a drive name and a ':'. */
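+	/* For instance, the default "ibdata1:10M:autoextend" defines one
+	data file of 10 MB that grows automatically; a two-file setting
+	would look like "ibdata1:2000M;ibdata2:2000M:autoextend". */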
+
+ while (*str != '\0') {
+ path = str;
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == '\0') {
+ return(FALSE);
+ }
+
+ str++;
+
+ str = srv_parse_megabytes(str, &size);
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = srv_parse_megabytes(str, &size);
+ }
+
+ if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ if (strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+ str += 3;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+ str += 3;
+ }
+
+ if (size == 0) {
+ return(FALSE);
+ }
+
+ i++;
+
+ if (*str == ';') {
+ str++;
+ } else if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ if (i == 0) {
+ /* If innodb_data_file_path was defined it must contain
+ at least one data file definition */
+
+ return(FALSE);
+ }
+
+ srv_data_file_names = malloc(i * sizeof *srv_data_file_names);
+ srv_data_file_sizes = malloc(i * sizeof *srv_data_file_sizes);
+ srv_data_file_is_raw_partition = malloc(
+ i * sizeof *srv_data_file_is_raw_partition);
+
+ srv_n_data_files = i;
+
+ /* Then store the actual values to our arrays */
+
+ str = input_str;
+ i = 0;
+
+ while (*str != '\0') {
+ path = str;
+
+ /* Note that we must step over the ':' in a Windows path;
+ a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+ a Windows raw partition may have a specification like
+ \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
+
+ while ((*str != ':' && *str != '\0')
+ || (*str == ':'
+ && (*(str + 1) == '\\' || *(str + 1) == '/'
+ || *(str + 1) == ':'))) {
+ str++;
+ }
+
+ if (*str == ':') {
+ /* Make path a null-terminated string */
+ *str = '\0';
+ str++;
+ }
+
+ str = srv_parse_megabytes(str, &size);
+
+ srv_data_file_names[i] = path;
+ srv_data_file_sizes[i] = size;
+
+ if (0 == strncmp(str, ":autoextend",
+ (sizeof ":autoextend") - 1)) {
+
+ srv_auto_extend_last_data_file = TRUE;
+
+ str += (sizeof ":autoextend") - 1;
+
+ if (0 == strncmp(str, ":max:",
+ (sizeof ":max:") - 1)) {
+
+ str += (sizeof ":max:") - 1;
+
+ str = srv_parse_megabytes(
+ str, &srv_last_file_size_max);
+ }
+
+ if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ (srv_data_file_is_raw_partition)[i] = 0;
+
+ if (strlen(str) >= 6
+ && *str == 'n'
+ && *(str + 1) == 'e'
+ && *(str + 2) == 'w') {
+ str += 3;
+ (srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW;
+ }
+
+ if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+ str += 3;
+
+ if ((srv_data_file_is_raw_partition)[i] == 0) {
+ (srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW;
+ }
+ }
+
+ i++;
+
+ if (*str == ';') {
+ str++;
+ }
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Reads log group home directories from a character string given in
+the .cnf file.
+@return TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_log_group_home_dirs(
+/*==========================*/
+ char* str) /*!< in/out: character string */
+{
+ char* input_str;
+ char* path;
+ ulint i = 0;
+
+ srv_log_group_home_dirs = NULL;
+
+ input_str = str;
+
+ /* First calculate the number of directories and check syntax:
+ path;path;... */
+
+ while (*str != '\0') {
+ path = str;
+
+ while (*str != ';' && *str != '\0') {
+ str++;
+ }
+
+ i++;
+
+ if (*str == ';') {
+ str++;
+ } else if (*str != '\0') {
+
+ return(FALSE);
+ }
+ }
+
+ if (i != 1) {
+ /* If innodb_log_group_home_dir was defined it must
+ contain exactly one path definition under current MySQL */
+
+ return(FALSE);
+ }
+
+ srv_log_group_home_dirs = malloc(i * sizeof *srv_log_group_home_dirs);
+
+ /* Then store the actual values to our array */
+
+ str = input_str;
+ i = 0;
+
+ while (*str != '\0') {
+ path = str;
+
+ while (*str != ';' && *str != '\0') {
+ str++;
+ }
+
+ if (*str == ';') {
+ *str = '\0';
+ str++;
+ }
+
+ srv_log_group_home_dirs[i] = path;
+
+ i++;
+ }
+
+ return(TRUE);
+}
+
+/*********************************************************************//**
+Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void)
+/*==========================*/
+{
+ free(srv_data_file_names);
+ srv_data_file_names = NULL;
+ free(srv_data_file_sizes);
+ srv_data_file_sizes = NULL;
+ free(srv_data_file_is_raw_partition);
+ srv_data_file_is_raw_partition = NULL;
+ free(srv_log_group_home_dirs);
+ srv_log_group_home_dirs = NULL;
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+I/O handler thread function.
+@return OS_THREAD_DUMMY_RETURN */
+static
+os_thread_ret_t
+io_handler_thread(
+/*==============*/
+ void* arg) /*!< in: pointer to the number of the segment in
+ the aio array */
+{
+ ulint segment;
+ ulint i;
+
+ segment = *((ulint*)arg);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
+ os_thread_pf(os_thread_get_curr_id()));
+#endif
+ for (i = 0;; i++) {
+ fil_aio_wait(segment);
+
+ mutex_enter(&ios_mutex);
+ ios++;
+ mutex_exit(&ios_mutex);
+ }
+
+ thr_local_free(os_thread_get_curr_id());
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit.
+ The thread actually never comes here because it is exited in an
+ os_event_wait(). */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR '\\'
+#else
+#define SRV_PATH_SEPARATOR '/'
+#endif
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+ char* str __attribute__((unused))) /*!< in/out: null-terminated
+ character string */
+{
+#ifdef __WIN__
+ for (; *str; str++) {
+
+ if (*str == '/') {
+ *str = '\\';
+ }
+ }
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Calculates the low 32 bits when a file size, which is given as a number
+of database pages, is converted to a number of bytes.
+@return low 32 bits of the file size when expressed in bytes */
+static
+ulint
+srv_calc_low32(
+/*===========*/
+ ulint file_size) /*!< in: file size in database pages */
+{
+ return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT));
+}
+
+/*********************************************************************//**
+Calculates the high 32 bits when a file size, which is given as a number
+of database pages, is converted to a number of bytes.
+@return high 32 bits of the file size when expressed in bytes */
+static
+ulint
+srv_calc_high32(
+/*============*/
+ ulint file_size) /*!< in: file size in database pages */
+{
+ return(file_size >> (32 - UNIV_PAGE_SIZE_SHIFT));
+}
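+/* With the default 16 KB page size (UNIV_PAGE_SIZE_SHIFT == 14), a file of
+655360 pages is 10 GB = 0x280000000 bytes, so srv_calc_low32() returns
+0x80000000 and srv_calc_high32() returns 0x2. */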
+
+/*********************************************************************//**
+Creates or opens the log files and closes them.
+@return DB_SUCCESS or error code */
+static
+ulint
+open_or_create_log_file(
+/*====================*/
+ ibool create_new_db, /*!< in: TRUE if we should create a
+ new database */
+ ibool* log_file_created, /*!< out: TRUE if new log file
+ created */
+ ibool log_file_has_been_opened,/*!< in: TRUE if a log file has been
+ opened before: then it is an error
+ to try to create another log file */
+ ulint k, /*!< in: log group number */
+ ulint i) /*!< in: log file number in group */
+{
+ ibool ret;
+ ulint size;
+ ulint size_high;
+ char name[10000];
+ ulint dirnamelen;
+
+ UT_NOT_USED(create_new_db);
+
+ *log_file_created = FALSE;
+
+ srv_normalize_path_for_win(srv_log_group_home_dirs[k]);
+
+ dirnamelen = strlen(srv_log_group_home_dirs[k]);
+ ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
+ memcpy(name, srv_log_group_home_dirs[k], dirnamelen);
+
+ /* Add a path separator if needed. */
+ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ name[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ sprintf(name + dirnamelen, "%s%lu", "ib_logfile", (ulong) i);
+
+ files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL,
+ OS_LOG_FILE, &ret);
+ if (ret == FALSE) {
+ if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have errno set
+ to 0 here, which causes our function to return 100;
+ work around that AIX problem */
+ && os_file_get_last_error(FALSE) != 100
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: Error in creating"
+ " or opening %s\n", name);
+
+ return(DB_ERROR);
+ }
+
+ files[i] = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO,
+ OS_LOG_FILE, &ret);
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in opening %s\n", name);
+
+ return(DB_ERROR);
+ }
+
+ ret = os_file_get_size(files[i], &size, &size_high);
+ ut_a(ret);
+
+ if (size != srv_calc_low32(srv_log_file_size)
+ || size_high != srv_calc_high32(srv_log_file_size)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: log file %s is"
+ " of different size %lu %lu bytes\n"
+ "InnoDB: than specified in the .cnf"
+ " file %lu %lu bytes!\n",
+ name, (ulong) size_high, (ulong) size,
+ (ulong) srv_calc_high32(srv_log_file_size),
+ (ulong) srv_calc_low32(srv_log_file_size));
+
+ return(DB_ERROR);
+ }
+ } else {
+ *log_file_created = TRUE;
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Log file %s did not exist:"
+ " new to be created\n",
+ name);
+ if (log_file_has_been_opened) {
+
+ return(DB_ERROR);
+ }
+
+ fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n",
+ name, (ulong) srv_log_file_size
+ >> (20 - UNIV_PAGE_SIZE_SHIFT));
+
+ fprintf(stderr,
+ "InnoDB: Database physically writes the file"
+ " full: wait...\n");
+
+ ret = os_file_set_size(name, files[i],
+ srv_calc_low32(srv_log_file_size),
+ srv_calc_high32(srv_log_file_size));
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in creating %s:"
+ " probably out of disk space\n",
+ name);
+
+ return(DB_ERROR);
+ }
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ if (i == 0) {
+ /* Create in memory the file space object
+ which is for this log group */
+
+ fil_space_create(name,
+ 2 * k + SRV_LOG_SPACE_FIRST_ID, 0, FIL_LOG);
+ }
+
+ ut_a(fil_validate());
+
+ fil_node_create(name, srv_log_file_size,
+ 2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE);
+#ifdef UNIV_LOG_ARCHIVE
+ /* If this is the first log group, create the file space object
+ for archived logs.
+	Under MySQL, archiving is never done. */
+
+ if (k == 0 && i == 0) {
+ arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID;
+
+ fil_space_create("arch_log_space", arch_space_id, 0, FIL_LOG);
+ } else {
+ arch_space_id = ULINT_UNDEFINED;
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+ if (i == 0) {
+ log_group_init(k, srv_n_log_files,
+ srv_log_file_size * UNIV_PAGE_SIZE,
+ 2 * k + SRV_LOG_SPACE_FIRST_ID,
+ SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch
+ space id */
+ }
+
+ return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Creates or opens database data files and closes them.
+@return DB_SUCCESS or error code */
+static
+ulint
+open_or_create_data_files(
+/*======================*/
+ ibool* create_new_db, /*!< out: TRUE if new database should be
+ created */
+ ibool* create_new_doublewrite_file,
+#ifdef UNIV_LOG_ARCHIVE
+ ulint* min_arch_log_no,/*!< out: min of archived log
+ numbers in data files */
+ ulint* max_arch_log_no,/*!< out: max of archived log
+ numbers in data files */
+#endif /* UNIV_LOG_ARCHIVE */
+ ib_uint64_t* min_flushed_lsn,/*!< out: min of flushed lsn
+ values in data files */
+ ib_uint64_t* max_flushed_lsn,/*!< out: max of flushed lsn
+ values in data files */
+ ulint* sum_of_new_sizes)/*!< out: sum of sizes of the
+ new files added */
+{
+ ibool ret;
+ ulint i;
+ ibool one_opened = FALSE;
+ ibool one_created = FALSE;
+ ulint size;
+ ulint size_high;
+ ulint rounded_size_pages;
+ char name[10000];
+
+ if (srv_n_data_files >= 1000) {
+ fprintf(stderr, "InnoDB: can only have < 1000 data files\n"
+ "InnoDB: you have defined %lu\n",
+ (ulong) srv_n_data_files);
+ return(DB_ERROR);
+ }
+
+ *sum_of_new_sizes = 0;
+
+ *create_new_db = FALSE;
+ *create_new_doublewrite_file = FALSE;
+
+ srv_normalize_path_for_win(srv_data_home);
+
+ for (i = 0; i < srv_n_data_files; i++) {
+ ulint dirnamelen;
+
+ srv_normalize_path_for_win(srv_data_file_names[i]);
+ dirnamelen = strlen(srv_data_home);
+
+ ut_a(dirnamelen + strlen(srv_data_file_names[i])
+ < (sizeof name) - 1);
+ memcpy(name, srv_data_home, dirnamelen);
+ /* Add a path separator if needed. */
+ if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+ name[dirnamelen++] = SRV_PATH_SEPARATOR;
+ }
+
+ strcpy(name + dirnamelen, srv_data_file_names[i]);
+
+ if (srv_data_file_is_raw_partition[i] == 0) {
+
+ /* First we try to create the file: if it already
+ exists, ret will get value FALSE */
+
+ files[i] = os_file_create(name, OS_FILE_CREATE,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+
+ if (ret == FALSE && os_file_get_last_error(FALSE)
+ != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have
+ errno set to 0 here, which causes our function
+ to return 100; work around that AIX problem */
+ && os_file_get_last_error(FALSE) != 100
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: Error in creating"
+ " or opening %s\n",
+ name);
+
+ return(DB_ERROR);
+ }
+ } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+ /* The partition is opened, not created; then it is
+ written over */
+
+ srv_start_raw_disk_in_use = TRUE;
+ srv_created_new_raw = TRUE;
+
+ files[i] = os_file_create(name, OS_FILE_OPEN_RAW,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in opening %s\n", name);
+
+ return(DB_ERROR);
+ }
+ } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ srv_start_raw_disk_in_use = TRUE;
+
+ ret = FALSE;
+ } else {
+ ut_a(0);
+ }
+
+ if (ret == FALSE) {
+ /* We open the data file */
+
+ if (one_created) {
+ fprintf(stderr,
+ "InnoDB: Error: data files can only"
+ " be added at the end\n");
+ fprintf(stderr,
+ "InnoDB: of a tablespace, but"
+ " data file %s existed beforehand.\n",
+ name);
+ return(DB_ERROR);
+ }
+
+ if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN_RAW,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ } else if (i == 0) {
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN_RETRY,
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ } else {
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+ }
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in opening %s\n", name);
+ os_file_get_last_error(TRUE);
+
+ return(DB_ERROR);
+ }
+
+ if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+
+ goto skip_size_check;
+ }
+
+ ret = os_file_get_size(files[i], &size, &size_high);
+ ut_a(ret);
+ /* Round size downward to megabytes */
+
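+			/* size / (1024 * 1024) gives whole megabytes from
+			the low 32 bits and each unit of size_high adds
+			4 GB, i.e. 4096 MB; shifting by
+			(20 - UNIV_PAGE_SIZE_SHIFT) converts MB to pages. */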
+ rounded_size_pages
+ = (size / (1024 * 1024) + 4096 * size_high)
+ << (20 - UNIV_PAGE_SIZE_SHIFT);
+
+ if (i == srv_n_data_files - 1
+ && srv_auto_extend_last_data_file) {
+
+ if (srv_data_file_sizes[i] > rounded_size_pages
+ || (srv_last_file_size_max > 0
+ && srv_last_file_size_max
+ < rounded_size_pages)) {
+
+ fprintf(stderr,
+ "InnoDB: Error: auto-extending"
+ " data file %s is"
+ " of a different size\n"
+ "InnoDB: %lu pages (rounded"
+ " down to MB) than specified"
+ " in the .cnf file:\n"
+ "InnoDB: initial %lu pages,"
+ " max %lu (relevant if"
+ " non-zero) pages!\n",
+ name,
+ (ulong) rounded_size_pages,
+ (ulong) srv_data_file_sizes[i],
+ (ulong)
+ srv_last_file_size_max);
+
+ return(DB_ERROR);
+ }
+
+ srv_data_file_sizes[i] = rounded_size_pages;
+ }
+
+ if (rounded_size_pages != srv_data_file_sizes[i]) {
+
+ fprintf(stderr,
+ "InnoDB: Error: data file %s"
+ " is of a different size\n"
+ "InnoDB: %lu pages"
+ " (rounded down to MB)\n"
+ "InnoDB: than specified"
+ " in the .cnf file %lu pages!\n",
+ name,
+ (ulong) rounded_size_pages,
+ (ulong) srv_data_file_sizes[i]);
+
+ return(DB_ERROR);
+ }
+skip_size_check:
+ fil_read_flushed_lsn_and_arch_log_no(
+ files[i], one_opened,
+#ifdef UNIV_LOG_ARCHIVE
+ min_arch_log_no, max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ min_flushed_lsn, max_flushed_lsn);
+ one_opened = TRUE;
+ } else {
+ /* We created the data file and now write it full of
+ zeros */
+
+ one_created = TRUE;
+
+ if (i > 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Data file %s did not"
+ " exist: new to be created\n",
+ name);
+ } else {
+ fprintf(stderr,
+ "InnoDB: The first specified"
+ " data file %s did not exist:\n"
+ "InnoDB: a new database"
+ " to be created!\n", name);
+ *create_new_db = TRUE;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Setting file %s size to %lu MB\n",
+ name,
+ (ulong) (srv_data_file_sizes[i]
+ >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+ fprintf(stderr,
+ "InnoDB: Database physically writes the"
+ " file full: wait...\n");
+
+ ret = os_file_set_size(
+ name, files[i],
+ srv_calc_low32(srv_data_file_sizes[i]),
+ srv_calc_high32(srv_data_file_sizes[i]));
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in creating %s:"
+ " probably out of disk space\n", name);
+
+ return(DB_ERROR);
+ }
+
+ *sum_of_new_sizes = *sum_of_new_sizes
+ + srv_data_file_sizes[i];
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ if (i == 0) {
+ fil_space_create(name, 0, 0, FIL_TABLESPACE);
+ }
+
+ ut_a(fil_validate());
+
+ fil_node_create(name, srv_data_file_sizes[i], 0,
+ srv_data_file_is_raw_partition[i] != 0);
+ }
+
+ /* special file for doublewrite buffer */
+ if (srv_doublewrite_file)
+ {
+ srv_normalize_path_for_win(srv_doublewrite_file);
+
+ fprintf(stderr,
+ "InnoDB: Notice: innodb_doublewrite_file is specified.\n"
+			"InnoDB: This is for experts only. Do not use it unless you understand exactly what it does.\n"
+			"InnoDB: ### Do not specify a file older than the last checkpoint ###\n"
+			"InnoDB: otherwise the stale doublewrite buffer will corrupt your data during recovery!\n");
+
+ strcpy(name, srv_doublewrite_file);
+
+ /* First we try to create the file: if it already
+ exists, ret will get value FALSE */
+
+ files[i] = os_file_create(name, OS_FILE_CREATE,
+ OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+
+ if (ret == FALSE && os_file_get_last_error(FALSE)
+ != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+ /* AIX 5.1 after security patch ML7 may have
+ errno set to 0 here, which causes our function
+ to return 100; work around that AIX problem */
+ && os_file_get_last_error(FALSE) != 100
+#endif
+ ) {
+ fprintf(stderr,
+ "InnoDB: Error in creating"
+ " or opening %s\n",
+ name);
+
+ return(DB_ERROR);
+ }
+
+ if (ret == FALSE) {
+ /* We open the data file */
+
+ files[i] = os_file_create(
+ name, OS_FILE_OPEN, OS_FILE_NORMAL,
+ OS_DATA_FILE, &ret);
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in opening %s\n", name);
+ os_file_get_last_error(TRUE);
+
+ return(DB_ERROR);
+ }
+
+ ret = os_file_get_size(files[i], &size, &size_high);
+ ut_a(ret);
+ /* Round size downward to megabytes */
+
+ rounded_size_pages
+ = (size / (1024 * 1024) + 4096 * size_high)
+ << (20 - UNIV_PAGE_SIZE_SHIFT);
+
+ if (rounded_size_pages != TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: doublewrite buffer file %s"
+ " is of a different size\n"
+ "InnoDB: %lu pages"
+ " (rounded down to MB)\n"
+ "InnoDB: than intended size"
+ " %lu pages...\n",
+ name,
+ (ulong) rounded_size_pages,
+ (ulong) TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9);
+ }
+
+ fil_read_flushed_lsn_and_arch_log_no(
+ files[i], one_opened,
+#ifdef UNIV_LOG_ARCHIVE
+ min_arch_log_no, max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ min_flushed_lsn, max_flushed_lsn);
+ one_opened = TRUE;
+ } else {
+ /* We created the data file and now write it full of
+ zeros */
+
+ *create_new_doublewrite_file = TRUE;
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Doublewrite buffer file %s did not"
+ " exist: new to be created\n",
+ name);
+
+ if (*create_new_db == FALSE) {
+ fprintf(stderr,
+					"InnoDB: Warning: ibdata files from a previous version may cause a crash.\n"
+					" If you use that, please use ibdata files created by this version.\n");
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Setting file %s size to %lu MB\n",
+ name,
+ (ulong) ((TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9)
+ >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+ fprintf(stderr,
+ "InnoDB: Database physically writes the"
+ " file full: wait...\n");
+
+ ret = os_file_set_size(
+ name, files[i],
+ srv_calc_low32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9),
+ srv_calc_high32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9));
+
+ if (!ret) {
+ fprintf(stderr,
+ "InnoDB: Error in creating %s:"
+ " probably out of disk space\n", name);
+
+ return(DB_ERROR);
+ }
+ }
+
+ ret = os_file_close(files[i]);
+ ut_a(ret);
+
+ fil_space_create(name, TRX_DOUBLEWRITE_SPACE, 0, FIL_TABLESPACE);
+
+ ut_a(fil_validate());
+
+ fil_node_create(name, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, TRX_DOUBLEWRITE_SPACE, FALSE);
+
+ i++;
+ }
+
+ ios = 0;
+
+ mutex_create(&ios_mutex, SYNC_NO_ORDER_CHECK);
+
+ return(DB_SUCCESS);
+}
+
+/********************************************************************
+Starts InnoDB and creates a new database if database files
+are not found and the user wants one created.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_start_or_create_for_mysql(void)
+/*====================================*/
+{
+ buf_pool_t* ret;
+ ibool create_new_db;
+ ibool create_new_doublewrite_file;
+ ibool log_file_created;
+ ibool log_created = FALSE;
+ ibool log_opened = FALSE;
+ ib_uint64_t min_flushed_lsn;
+ ib_uint64_t max_flushed_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+ ulint min_arch_log_no;
+ ulint max_arch_log_no;
+#endif /* UNIV_LOG_ARCHIVE */
+ ulint sum_of_new_sizes;
+ ulint sum_of_data_file_sizes;
+ ulint tablespace_size_in_header;
+ ulint err;
+ ulint i;
+ ulint io_limit;
+ my_bool srv_file_per_table_original_value
+ = srv_file_per_table;
+ mtr_t mtr;
+#ifdef HAVE_DARWIN_THREADS
+# ifdef F_FULLFSYNC
+ /* This executable has been compiled on Mac OS X 10.3 or later.
+ Assume that F_FULLFSYNC is available at run-time. */
+ srv_have_fullfsync = TRUE;
+# else /* F_FULLFSYNC */
+ /* This executable has been compiled on Mac OS X 10.2
+ or earlier. Determine if the executable is running
+ on Mac OS X 10.3 or later. */
+ struct utsname utsname;
+ if (uname(&utsname)) {
+ fputs("InnoDB: cannot determine Mac OS X version!\n", stderr);
+ } else {
+ srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0;
+ }
+ if (!srv_have_fullfsync) {
+ fputs("InnoDB: On Mac OS X, fsync() may be"
+ " broken on internal drives,\n"
+ "InnoDB: making transactions unsafe!\n", stderr);
+ }
+# endif /* F_FULLFSYNC */
+#endif /* HAVE_DARWIN_THREADS */
+
+ if (sizeof(ulint) != sizeof(void*)) {
+ fprintf(stderr,
+ "InnoDB: Error: size of InnoDB's ulint is %lu,"
+ " but size of void* is %lu.\n"
+ "InnoDB: The sizes should be the same"
+ " so that on a 64-bit platform you can\n"
+ "InnoDB: allocate more than 4 GB of memory.",
+ (ulong)sizeof(ulint), (ulong)sizeof(void*));
+ }
+
+ /* System tables are created in tablespace 0. Thus, we must
+ temporarily clear srv_file_per_table. This is ok, because the
+ server will not accept connections (which could modify
+ innodb_file_per_table) until this function has returned. */
+ srv_file_per_table = FALSE;
+#ifdef UNIV_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_IBUF_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n"
+# ifdef UNIV_IBUF_COUNT_DEBUG
+ "InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on !!!!!!!!!\n"
+ "InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n"
+# endif
+ );
+#endif
+
+#ifdef UNIV_SYNC_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_SEARCH_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_LOG_LSN_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n");
+#endif /* UNIV_LOG_LSN_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+ fprintf(stderr,
+ "InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ fprintf(stderr,
+ "InnoDB: The InnoDB memory heap is disabled\n");
+ }
+
+ fputs("InnoDB: " IB_ATOMICS_STARTUP_MSG
+ "\nInnoDB: Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+ " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+#ifdef UNIV_ZIP_COPY
+ " and extra copying"
+#endif /* UNIV_ZIP_COPY */
+ "\n" , stderr);
+
+ /* Since InnoDB does not currently clean up all its internal data
+ structures in MySQL Embedded Server Library server_end(), we
+ print an error message if someone tries to start up InnoDB a
+ second time during the process lifetime. */
+
+ if (srv_start_has_been_called) {
+ fprintf(stderr,
+ "InnoDB: Error: startup called second time"
+ " during the process lifetime.\n"
+ "InnoDB: In the MySQL Embedded Server Library"
+ " you cannot call server_init()\n"
+ "InnoDB: more than once during"
+ " the process lifetime.\n");
+ }
+
+ srv_start_has_been_called = TRUE;
+
+#ifdef UNIV_DEBUG
+ log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+ /* yydebug = TRUE; */
+
+ srv_is_being_started = TRUE;
+ srv_startup_is_before_trx_rollback_phase = TRUE;
+ os_aio_use_native_aio = FALSE;
+
+#ifdef __WIN__
+ switch (os_get_os_version()) {
+ case OS_WIN95:
+ case OS_WIN31:
+ case OS_WINNT:
+ /* On Win 95, 98, ME, Win32 subsystem for Windows 3.1,
+ and NT use simulated aio. In NT Windows provides async i/o,
+ but when run in conjunction with InnoDB Hot Backup, it seemed
+ to corrupt the data files. */
+
+ os_aio_use_native_aio = FALSE;
+ break;
+ default:
+ /* On Win 2000 and XP use async i/o */
+ //os_aio_use_native_aio = TRUE;
+ os_aio_use_native_aio = FALSE;
+ fprintf(stderr,
+			"InnoDB: Windows native async i/o is disabled by default.\n"
+			"InnoDB: It is not applicable to the current"
+			" multiple i/o threads implementation.\n");
+ break;
+ }
+#endif
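+	/* Map innodb_flush_method to the internal flush mode: on Unix the
+	recognized values are fsync, O_DSYNC, O_DIRECT, ALL_O_DIRECT,
+	littlesync and nosync; on Windows they are normal, unbuffered and
+	async_unbuffered. */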
+ if (srv_file_flush_method_str == NULL) {
+ /* These are the default options */
+
+ srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+#ifndef __WIN__
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) {
+ srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
+ srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
+ srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
+ srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
+ srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
+ srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
+#else
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
+ srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
+ os_aio_use_native_aio = FALSE;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+ os_aio_use_native_aio = FALSE;
+
+ } else if (0 == ut_strcmp(srv_file_flush_method_str,
+ "async_unbuffered")) {
+ srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+ os_aio_use_native_aio = TRUE;
+ srv_n_read_io_threads = srv_n_write_io_threads = 1;
+ fprintf(stderr,
+			"InnoDB: innodb_flush_method is set to 'async_unbuffered'.\n"
+			"InnoDB: Windows native async i/o is enabled,\n"
+			"InnoDB: and the i/o threads are restricted to one reader and one writer.\n");
+#endif
+ } else {
+ fprintf(stderr,
+ "InnoDB: Unrecognized value %s for"
+ " innodb_flush_method\n",
+ srv_file_flush_method_str);
+ return(DB_ERROR);
+ }
+
+ /* Note that the call srv_boot() also changes the values of
+ some variables to the units used by InnoDB internally */
+
+ /* Set the maximum number of threads which can wait for a semaphore
+ inside InnoDB: this is the 'sync wait array' size, as well as the
+ maximum number of threads that can wait in the 'srv_conc array' for
+ their time to enter InnoDB. */
+
+#if defined(__NETWARE__)
+
+ /* Create less event semaphores because Win 98/ME had
+ difficulty creating 40000 event semaphores. Comment from
+ Novell, Inc.: also, these just take a lot of memory on
+ NetWare. */
+ srv_max_n_threads = 1000;
+#else
+ if (srv_buf_pool_size >= 1000 * 1024 * 1024) {
+		/* If the buffer pool is at least 1000 MB,
+		allow many waiting threads. */
+ srv_max_n_threads = 50000;
+
+ } else if (srv_buf_pool_size >= 8 * 1024 * 1024) {
+
+ srv_max_n_threads = 10000;
+ } else {
+ srv_max_n_threads = 1000; /* saves several MB of memory,
+ especially in 64-bit
+ computers */
+ }
+#endif
+ err = srv_boot();
+
+ if (err != DB_SUCCESS) {
+
+ return((int) err);
+ }
+
+ mutex_create(&srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
+
+ if (srv_innodb_status) {
+ srv_monitor_file_name = mem_alloc(
+ strlen(fil_path_to_mysql_datadir)
+ + 20 + sizeof "/innodb_status.");
+ sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
+ fil_path_to_mysql_datadir, os_proc_get_number());
+ srv_monitor_file = fopen(srv_monitor_file_name, "w+");
+ if (!srv_monitor_file) {
+ fprintf(stderr, "InnoDB: unable to create %s: %s\n",
+ srv_monitor_file_name, strerror(errno));
+ return(DB_ERROR);
+ }
+ } else {
+ srv_monitor_file_name = NULL;
+ srv_monitor_file = os_file_create_tmpfile();
+ if (!srv_monitor_file) {
+ return(DB_ERROR);
+ }
+ }
+
+ mutex_create(&srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
+
+ srv_dict_tmpfile = os_file_create_tmpfile();
+ if (!srv_dict_tmpfile) {
+ return(DB_ERROR);
+ }
+
+ mutex_create(&srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
+
+ srv_misc_tmpfile = os_file_create_tmpfile();
+ if (!srv_misc_tmpfile) {
+ return(DB_ERROR);
+ }
+
+ /* If user has set the value of innodb_file_io_threads then
+ we'll emit a message telling the user that this parameter
+ is now deprecated. */
+ if (srv_n_file_io_threads != 4) {
+ fprintf(stderr, "InnoDB: Warning:"
+ " innodb_file_io_threads is deprecated."
+ " Please use innodb_read_io_threads and"
+ " innodb_write_io_threads instead\n");
+ }
+
+ /* Now overwrite the value on srv_n_file_io_threads */
+ srv_n_file_io_threads = 2 + srv_n_read_io_threads
+ + srv_n_write_io_threads;
+
+ ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
+
+ /* TODO: Investigate if SRV_N_PENDING_IOS_PER_THREAD (32) limit
+ still applies to windows. */
+ if (!os_aio_use_native_aio) {
+ io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD;
+ } else {
+ io_limit = SRV_N_PENDING_IOS_PER_THREAD;
+ }
+
+ os_aio_init(io_limit,
+ srv_n_read_io_threads,
+ srv_n_write_io_threads,
+ SRV_MAX_N_PENDING_SYNC_IOS);
+
+ fil_init(srv_file_per_table ? 50000 : 5000,
+ srv_max_n_open_files);
+
+ ret = buf_pool_init();
+
+ if (ret == NULL) {
+ fprintf(stderr,
+ "InnoDB: Fatal error: cannot allocate the memory"
+ " for the buffer pool\n");
+
+ return(DB_ERROR);
+ }
+
+#ifdef UNIV_DEBUG
+ /* We have observed deadlocks with a 5MB buffer pool but
+ the actual lower limit could very well be a little higher. */
+
+ if (srv_buf_pool_size <= 5 * 1024 * 1024) {
+
+ fprintf(stderr, "InnoDB: Warning: Small buffer pool size "
+ "(%luM), the flst_validate() debug function "
+ "can cause a deadlock if the buffer pool fills up.\n",
+ srv_buf_pool_size / 1024 / 1024);
+ }
+#endif
+
+ fsp_init();
+ log_init();
+
+ lock_sys_create(srv_lock_table_size);
+
+ /* Create i/o-handler threads: */
+
+ for (i = 0; i < srv_n_file_io_threads; i++) {
+ n[i] = i;
+
+ os_thread_create(io_handler_thread, n + i, thread_ids + i);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
+ fprintf(stderr,
+ "InnoDB: Error: you must set the log group"
+ " home dir in my.cnf the\n"
+ "InnoDB: same as log arch dir.\n");
+
+ return(DB_ERROR);
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ if (sizeof(ulint) == 4
+ && srv_n_log_files * srv_log_file_size
+ >= ((ulint)1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
+ fprintf(stderr,
+ "InnoDB: Error: combined size of log files"
+ " must be < 4 GB on 32-bit systems\n");
+
+ return(DB_ERROR);
+ }
+
+ sum_of_new_sizes = 0;
+
+ for (i = 0; i < srv_n_data_files; i++) {
+#ifndef __WIN__
+ if (sizeof(off_t) < 5 && srv_data_file_sizes[i] >= ((ulint)1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
+ fprintf(stderr,
+ "InnoDB: Error: file size must be < 4 GB"
+ " with this MySQL binary\n"
+ "InnoDB: and operating system combination,"
+ " in some OS's < 2 GB\n");
+
+ return(DB_ERROR);
+ }
+#endif
+ sum_of_new_sizes += srv_data_file_sizes[i];
+ }
+
+ if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Error: tablespace size must be"
+ " at least 10 MB\n");
+
+ return(DB_ERROR);
+ }
+
+ err = open_or_create_data_files(&create_new_db,
+ &create_new_doublewrite_file,
+#ifdef UNIV_LOG_ARCHIVE
+ &min_arch_log_no, &max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+ &min_flushed_lsn, &max_flushed_lsn,
+ &sum_of_new_sizes);
+ if (err != DB_SUCCESS) {
+ fprintf(stderr,
+ "InnoDB: Could not open or create data files.\n"
+ "InnoDB: If you tried to add new data files,"
+ " and it failed here,\n"
+ "InnoDB: you should now edit innodb_data_file_path"
+ " in my.cnf back\n"
+ "InnoDB: to what it was, and remove the"
+ " new ibdata files InnoDB created\n"
+ "InnoDB: in this failed attempt. InnoDB only wrote"
+ " those files full of\n"
+ "InnoDB: zeros, but did not yet use them in any way."
+ " But be careful: do not\n"
+ "InnoDB: remove old data files"
+ " which contain your precious data!\n");
+
+ return((int) err);
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ srv_normalize_path_for_win(srv_arch_dir);
+ srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ for (i = 0; i < srv_n_log_files; i++) {
+ err = open_or_create_log_file(create_new_db, &log_file_created,
+ log_opened, 0, i);
+ if (err != DB_SUCCESS) {
+
+ return((int) err);
+ }
+
+ if (log_file_created) {
+ log_created = TRUE;
+ } else {
+ log_opened = TRUE;
+ }
+ if ((log_opened && create_new_db)
+ || (log_opened && log_created)) {
+ fprintf(stderr,
+ "InnoDB: Error: all log files must be"
+ " created at the same time.\n"
+ "InnoDB: All log files must be"
+ " created also in database creation.\n"
+ "InnoDB: If you want bigger or smaller"
+ " log files, shut down the\n"
+ "InnoDB: database and make sure there"
+ " were no errors in shutdown.\n"
+ "InnoDB: Then delete the existing log files."
+ " Edit the .cnf file\n"
+ "InnoDB: and start the database again.\n");
+
+ return(DB_ERROR);
+ }
+ }
+
+ /* Open all log files and data files in the system tablespace: we
+ keep them open until database shutdown */
+
+ fil_open_log_and_system_tablespace_files();
+
+ if (log_created && !create_new_db
+#ifdef UNIV_LOG_ARCHIVE
+ && !srv_archive_recovery
+#endif /* UNIV_LOG_ARCHIVE */
+ ) {
+ if (max_flushed_lsn != min_flushed_lsn
+#ifdef UNIV_LOG_ARCHIVE
+ || max_arch_log_no != min_arch_log_no
+#endif /* UNIV_LOG_ARCHIVE */
+ ) {
+ fprintf(stderr,
+ "InnoDB: Cannot initialize created"
+ " log files because\n"
+ "InnoDB: data files were not in sync"
+ " with each other\n"
+ "InnoDB: or the data files are corrupt.\n");
+
+ return(DB_ERROR);
+ }
+
+ if (max_flushed_lsn < (ib_uint64_t) 1000) {
+ fprintf(stderr,
+ "InnoDB: Cannot initialize created"
+ " log files because\n"
+ "InnoDB: data files are corrupt,"
+ " or new data files were\n"
+ "InnoDB: created when the database"
+ " was started previous\n"
+ "InnoDB: time but the database"
+ " was not shut down\n"
+ "InnoDB: normally after that.\n");
+
+ return(DB_ERROR);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Do not + 1 arch_log_no because we do not use log
+ archiving */
+ recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE);
+#else
+ recv_reset_logs(max_flushed_lsn, TRUE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+ mutex_exit(&(log_sys->mutex));
+ }
+
+ trx_sys_file_format_init();
+
+ if (create_new_doublewrite_file) {
+ mtr_start(&mtr);
+ fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
+ mtr_commit(&mtr);
+
+ trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
+ }
+
+ if (create_new_db) {
+ mtr_start(&mtr);
+ fsp_header_init(0, sum_of_new_sizes, &mtr);
+
+ mtr_commit(&mtr);
+
+ trx_sys_create();
+ dict_create();
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+
+ if (trx_doublewrite == NULL) {
+ /* Create the doublewrite buffer here to avoid assertion error
+ about page_no of doublewrite_buf */
+ trx_sys_create_doublewrite_buf();
+ }
+
+ if (srv_extra_rsegments)
+ trx_sys_create_extra_rseg(srv_extra_rsegments);
+#ifdef UNIV_LOG_ARCHIVE
+ } else if (srv_archive_recovery) {
+ fprintf(stderr,
+ "InnoDB: Starting archive"
+ " recovery from a backup...\n");
+ err = recv_recovery_from_archive_start(
+ min_flushed_lsn, srv_archive_recovery_limit_lsn,
+ min_arch_log_no);
+ if (err != DB_SUCCESS) {
+
+ return(DB_ERROR);
+ }
+ /* Since ibuf init is in dict_boot, and ibuf is needed
+ in any disk i/o, first call dict_boot */
+
+ dict_boot();
+ trx_sys_init_at_db_start();
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+
+ /* Initialize the fsp free limit global variable in the log
+ system */
+ fsp_header_get_free_limit();
+
+ recv_recovery_from_archive_finish();
+#endif /* UNIV_LOG_ARCHIVE */
+ } else {
+
+ /* Check if we support the max format that is stamped
+ on the system tablespace.
+ Note: We are NOT allowed to make any modifications to
+ the TRX_SYS_PAGE_NO page before recovery because this
+ page also contains the max_trx_id etc. important system
+ variables that are required for recovery. We need to
+ ensure that we return the system to a state where normal
+ recovery is guaranteed to work. We do this by
+ invalidating the buffer cache, this will force the
+ reread of the page and restoration to its last known
+ consistent state, this is REQUIRED for the recovery
+ process to work. */
+ err = trx_sys_file_format_max_check(
+ srv_check_file_format_at_startup);
+
+ if (err != DB_SUCCESS) {
+ return(err);
+ }
+
+ /* Invalidate the buffer pool to ensure that we reread
+ the page that we read above, during recovery.
+ Note that this is not as heavy weight as it seems. At
+ this point there will be only ONE page in the buf_LRU
+ and there must be no page in the buf_flush list. */
+ /* buffer_pool_shm should not be reused when recovery was needed. */
+ if (!srv_buffer_pool_shm_is_reused)
+ buf_pool_invalidate();
+
+ /* We always try to do a recovery, even if the database had
+ been shut down normally: this is the normal startup path */
+
+ err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT,
+ IB_ULONGLONG_MAX,
+ min_flushed_lsn,
+ max_flushed_lsn);
+ if (err != DB_SUCCESS) {
+
+ return(DB_ERROR);
+ }
+
+ /* Since the insert buffer init is in dict_boot, and the
+ insert buffer is needed in any disk i/o, first we call
+ dict_boot(). Note that trx_sys_init_at_db_start() only needs
+ to access space 0, and the insert buffer at this stage already
+ works for space 0. */
+
+ dict_boot();
+ trx_sys_init_at_db_start();
+
+ /* Initialize the fsp free limit global variable in the log
+ system */
+ fsp_header_get_free_limit();
+
+ /* recv_recovery_from_checkpoint_finish needs trx lists which
+ are initialized in trx_sys_init_at_db_start(). */
+
+ recv_recovery_from_checkpoint_finish();
+ if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
+ /* The following call is necessary for the insert
+ buffer to work with multiple tablespaces. We must
+ know the mapping between space id's and .ibd file
+ names.
+
+ In a crash recovery, we check that the info in data
+ dictionary is consistent with what we already know
+ about space id's from the call of
+ fil_load_single_table_tablespaces().
+
+ In a normal startup, we create the space objects for
+ every table in the InnoDB data dictionary that has
+ an .ibd file.
+
+ We also determine the maximum tablespace id used. */
+
+ dict_check_tablespaces_and_store_max_id(
+ recv_needed_recovery);
+ }
+
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+ recv_recovery_rollback_active();
+
+ /* It is possible that file_format tag has never
+ been set. In this case we initialize it to minimum
+ value. Important to note that we can do it ONLY after
+ we have finished the recovery process so that the
+ image of TRX_SYS_PAGE_NO is not stale. */
+ trx_sys_file_format_tag_init();
+ }
+
+ if (!create_new_db && sum_of_new_sizes > 0) {
+ /* New data file(s) were added */
+ mtr_start(&mtr);
+
+ fsp_header_inc_size(0, sum_of_new_sizes, &mtr);
+
+ mtr_commit(&mtr);
+
+ /* Immediately write the log record about increased tablespace
+ size to disk, so that it is durable even if mysqld would crash
+ quickly */
+
+ log_buffer_flush_to_disk();
+ }
+
+#ifdef UNIV_LOG_ARCHIVE
+ /* Archiving is always off under MySQL */
+ if (!srv_log_archive_on) {
+ ut_a(DB_SUCCESS == log_archive_noarchivelog());
+ } else {
+ mutex_enter(&(log_sys->mutex));
+
+ start_archive = FALSE;
+
+ if (log_sys->archiving_state == LOG_ARCH_OFF) {
+ start_archive = TRUE;
+ }
+
+ mutex_exit(&(log_sys->mutex));
+
+ if (start_archive) {
+ ut_a(DB_SUCCESS == log_archive_archivelog());
+ }
+ }
+#endif /* UNIV_LOG_ARCHIVE */
+
+ /* fprintf(stderr, "Max allowed record size %lu\n",
+ page_get_free_space_of_empty() / 2); */
+
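+	/* Start the utility threads. Their identifiers are stored after the
+	I/O handler slots in thread_ids[]: master (+1), lock timeout (+2),
+	error monitor (+3), monitor (+4), LRU dump/restore (+5), purge (+6)
+	and the purge workers (+7 onwards). */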
+ /* Create the thread which watches the timeouts for lock waits */
+ os_thread_create(&srv_lock_timeout_thread, NULL,
+ thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+
+ /* Create the thread which warns of long semaphore waits */
+ os_thread_create(&srv_error_monitor_thread, NULL,
+ thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+ /* Create the thread which prints InnoDB monitor info */
+ os_thread_create(&srv_monitor_thread, NULL,
+ thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+
+	/* Create the thread which automatically dumps/restores the buffer pool */
+ os_thread_create(&srv_LRU_dump_restore_thread, NULL,
+ thread_ids + 5 + SRV_MAX_N_IO_THREADS);
+
+ srv_is_being_started = FALSE;
+
+ if (trx_doublewrite == NULL) {
+ /* Create the doublewrite buffer to a new tablespace */
+
+ trx_sys_create_doublewrite_buf();
+ }
+
+ err = dict_create_or_check_foreign_constraint_tables();
+
+ if (err != DB_SUCCESS) {
+ return((int)DB_ERROR);
+ }
+
+ /* Create the master thread which does purge and other utility
+ operations */
+
+ os_thread_create(&srv_master_thread, NULL, thread_ids
+ + (1 + SRV_MAX_N_IO_THREADS));
+
+ if (srv_use_purge_thread) {
+ ulint i;
+
+ os_thread_create(&srv_purge_thread, NULL, thread_ids
+ + (6 + SRV_MAX_N_IO_THREADS));
+
+ for (i = 0; i < srv_use_purge_thread - 1; i++) {
+			n[7 + i + SRV_MAX_N_IO_THREADS] = i; /* used as an index into arrays in purge_sys */
+ os_thread_create(&srv_purge_worker_thread,
+ n + (7 + i + SRV_MAX_N_IO_THREADS),
+ thread_ids + (7 + i + SRV_MAX_N_IO_THREADS));
+ }
+ }
+#ifdef UNIV_DEBUG
+ /* buf_debug_prints = TRUE; */
+#endif /* UNIV_DEBUG */
+ sum_of_data_file_sizes = 0;
+
+ for (i = 0; i < srv_n_data_files; i++) {
+ sum_of_data_file_sizes += srv_data_file_sizes[i];
+ }
+
+ tablespace_size_in_header = fsp_header_get_tablespace_size();
+
+ if (!srv_auto_extend_last_data_file
+ && sum_of_data_file_sizes != tablespace_size_in_header) {
+
+ fprintf(stderr,
+ "InnoDB: Error: tablespace size"
+ " stored in header is %lu pages, but\n"
+ "InnoDB: the sum of data file sizes is %lu pages\n",
+ (ulong) tablespace_size_in_header,
+ (ulong) sum_of_data_file_sizes);
+
+ if (srv_force_recovery == 0
+ && sum_of_data_file_sizes < tablespace_size_in_header) {
+ /* This is a fatal error, the tail of a tablespace is
+ missing */
+
+ fprintf(stderr,
+ "InnoDB: Cannot start InnoDB."
+ " The tail of the system tablespace is\n"
+ "InnoDB: missing. Have you edited"
+ " innodb_data_file_path in my.cnf in an\n"
+ "InnoDB: inappropriate way, removing"
+ " ibdata files from there?\n"
+ "InnoDB: You can set innodb_force_recovery=1"
+ " in my.cnf to force\n"
+ "InnoDB: a startup if you are trying"
+ " to recover a badly corrupt database.\n");
+
+ return(DB_ERROR);
+ }
+ }
+
+ if (srv_auto_extend_last_data_file
+ && sum_of_data_file_sizes < tablespace_size_in_header) {
+
+ fprintf(stderr,
+ "InnoDB: Error: tablespace size stored in header"
+ " is %lu pages, but\n"
+ "InnoDB: the sum of data file sizes"
+ " is only %lu pages\n",
+ (ulong) tablespace_size_in_header,
+ (ulong) sum_of_data_file_sizes);
+
+ if (srv_force_recovery == 0) {
+
+ fprintf(stderr,
+ "InnoDB: Cannot start InnoDB. The tail of"
+ " the system tablespace is\n"
+ "InnoDB: missing. Have you edited"
+ " innodb_data_file_path in my.cnf in an\n"
+ "InnoDB: inappropriate way, removing"
+ " ibdata files from there?\n"
+ "InnoDB: You can set innodb_force_recovery=1"
+ " in my.cnf to force\n"
+ "InnoDB: a startup if you are trying to"
+ " recover a badly corrupt database.\n");
+
+ return(DB_ERROR);
+ }
+ }
+
+ /* Check that os_fast_mutexes work as expected */
+ os_fast_mutex_init(&srv_os_test_mutex);
+
+ if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
+ fprintf(stderr,
+ "InnoDB: Error: pthread_mutex_trylock returns"
+ " an unexpected value on\n"
+ "InnoDB: success! Cannot continue.\n");
+ exit(1);
+ }
+
+ os_fast_mutex_unlock(&srv_os_test_mutex);
+
+ os_fast_mutex_lock(&srv_os_test_mutex);
+
+ os_fast_mutex_unlock(&srv_os_test_mutex);
+
+ os_fast_mutex_free(&srv_os_test_mutex);
+
+ if (!srv_file_per_table_original_value
+ && srv_pass_corrupt_table) {
+ fprintf(stderr, "InnoDB: Warning:"
+			" innodb_file_per_table is disabled."
+			" So innodb_pass_corrupt_table does not make sense.\n");
+ }
+
+ if (srv_print_verbose_log) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " Percona XtraDB (http://www.percona.com) %s started; "
+ "log sequence number %llu\n",
+ INNODB_VERSION_STR, srv_start_lsn);
+ }
+
+ if (srv_force_recovery > 0) {
+ fprintf(stderr,
+ "InnoDB: !!! innodb_force_recovery"
+ " is set to %lu !!!\n",
+ (ulong) srv_force_recovery);
+ }
+
+ fflush(stderr);
+
+ if (trx_doublewrite_must_reset_space_ids) {
+ /* Actually, we did not change the undo log format between
+ 4.0 and 4.1.1, and we would not need to run purge to
+ completion. Note also that the purge algorithm in 4.1.1
+ can process the history list again even after a full
+ purge, because our algorithm does not cut the end of the
+ history list in all cases so that it would become empty
+		after a full purge. That means that we may purge 4.0-type
+ undo log even after this phase.
+
+ The insert buffer record format changed between 4.0 and
+ 4.1.1. It is essential that the insert buffer is emptied
+ here! */
+
+ fprintf(stderr,
+ "InnoDB: You are upgrading to an"
+ " InnoDB version which allows multiple\n"
+			"InnoDB: tablespaces. Wait until purge"
+			" and insert buffer merge have run to\n"
+ "InnoDB: completion...\n");
+ for (;;) {
+ os_thread_sleep(1000000);
+
+ if (0 == strcmp(srv_main_thread_op_info,
+ "waiting for server activity")) {
+
+ ut_a(ibuf_is_empty());
+
+ break;
+ }
+ }
+ fprintf(stderr,
+ "InnoDB: Full purge and insert buffer merge"
+ " completed.\n");
+
+ trx_sys_mark_upgraded_to_multiple_tablespaces();
+
+ fprintf(stderr,
+ "InnoDB: You have now successfully upgraded"
+ " to the multiple tablespaces\n"
+ "InnoDB: format. You should NOT DOWNGRADE"
+ " to an earlier version of\n"
+ "InnoDB: InnoDB! But if you absolutely need to"
+ " downgrade, see\n"
+ "InnoDB: " REFMAN "multiple-tablespaces.html\n"
+ "InnoDB: for instructions.\n");
+ }
+
+ if (srv_force_recovery == 0) {
+ /* In the insert buffer we may have even bigger tablespace
+ id's, because we may have dropped those tablespaces, but
+ insert buffer merge has not had time to clean the records from
+ the ibuf tree. */
+
+ ibuf_update_max_tablespace_id();
+ }
+
+ srv_file_per_table = srv_file_per_table_original_value;
+
+ srv_was_started = TRUE;
+
+ return((int) DB_SUCCESS);
+}
+
+/****************************************************************//**
+Shuts down the InnoDB database.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_shutdown_for_mysql(void)
+/*=============================*/
+{
+ ulint i;
+#ifdef __NETWARE__
+ extern ibool panic_shutdown;
+#endif
+ if (!srv_was_started) {
+ if (srv_is_being_started) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: shutting down"
+ " a not properly started\n"
+ "InnoDB: or created database!\n");
+ }
+
+ return(DB_SUCCESS);
+ }
+
+ /* 1. Flush the buffer pool to disk, write the current lsn to
+ the tablespace header(s), and copy all log data to archive.
+	Step 1 is the real InnoDB shutdown; the remaining steps
+	just free data structures after the shutdown. */
+
+
+ if (srv_fast_shutdown == 2) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: MySQL has requested a very fast shutdown"
+ " without flushing "
+ "the InnoDB buffer pool to data files."
+ " At the next mysqld startup "
+ "InnoDB will do a crash recovery!\n");
+ }
+
+#ifdef __NETWARE__
+ if (!panic_shutdown)
+#endif
+ logs_empty_and_mark_files_at_shutdown();
+
+ if (srv_conc_n_threads != 0) {
+ fprintf(stderr,
+ "InnoDB: Warning: query counter shows %ld queries"
+ " still\n"
+ "InnoDB: inside InnoDB at shutdown\n",
+ srv_conc_n_threads);
+ }
+
+ /* 2. Make all threads created by InnoDB to exit */
+
+ srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
+
+ /* In a 'very fast' shutdown, we do not need to wait for these threads
+ to die; all which counts is that we flushed the log; a 'very fast'
+ shutdown is essentially a crash. */
+
+ if (srv_fast_shutdown == 2) {
+ return(DB_SUCCESS);
+ }
+
+ /* All threads end up waiting for certain events. Put those events
+ to the signaled state. Then the threads will exit themselves in
+ os_thread_event_wait(). */
+
+ for (i = 0; i < 1000; i++) {
+ /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
+ HERE OR EARLIER */
+
+ /* a. Let the lock timeout thread exit */
+ os_event_set(srv_lock_timeout_thread_event);
+
+ /* b. srv error monitor thread exits automatically, no need
+ to do anything here */
+
+ /* c. We wake the master thread so that it exits */
+ srv_wake_master_thread();
+
+ /* d. Exit the i/o threads */
+
+ os_aio_wake_all_threads_at_shutdown();
+
+ os_mutex_enter(os_sync_mutex);
+
+ if (os_thread_count == 0) {
+ /* All the threads have exited or are just exiting;
+ NOTE that the threads may not have completed their
+ exit yet. Should we use pthread_join() to make sure
+ they have exited? If we did, we would have to
+ remove the pthread_detach() from
+ os_thread_exit(). Now we just sleep 0.1
+ seconds and hope that is enough! */
+
+ os_mutex_exit(os_sync_mutex);
+
+ os_thread_sleep(100000);
+
+ break;
+ }
+
+ os_mutex_exit(os_sync_mutex);
+
+ os_thread_sleep(100000);
+ }
+
+ if (i == 1000) {
+ fprintf(stderr,
+ "InnoDB: Warning: %lu threads created by InnoDB"
+ " had not exited at shutdown!\n",
+ (ulong) os_thread_count);
+ }
+
+ if (srv_monitor_file) {
+ fclose(srv_monitor_file);
+ srv_monitor_file = 0;
+ if (srv_monitor_file_name) {
+ unlink(srv_monitor_file_name);
+ mem_free(srv_monitor_file_name);
+ }
+ }
+ if (srv_dict_tmpfile) {
+ fclose(srv_dict_tmpfile);
+ srv_dict_tmpfile = 0;
+ }
+
+ if (srv_misc_tmpfile) {
+ fclose(srv_misc_tmpfile);
+ srv_misc_tmpfile = 0;
+ }
+
+ /* This must be disabled before closing the buffer pool
+ and closing the data dictionary. */
+ btr_search_disable();
+
+ ibuf_close();
+ log_shutdown();
+ lock_sys_close();
+ thr_local_close();
+ trx_sys_file_format_close();
+ trx_sys_close();
+
+ mutex_free(&srv_monitor_file_mutex);
+ mutex_free(&srv_dict_tmpfile_mutex);
+ mutex_free(&srv_misc_tmpfile_mutex);
+ dict_close();
+ btr_search_sys_free();
+
+ /* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
+ them */
+ os_aio_free();
+ sync_close();
+ srv_free();
+ fil_close();
+
+ /* 4. Free the os_conc_mutex and all os_events and os_mutexes */
+
+ os_sync_free();
+
+ /* 5. Free all allocated memory */
+
+ pars_lexer_close();
+ log_mem_free();
+ buf_pool_free();
+ mem_close();
+
+	/* ut_free_all_mem() frees all allocated memory not yet freed
+	during shutdown, and it also frees the ut_list_mutex, so it
+	should be the very last operation */
+ ut_free_all_mem();
+
+ if (os_thread_count != 0
+ || os_event_count != 0
+ || os_mutex_count != 0
+ || os_fast_mutex_count != 0) {
+ fprintf(stderr,
+ "InnoDB: Warning: some resources were not"
+ " cleaned up in shutdown:\n"
+ "InnoDB: threads %lu, events %lu,"
+ " os_mutexes %lu, os_fast_mutexes %lu\n",
+ (ulong) os_thread_count, (ulong) os_event_count,
+ (ulong) os_mutex_count, (ulong) os_fast_mutex_count);
+ }
+
+ if (dict_foreign_err_file) {
+ fclose(dict_foreign_err_file);
+ }
+ if (lock_latest_err_file) {
+ fclose(lock_latest_err_file);
+ }
+
+ if (srv_print_verbose_log) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Shutdown completed;"
+ " log sequence number %llu\n",
+ srv_shutdown_lsn);
+ }
+
+ srv_was_started = FALSE;
+ srv_start_has_been_called = FALSE;
+
+ return((int) DB_SUCCESS);
+}
+
+#ifdef __NETWARE__
+void set_panic_flag_for_netware()
+{
+ extern ibool panic_shutdown;
+ panic_shutdown = TRUE;
+}
+#endif /* __NETWARE__ */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.c
new file mode 100644
index 00000000000..223e1715944
--- /dev/null
+++ b/storage/xtradb/sync/sync0arr.c
@@ -0,0 +1,1023 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0arr.c
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0arr.h"
+#ifdef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "os0sync.h"
+#include "os0file.h"
+#include "srv0srv.h"
+
+/*
+ WAIT ARRAY
+ ==========
+
+The wait array consists of cells each of which has an
+operating system event object created for it. The threads
+waiting for a mutex, for example, can reserve a cell
+in the array and suspend themselves to wait for the event
+to become signaled. When using the wait array, remember to make
+sure that some thread holding the synchronization object
+will eventually know that there is a waiter in the array and
+signal the object, to prevent an infinite wait.
+Why did we choose to implement a wait array? First, to make
+mutexes fast, we had to code our own implementation of them,
+which only in uncommon cases resorts to using
+slow operating system primitives. Then we had the choice of
+assigning a unique OS event for each mutex, which would
+be simpler, or using a global wait array. In some operating systems,
+the global wait array solution is more efficient and flexible,
+because we can make do with a very small number of OS events,
+say 200. In NT 3.51, allocating events seems to be a quadratic
+algorithm: 10 000 events are created quickly, but creating
+100 000 events takes a couple of minutes.
+
+As of 5.0.30 the above-mentioned design has changed. Since the OS
+can now handle millions of wait events efficiently, we no longer
+have the concept of each wait array cell having its own event.
+Instead, the event that a thread wants to wait on is now embedded
+in the wait object (mutex or rw_lock). We still keep the global
+wait array for the sake of diagnostics and also to avoid infinite
+waits: the error_monitor thread scans the global wait array to signal
+any waiting threads that have missed the signal. */
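+
+/* Editor's note: the compiled-out sketch below is not part of the original
+patch. It only illustrates the canonical use of the wait array described
+above, following the same pattern as rw_lock_s_lock_spin() in sync0rw.c:
+reserve a cell (which resets the embedded event and records its signal
+count), mark that a waiter exists, re-check the wait condition, and then
+either free the cell or suspend on it. The function name is hypothetical. */
+#if 0
+static
+void
+sync_array_usage_sketch(
+/*====================*/
+	rw_lock_t*	lock)	/* in: lock the caller wants to s-lock */
+{
+	ulint	index;
+
+	sync_array_reserve_cell(sync_primary_wait_array, lock,
+				RW_LOCK_SHARED, __FILE__, __LINE__, &index);
+
+	/* Set the waiters flag before re-checking, so that the unlocking
+	thread knows it must send a wake-up signal. */
+	rw_lock_set_waiter_flag(lock);
+
+	if (rw_lock_s_lock_low(lock, 0, __FILE__, __LINE__)) {
+		/* The lock became available meanwhile: no need to wait. */
+		sync_array_free_cell(sync_primary_wait_array, index);
+
+		return;
+	}
+
+	/* Suspend until signalled; sync_array_wait_event() frees the cell. */
+	sync_array_wait_event(sync_primary_wait_array, index);
+}
+#endif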
+
+/** A cell where an individual thread may wait suspended
+until a resource is released. The suspending is implemented
+using an operating system event semaphore. */
+struct sync_cell_struct {
+ void* wait_object; /*!< pointer to the object the
+ thread is waiting for; if NULL
+ the cell is free for use */
+ mutex_t* old_wait_mutex; /*!< the latest wait mutex in cell */
+ rw_lock_t* old_wait_rw_lock;
+ /*!< the latest wait rw-lock
+ in cell */
+ ulint request_type; /*!< lock type requested on the
+ object */
+ const char* file; /*!< in debug version file where
+ requested */
+ ulint line; /*!< in debug version line where
+ requested */
+ os_thread_id_t thread; /*!< thread id of this waiting
+ thread */
+ ibool waiting; /*!< TRUE if the thread has already
+ called sync_array_event_wait
+ on this cell */
+ ib_int64_t signal_count; /*!< We capture the signal_count
+ of the wait_object when we
+ reset the event. This value is
+ then passed on to os_event_wait
+ and we wait only if the event
+ has not been signalled in the
+ period between the reset and
+ wait call. */
+ time_t reservation_time;/*!< time when the thread reserved
+ the wait cell */
+};
+
+/* NOTE: It is allowed for a thread to wait
+for an event allocated for the array without owning the
+protecting mutex (depending on the case: OS or database mutex), but
+all changes (set or reset) to the state of the event must be made
+while owning the mutex. */
+
+/** Synchronization array */
+struct sync_array_struct {
+ ulint n_reserved; /*!< number of currently reserved
+ cells in the wait array */
+ ulint n_cells; /*!< number of cells in the
+ wait array */
+ sync_cell_t* array; /*!< pointer to wait array */
+ ulint protection; /*!< this flag tells which
+ mutex protects the data */
+ mutex_t mutex; /*!< possible database mutex
+ protecting this data structure */
+ os_mutex_t os_mutex; /*!< Possible operating system mutex
+ protecting the data structure.
+ As this data structure is used in
+ constructing the database mutex,
+ to prevent infinite recursion
+ in implementation, we fall back to
+ an OS mutex. */
+ ulint sg_count; /*!< count of how many times an
+ object has been signalled */
+ ulint res_count; /*!< count of cell reservations
+ since creation of the array */
+};
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth); /*!< in: recursion depth */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*****************************************************************//**
+Gets the nth cell in array.
+@return cell */
+static
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: sync array */
+ ulint n) /*!< in: index */
+{
+ ut_a(arr);
+ ut_a(n < arr->n_cells);
+
+ return(arr->array + n);
+}
+
+/******************************************************************//**
+Reserves the mutex semaphore protecting a sync array. */
+static
+void
+sync_array_enter(
+/*=============*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ ulint protection;
+
+ protection = arr->protection;
+
+ if (protection == SYNC_ARRAY_OS_MUTEX) {
+ os_mutex_enter(arr->os_mutex);
+ } else if (protection == SYNC_ARRAY_MUTEX) {
+ mutex_enter(&(arr->mutex));
+ } else {
+ ut_error;
+ }
+}
+
+/******************************************************************//**
+Releases the mutex semaphore protecting a sync array. */
+static
+void
+sync_array_exit(
+/*============*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ ulint protection;
+
+ protection = arr->protection;
+
+ if (protection == SYNC_ARRAY_OS_MUTEX) {
+ os_mutex_exit(arr->os_mutex);
+ } else if (protection == SYNC_ARRAY_MUTEX) {
+ mutex_exit(&(arr->mutex));
+ } else {
+ ut_error;
+ }
+}
+
+/*******************************************************************//**
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@return own: created wait array */
+UNIV_INTERN
+sync_array_t*
+sync_array_create(
+/*==============*/
+ ulint n_cells, /*!< in: number of cells in the array
+ to create */
+ ulint protection) /*!< in: either SYNC_ARRAY_OS_MUTEX or
+ SYNC_ARRAY_MUTEX: determines the type
+ of mutex protecting the data structure */
+{
+ ulint sz;
+ sync_array_t* arr;
+
+ ut_a(n_cells > 0);
+
+ /* Allocate memory for the data structures */
+ arr = ut_malloc(sizeof(sync_array_t));
+ memset(arr, 0x0, sizeof(*arr));
+
+ sz = sizeof(sync_cell_t) * n_cells;
+ arr->array = ut_malloc(sz);
+ memset(arr->array, 0x0, sz);
+
+ arr->n_cells = n_cells;
+ arr->protection = protection;
+
+ /* Then create the mutex to protect the wait array complex */
+ if (protection == SYNC_ARRAY_OS_MUTEX) {
+ arr->os_mutex = os_mutex_create(NULL);
+ } else if (protection == SYNC_ARRAY_MUTEX) {
+ mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK);
+ } else {
+ ut_error;
+ }
+
+ return(arr);
+}
+
+/******************************************************************//**
+Frees the resources in a wait array. */
+UNIV_INTERN
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr) /*!< in, own: sync wait array */
+{
+ ulint protection;
+
+ ut_a(arr->n_reserved == 0);
+
+ sync_array_validate(arr);
+
+ protection = arr->protection;
+
+ /* Release the mutex protecting the wait array complex */
+
+ if (protection == SYNC_ARRAY_OS_MUTEX) {
+ os_mutex_free(arr->os_mutex);
+ } else if (protection == SYNC_ARRAY_MUTEX) {
+ mutex_free(&(arr->mutex));
+ } else {
+ ut_error;
+ }
+
+ ut_free(arr->array);
+ ut_free(arr);
+}
+
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ ulint i;
+ sync_cell_t* cell;
+ ulint count = 0;
+
+ sync_array_enter(arr);
+
+ for (i = 0; i < arr->n_cells; i++) {
+ cell = sync_array_get_nth_cell(arr, i);
+ if (cell->wait_object != NULL) {
+ count++;
+ }
+ }
+
+ ut_a(count == arr->n_reserved);
+
+ sync_array_exit(arr);
+}
+
+/*******************************************************************//**
+Returns the event that the thread owning the cell waits for. */
+static
+os_event_t
+sync_cell_get_event(
+/*================*/
+ sync_cell_t* cell) /*!< in: non-empty sync array cell */
+{
+ ulint type = cell->request_type;
+
+ if (type == SYNC_MUTEX) {
+ return(((mutex_t *) cell->wait_object)->event);
+ } else if (type == RW_LOCK_WAIT_EX) {
+ return(((rw_lock_t *) cell->wait_object)->wait_ex_event);
+ } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
+ return(((rw_lock_t *) cell->wait_object)->event);
+ }
+}
+
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+UNIV_INTERN
+void
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index) /*!< out: index of the reserved cell */
+{
+ sync_cell_t* cell;
+ os_event_t event;
+ ulint i;
+
+ ut_a(object);
+ ut_a(index);
+
+ sync_array_enter(arr);
+
+ arr->res_count++;
+
+ /* Reserve a new cell. */
+ for (i = 0; i < arr->n_cells; i++) {
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object == NULL) {
+
+ cell->waiting = FALSE;
+ cell->wait_object = object;
+
+ if (type == SYNC_MUTEX) {
+ cell->old_wait_mutex = object;
+ } else {
+ cell->old_wait_rw_lock = object;
+ }
+
+ cell->request_type = type;
+
+ cell->file = file;
+ cell->line = line;
+
+ arr->n_reserved++;
+
+ *index = i;
+
+ sync_array_exit(arr);
+
+ /* Make sure the event is reset and also store
+ the value of signal_count at which the event
+ was reset. */
+ event = sync_cell_get_event(cell);
+ cell->signal_count = os_event_reset(event);
+
+ cell->reservation_time = time(NULL);
+
+ cell->thread = os_thread_get_curr_id();
+
+ return;
+ }
+ }
+
+ ut_error; /* No free cell found */
+
+ return;
+}
+
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case it prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index) /*!< in: index of the reserved cell */
+{
+ sync_cell_t* cell;
+ os_event_t event;
+
+ ut_a(arr);
+
+ sync_array_enter(arr);
+
+ cell = sync_array_get_nth_cell(arr, index);
+
+ ut_a(cell->wait_object);
+ ut_a(!cell->waiting);
+ ut_ad(os_thread_get_curr_id() == cell->thread);
+
+ event = sync_cell_get_event(cell);
+ cell->waiting = TRUE;
+
+#ifdef UNIV_SYNC_DEBUG
+
+	/* We use a simple enter on the mutex below, because if
+	we cannot acquire it at once, mutex_enter would recursively
+	call sync_array routines, leading to trouble.
+	rw_lock_debug_mutex freezes the debug lists. */
+
+ rw_lock_debug_mutex_enter();
+
+ if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) {
+
+ fputs("########################################\n", stderr);
+ ut_error;
+ }
+
+ rw_lock_debug_mutex_exit();
+#endif
+ sync_array_exit(arr);
+
+ os_event_wait_low(event, cell->signal_count);
+
+ sync_array_free_cell(arr, index);
+}
+
+/******************************************************************//**
+Reports info of a wait array cell. */
+static
+void
+sync_array_cell_print(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_cell_t* cell) /*!< in: sync cell */
+{
+ mutex_t* mutex;
+ rw_lock_t* rwlock;
+ ulint type;
+ ulint writer;
+
+ type = cell->request_type;
+
+ fprintf(file,
+ "--Thread %lu has waited at %s line %lu"
+ " for %#.5g seconds the semaphore:\n",
+ (ulong) os_thread_pf(cell->thread), cell->file,
+ (ulong) cell->line,
+ difftime(time(NULL), cell->reservation_time));
+
+ if (type == SYNC_MUTEX) {
+ /* We use old_wait_mutex in case the cell has already
+ been freed meanwhile */
+ mutex = cell->old_wait_mutex;
+
+ fprintf(file,
+ "Mutex at %p '%s', lock var %lu\n"
+#ifdef UNIV_SYNC_DEBUG
+ "Last time reserved in file %s line %lu, "
+#endif /* UNIV_SYNC_DEBUG */
+ "waiters flag %lu\n",
+ (void*) mutex, mutex->cmutex_name,
+ (ulong) mutex->lock_word,
+#ifdef UNIV_SYNC_DEBUG
+ mutex->file_name, (ulong) mutex->line,
+#endif /* UNIV_SYNC_DEBUG */
+ (ulong) mutex->waiters);
+
+ } else if (type == RW_LOCK_EX
+ || type == RW_LOCK_WAIT_EX
+ || type == RW_LOCK_SHARED) {
+
+ fputs(type == RW_LOCK_EX ? "X-lock on"
+ : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
+ : "S-lock on", file);
+
+ rwlock = cell->old_wait_rw_lock;
+
+ fprintf(file,
+ " RW-latch at %p '%s'\n",
+ (void*) rwlock, rwlock->lock_name);
+ writer = rw_lock_get_writer(rwlock);
+ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has"
+ " reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+ "number of readers %lu, waiters flag %lu, "
+ "lock_word: %lx\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+ (ulong) rw_lock_get_reader_count(rwlock),
+ (ulong) rwlock->waiters,
+ rwlock->lock_word,
+ rwlock->last_s_file_name,
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+ (ulong) rwlock->last_x_line);
+ } else {
+ ut_error;
+ }
+
+ if (!cell->waiting) {
+ fputs("wait has ended\n", file);
+ }
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Looks for a cell with the given thread id.
+@return pointer to cell or NULL if not found */
+static
+sync_cell_t*
+sync_array_find_thread(
+/*===================*/
+ sync_array_t* arr, /*!< in: wait array */
+ os_thread_id_t thread) /*!< in: thread id */
+{
+ ulint i;
+ sync_cell_t* cell;
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL
+ && os_thread_eq(cell->thread, thread)) {
+
+ return(cell); /* Found */
+ }
+ }
+
+ return(NULL); /* Not found */
+}
+
+/******************************************************************//**
+Recursion step for deadlock detection.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_deadlock_step(
+/*=====================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search
+ started */
+ os_thread_id_t thread, /*!< in: thread to look at */
+ ulint pass, /*!< in: pass value */
+ ulint depth) /*!< in: recursion depth */
+{
+ sync_cell_t* new;
+ ibool ret;
+
+ depth++;
+
+ if (pass != 0) {
+		/* If pass != 0, then we do not know which threads are
+		responsible for releasing the lock, and no deadlock can
+		be detected. */
+
+ return(FALSE);
+ }
+
+ new = sync_array_find_thread(arr, thread);
+
+ if (new == start) {
+		/* Stop the other threads from running */
+
+ ut_dbg_stop_threads = TRUE;
+
+ /* Deadlock */
+ fputs("########################################\n"
+ "DEADLOCK of threads detected!\n", stderr);
+
+ return(TRUE);
+
+ } else if (new) {
+ ret = sync_array_detect_deadlock(arr, start, new, depth);
+
+ if (ret) {
+ return(TRUE);
+ }
+ }
+ return(FALSE);
+}
+
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth) /*!< in: recursion depth */
+{
+ mutex_t* mutex;
+ rw_lock_t* lock;
+ os_thread_id_t thread;
+ ibool ret;
+ rw_lock_debug_t*debug;
+
+ ut_a(arr);
+ ut_a(start);
+ ut_a(cell);
+ ut_ad(cell->wait_object);
+ ut_ad(os_thread_get_curr_id() == start->thread);
+ ut_ad(depth < 100);
+
+ depth++;
+
+ if (!cell->waiting) {
+
+ return(FALSE); /* No deadlock here */
+ }
+
+ if (cell->request_type == SYNC_MUTEX) {
+
+ mutex = cell->wait_object;
+
+ if (mutex_get_lock_word(mutex) != 0) {
+
+ thread = mutex->thread_id;
+
+			/* Note that mutex->thread_id above may also
+			be OS_THREAD_ID_UNDEFINED, because the
+			thread which held the mutex may not have
+			updated the value yet, or may have already
+			released the mutex: in this case no deadlock
+			can occur, as the wait array cannot contain
+			a thread with an ID_UNDEFINED value. */
+
+ ret = sync_array_deadlock_step(arr, start, thread, 0,
+ depth);
+ if (ret) {
+ fprintf(stderr,
+ "Mutex %p owned by thread %lu file %s line %lu\n",
+ mutex, (ulong) os_thread_pf(mutex->thread_id),
+ mutex->file_name, (ulong) mutex->line);
+ sync_array_cell_print(stderr, cell);
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE); /* No deadlock */
+
+ } else if (cell->request_type == RW_LOCK_EX
+ || cell->request_type == RW_LOCK_WAIT_EX) {
+
+ lock = cell->wait_object;
+
+ debug = UT_LIST_GET_FIRST(lock->debug_list);
+
+ while (debug != NULL) {
+
+ thread = debug->thread_id;
+
+ if (((debug->lock_type == RW_LOCK_EX)
+ && !os_thread_eq(thread, cell->thread))
+ || ((debug->lock_type == RW_LOCK_WAIT_EX)
+ && !os_thread_eq(thread, cell->thread))
+ || (debug->lock_type == RW_LOCK_SHARED)) {
+
+				/* The (wait) x-lock request can block
+				indefinitely only if someone (possibly the
+				cell thread itself) is holding an s-lock,
+				or someone else (not the cell thread) holds
+				a (wait) x-lock and is blocked by the start
+				thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+ if (ret) {
+print:
+ fprintf(stderr, "rw-lock %p ",
+ (void*) lock);
+ sync_array_cell_print(stderr, cell);
+ rw_lock_debug_print(debug);
+ return(TRUE);
+ }
+ }
+
+ debug = UT_LIST_GET_NEXT(list, debug);
+ }
+
+ return(FALSE);
+
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+
+ lock = cell->wait_object;
+ debug = UT_LIST_GET_FIRST(lock->debug_list);
+
+ while (debug != NULL) {
+
+ thread = debug->thread_id;
+
+ if ((debug->lock_type == RW_LOCK_EX)
+ || (debug->lock_type == RW_LOCK_WAIT_EX)) {
+
+				/* The s-lock request can block indefinitely
+				only if someone (possibly the cell thread
+				itself) is holding a (wait) x-lock and is
+				blocked by the start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+ if (ret) {
+ goto print;
+ }
+ }
+
+ debug = UT_LIST_GET_NEXT(list, debug);
+ }
+
+ return(FALSE);
+
+ } else {
+ ut_error;
+ }
+
+	return(TRUE);	/* Execution never reaches this line: it exists
+			only to keep the compiler happy */
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Determines if we can wake up the thread waiting for a semaphore. */
+static
+ibool
+sync_arr_cell_can_wake_up(
+/*======================*/
+ sync_cell_t* cell) /*!< in: cell to search */
+{
+ mutex_t* mutex;
+ rw_lock_t* lock;
+
+ if (cell->request_type == SYNC_MUTEX) {
+
+ mutex = cell->wait_object;
+
+ if (mutex_get_lock_word(mutex) == 0) {
+
+ return(TRUE);
+ }
+
+ } else if (cell->request_type == RW_LOCK_EX) {
+
+ lock = cell->wait_object;
+
+ if (lock->lock_word > 0) {
+ /* Either unlocked or only read locked. */
+
+ return(TRUE);
+ }
+
+ } else if (cell->request_type == RW_LOCK_WAIT_EX) {
+
+ lock = cell->wait_object;
+
+ /* lock_word == 0 means all readers have left */
+ if (lock->lock_word == 0) {
+
+ return(TRUE);
+ }
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+ lock = cell->wait_object;
+
+ /* lock_word > 0 means no writer or reserved writer */
+ if (lock->lock_word > 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index) /*!< in: index of the cell in array */
+{
+ sync_cell_t* cell;
+
+ sync_array_enter(arr);
+
+ cell = sync_array_get_nth_cell(arr, index);
+
+ ut_a(cell->wait_object != NULL);
+
+ cell->waiting = FALSE;
+ cell->wait_object = NULL;
+ cell->signal_count = 0;
+
+ ut_a(arr->n_reserved > 0);
+ arr->n_reserved--;
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Increments the signalled count. */
+UNIV_INTERN
+void
+sync_array_object_signalled(
+/*========================*/
+ sync_array_t* arr) /*!< in: wait array */
+{
+#ifdef HAVE_ATOMIC_BUILTINS
+ (void) os_atomic_increment_ulint(&arr->sg_count, 1);
+#else
+ sync_array_enter(arr);
+
+ arr->sg_count++;
+
+ sync_array_exit(arr);
+#endif
+}
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about once a second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void)
+/*====================================*/
+{
+ sync_array_t* arr = sync_primary_wait_array;
+ sync_cell_t* cell;
+ ulint count;
+ ulint i;
+ os_event_t event;
+
+ sync_array_enter(arr);
+
+ i = 0;
+ count = 0;
+
+ while (count < arr->n_reserved) {
+
+ cell = sync_array_get_nth_cell(arr, i);
+ i++;
+
+ if (cell->wait_object == NULL) {
+ continue;
+ }
+ count++;
+
+ if (sync_arr_cell_can_wake_up(cell)) {
+
+ event = sync_cell_get_event(cell);
+
+ os_event_set(event);
+ }
+
+ }
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(void)
+/*=============================*/
+{
+ sync_cell_t* cell;
+ ibool old_val;
+ ibool noticed = FALSE;
+ ulint i;
+ ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
+ ibool fatal = FALSE;
+
+ for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+
+ cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
+
+ if (cell->wait_object != NULL && cell->waiting
+ && difftime(time(NULL), cell->reservation_time) > 240) {
+ fputs("InnoDB: Warning: a long semaphore wait:\n",
+ stderr);
+ sync_array_cell_print(stderr, cell);
+ noticed = TRUE;
+ }
+
+ if (cell->wait_object != NULL && cell->waiting
+ && difftime(time(NULL), cell->reservation_time)
+ > fatal_timeout) {
+ fatal = TRUE;
+ }
+ }
+
+ if (noticed) {
+ fprintf(stderr,
+ "InnoDB: ###### Starts InnoDB Monitor"
+ " for 30 secs to print diagnostic info:\n");
+ old_val = srv_print_innodb_monitor;
+
+		/* If some crucial semaphore is reserved, then the InnoDB
+		Monitor itself can also hang, and we get no diagnostics.
+		Since in many cases an InnoDB hang is caused by a pwrite()
+		or a pread() call hanging inside the operating system, let
+		us print the number of pending calls of these right now. */
+
+ fprintf(stderr,
+ "InnoDB: Pending preads %lu, pwrites %lu\n",
+ (ulong)os_file_n_pending_preads,
+ (ulong)os_file_n_pending_pwrites);
+
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(srv_lock_timeout_thread_event);
+
+ os_thread_sleep(30000000);
+
+ srv_print_innodb_monitor = old_val;
+ fprintf(stderr,
+ "InnoDB: ###### Diagnostic info printed"
+ " to the standard error stream\n");
+ }
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_output_info(
+/*===================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array; NOTE! caller must own the
+ mutex */
+{
+ sync_cell_t* cell;
+ ulint count;
+ ulint i;
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
+ (long) arr->res_count, (long) arr->sg_count);
+ i = 0;
+ count = 0;
+
+ while (count < arr->n_reserved) {
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL) {
+ count++;
+ sync_array_cell_print(file, cell);
+ }
+
+ i++;
+ }
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+UNIV_INTERN
+void
+sync_array_print_info(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ sync_array_enter(arr);
+
+ sync_array_output_info(file, arr);
+
+ sync_array_exit(arr);
+}
diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.c
new file mode 100644
index 00000000000..9e10f6e943b
--- /dev/null
+++ b/storage/xtradb/sync/sync0rw.c
@@ -0,0 +1,1037 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0rw.c
+The read-write lock (for thread synchronization)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0rw.h"
+#ifdef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+
+#include "os0thread.h"
+#include "mem0mem.h"
+#include "srv0srv.h"
+#include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+
+/*
+ IMPLEMENTATION OF THE RW_LOCK
+ =============================
+The status of a rw_lock is held in lock_word. The initial value of lock_word is
+X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR
+for each x-lock. This describes the lock state for each value of lock_word:
+
+lock_word == X_LOCK_DECR: Unlocked.
+0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers.
+ (X_LOCK_DECR - lock_word) is the
+ number of readers that hold the lock.
+lock_word == 0: Write locked
+-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer.
+ (-lock_word) is the number of readers
+ that hold the lock.
+lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been
+ decremented by X_LOCK_DECR once for each lock,
+ so the number of locks is:
+ ((-lock_word) / X_LOCK_DECR) + 1
+When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0:
+other values of lock_word are invalid.
+
+The lock_word is always read and updated atomically and consistently, so that
+it always represents the state of the lock, and the state of the lock changes
+with a single atomic operation. This lock_word holds all of the information
+that a thread needs in order to determine if it is eligible to gain the lock
+or if it must spin or sleep. The one exception to this is that writer_thread
+must be verified before recursive write locks: to solve this scenario, we make
+writer_thread readable by all threads, but only writeable by the x-lock holder.
+
+The other members of the lock obey the following rules to remain consistent:
+
+recursive: This and the writer_thread field together control the
+ behaviour of recursive x-locking.
+ lock->recursive must be FALSE in following states:
+ 1) The writer_thread contains garbage i.e.: the
+ lock has just been initialized.
+ 2) The lock is not x-held and there is no
+ x-waiter waiting on WAIT_EX event.
+ 3) The lock is x-held or there is an x-waiter
+ waiting on WAIT_EX event but the 'pass' value
+ is non-zero.
+ lock->recursive is TRUE iff:
+ 1) The lock is x-held or there is an x-waiter
+ waiting on WAIT_EX event and the 'pass' value
+ is zero.
+ This flag must be set after the writer_thread field
+ has been updated with a memory ordering barrier.
+ It is unset before the lock_word has been incremented.
+writer_thread:	Is used only in recursive x-locking. Can only be safely
+		read when the lock->recursive flag is TRUE.
+		This field is uninitialized at lock creation time and
+		is updated atomically when an x-lock is acquired or when
+		move_ownership is called. A thread is only allowed to
+		set the value of this field to its own thread_id, i.e. a
+		thread cannot set writer_thread to some other thread's
+		id.
+waiters:	May be set to 1 anytime, but to avoid unnecessary wake-up
+		signals, it should only be set to 1 when there are threads
+		waiting on event. Must be 1 when a writer starts waiting to
+		ensure the current x-locking thread sends a wake-up signal
+		during unlock. May only be reset to 0 immediately before
+		a wake-up signal is sent to event. On most platforms, a
+		memory barrier is required after waiters is set, and before
+		verifying lock_word is still held, to ensure some unlocker
+		really does see the flag's new value.
+event: Threads wait on event for read or writer lock when another
+ thread has an x-lock or an x-lock reservation (wait_ex). A
+ thread may only wait on event after performing the following
+ actions in order:
+ (1) Record the counter value of event (with os_event_reset).
+ (2) Set waiters to 1.
+ (3) Verify lock_word <= 0.
+ (1) must come before (2) to ensure signal is not missed.
+ (2) must come before (3) to ensure a signal is sent.
+ These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should:
+ (1) Verify lock_word == X_LOCK_DECR (unlocked)
+ (2) Reset waiters to 0.
+wait_ex_event: A thread may only wait on the wait_ex_event after it has
+ performed the following actions in order:
+ (1) Decrement lock_word by X_LOCK_DECR.
+ (2) Record counter value of wait_ex_event (os_event_reset,
+ called from sync_array_reserve_cell).
+ (3) Verify that lock_word < 0.
+		(1) must come first to ensure no other thread becomes a reader
+		or the next writer, and to notify the unlocker that a signal
+		must be sent.
+ (2) must come before (3) to ensure the signal is not missed.
+ These restrictions force the above ordering.
+ Immediately before sending the wake-up signal, we should:
+ Verify lock_word == 0 (waiting thread holds x_lock)
+*/
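+
+/* Editor's note: the compiled-out sketch below is not part of the original
+patch. It merely restates the lock_word state table above in code form; the
+helper name rw_lock_word_state() is hypothetical, and X_LOCK_DECR and lint
+are assumed to be the definitions from the InnoDB headers. */
+#if 0
+static
+const char*
+rw_lock_word_state(
+/*===============*/
+	lint	lock_word)	/* in: value of rw_lock_t::lock_word */
+{
+	if (lock_word == X_LOCK_DECR) {
+		return("unlocked");
+	} else if (lock_word > 0) {
+		/* X_LOCK_DECR - lock_word readers hold the lock */
+		return("read locked, no waiting writers");
+	} else if (lock_word == 0) {
+		return("write locked");
+	} else if (lock_word > -X_LOCK_DECR) {
+		/* -lock_word readers hold the lock, one writer waits */
+		return("read locked, with a waiting writer");
+	} else {
+		/* ((-lock_word) / X_LOCK_DECR) + 1 recursive x-locks */
+		return("recursively write locked");
+	}
+}
+#endif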
+
+
+/** number of spin waits on rw-latches,
+incurred during shared (read) locks */
+UNIV_INTERN ib_int64_t rw_s_spin_wait_count = 0;
+/** number of spin loop rounds on rw-latches,
+incurred during shared (read) locks */
+UNIV_INTERN ib_int64_t rw_s_spin_round_count = 0;
+
+/** number of OS waits on rw-latches,
+incurred during shared (read) locks */
+UNIV_INTERN ib_int64_t rw_s_os_wait_count = 0;
+
+/** number of unlocks (that unlock shared locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+UNIV_INTERN ib_int64_t rw_s_exit_count = 0;
+
+/** number of spin waits on rw-latches,
+incurred during exclusive (write) locks */
+UNIV_INTERN ib_int64_t rw_x_spin_wait_count = 0;
+/** number of spin loop rounds on rw-latches,
+incurred during exclusive (write) locks */
+UNIV_INTERN ib_int64_t rw_x_spin_round_count = 0;
+
+/** number of OS waits on rw-latches,
+incurred during exclusive (write) locks */
+UNIV_INTERN ib_int64_t rw_x_os_wait_count = 0;
+
+/** number of unlocks (that unlock exclusive locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+UNIV_INTERN ib_int64_t rw_x_exit_count = 0;
+
+/* The global list of rw-locks */
+UNIV_INTERN rw_lock_list_t rw_lock_list;
+UNIV_INTERN mutex_t rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+
+UNIV_INTERN mutex_t rw_lock_debug_mutex;
+/* If deadlock detection does not get the mutex immediately,
+it may wait for this event */
+UNIV_INTERN os_event_t rw_lock_debug_event;
+/* This is set to TRUE, if there may be waiters for the event */
+UNIV_INTERN ibool rw_lock_debug_waiters;
+
+/******************************************************************//**
+Creates a debug info struct. */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void);
+/*======================*/
+/******************************************************************//**
+Frees a debug info struct. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+ rw_lock_debug_t* info);
+
+/******************************************************************//**
+Creates a debug info struct.
+@return own: debug info struct */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void)
+/*======================*/
+{
+ return((rw_lock_debug_t*) mem_alloc(sizeof(rw_lock_debug_t)));
+}
+
+/******************************************************************//**
+Frees a debug info struct. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+ rw_lock_debug_t* info)
+{
+ mem_free(info);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cmutex_name, /*!< in: mutex name */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+ /* If this is the very first time a synchronization object is
+ created, then the following call initializes the sync system. */
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_create(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK);
+
+ ut_d(lock->mutex.cfile_name = cfile_name);
+ ut_d(lock->mutex.cline = cline);
+
+ lock->mutex.cmutex_name = cmutex_name;
+ ut_d(lock->mutex.mutex_type = 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+# ifdef UNIV_DEBUG
+ UT_NOT_USED(cfile_name);
+ UT_NOT_USED(cline);
+# endif
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ lock->lock_word = X_LOCK_DECR;
+ lock->waiters = 0;
+
+ /* We set this value to signify that lock->writer_thread
+ contains garbage at initialization and cannot be used for
+ recursive x-locking. */
+ lock->recursive = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+ UT_LIST_INIT(lock->debug_list);
+
+ lock->level = level;
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_d(lock->magic_n = RW_LOCK_MAGIC_N);
+
+ lock->lock_name = cmutex_name;
+
+ lock->count_os_wait = 0;
+ lock->last_s_file_name = "not yet reserved";
+ lock->last_x_file_name = "not yet reserved";
+ lock->last_s_line = 0;
+ lock->last_x_line = 0;
+ lock->event = os_event_create(NULL);
+ lock->wait_ex_event = os_event_create(NULL);
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ ut_ad(UT_LIST_GET_FIRST(rw_lock_list) == NULL
+ || UT_LIST_GET_FIRST(rw_lock_list)->magic_n == RW_LOCK_MAGIC_N);
+
+ UT_LIST_ADD_FIRST(list, rw_lock_list, lock);
+
+ mutex_exit(&rw_lock_list_mutex);
+}
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free(
+/*=========*/
+ rw_lock_t* lock) /*!< in: rw-lock */
+{
+ ut_ad(rw_lock_validate(lock));
+ ut_a(lock->lock_word == X_LOCK_DECR);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_free(rw_lock_get_mutex(lock));
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+ mutex_enter(&rw_lock_list_mutex);
+ os_event_free(lock->event);
+
+ os_event_free(lock->wait_ex_event);
+
+ ut_ad(UT_LIST_GET_PREV(list, lock) == NULL
+ || UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+ ut_ad(UT_LIST_GET_NEXT(list, lock) == NULL
+ || UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+
+ UT_LIST_REMOVE(list, rw_lock_list, lock);
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ ut_d(lock->magic_n = 0);
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+ rw_lock_t* lock) /*!< in: rw-lock */
+{
+ ut_a(lock);
+
+ ulint waiters = rw_lock_get_waiters(lock);
+ lint lock_word = lock->lock_word;
+
+ ut_ad(lock->magic_n == RW_LOCK_MAGIC_N);
+ ut_a(waiters == 0 || waiters == 1);
+ ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0);
+
+ return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock
+ will be passed to another thread to unlock */
+ const char* file_name, /*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint index; /* index of the reserved wait cell */
+ ulint i = 0; /* spin round count */
+
+ ut_ad(rw_lock_validate(lock));
+
+ rw_s_spin_wait_count++; /*!< Count calls to this function */
+lock_loop:
+
+ /* Spin waiting for the writer field to become free */
+ while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+
+ i++;
+ }
+
+ if (i == SYNC_SPIN_ROUNDS) {
+ os_thread_yield();
+ }
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+ "Thread %lu spin wait rw-s-lock at %p"
+ " '%s' rnds %lu\n",
+ (ulong) os_thread_pf(os_thread_get_curr_id()),
+ (void*) lock,
+ lock->lock_name, (ulong) i);
+ }
+
+ /* We try once again to obtain the lock */
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+ rw_s_spin_round_count += i;
+
+ return; /* Success */
+ } else {
+
+ if (i < SYNC_SPIN_ROUNDS) {
+ goto lock_loop;
+ }
+
+ rw_s_spin_round_count += i;
+
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock, RW_LOCK_SHARED,
+ file_name, line,
+ &index);
+
+ /* Set waiters before checking lock_word to ensure wake-up
+ signal is sent. This may lead to some unnecessary signals. */
+ rw_lock_set_waiter_flag(lock);
+
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+ sync_array_free_cell(sync_primary_wait_array, index);
+ return; /* Success */
+ }
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+ "Thread %lu OS wait rw-s-lock at %p"
+ " '%s'\n",
+ os_thread_pf(os_thread_get_curr_id()),
+ (void*) lock, lock->lock_name);
+ }
+
+ /* these stats may not be accurate */
+ lock->count_os_wait++;
+ rw_s_os_wait_count++;
+
+ sync_array_wait_event(sync_primary_wait_array, index);
+
+ i = 0;
+ goto lock_loop;
+ }
+}
+
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want the current
+thread to be able to acquire a second x-latch, which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+ rw_lock_t* lock) /*!< in: lock which was x-locked in the
+ buffer read */
+{
+ ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX));
+
+ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+}
+
+/******************************************************************//**
+Function for the next writer to call. Waits for readers to exit.
+The caller must have already decremented lock_word by X_LOCK_DECR. */
+UNIV_INLINE
+void
+rw_lock_x_lock_wait(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+#endif
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint index;
+ ulint i = 0;
+
+ ut_ad(lock->lock_word <= 0);
+
+ while (lock->lock_word < 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+ if(i < SYNC_SPIN_ROUNDS) {
+ i++;
+ continue;
+ }
+
+ /* If there is still a reader, then go to sleep.*/
+ rw_x_spin_round_count += i;
+ i = 0;
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock,
+ RW_LOCK_WAIT_EX,
+ file_name, line,
+ &index);
+ /* Check lock_word to ensure wake-up isn't missed.*/
+ if(lock->lock_word < 0) {
+
+ /* these stats may not be accurate */
+ lock->count_os_wait++;
+ rw_x_os_wait_count++;
+
+ /* Add debug info as it is needed to detect possible
+ deadlock. We must add info for WAIT_EX thread for
+ deadlock detection to work properly. */
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
+ file_name, line);
+#endif
+
+ sync_array_wait_event(sync_primary_wait_array,
+ index);
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass,
+ RW_LOCK_WAIT_EX);
+#endif
+ /* It is possible to wake when lock_word < 0.
+ We must pass the while-loop check to proceed.*/
+ } else {
+ sync_array_free_cell(sync_primary_wait_array,
+ index);
+ }
+ }
+ rw_x_spin_round_count += i;
+}
+
+/******************************************************************//**
+Low-level function for acquiring an exclusive lock.
+@return FALSE if did not succeed, TRUE if success. */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_low(
+/*===============*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) {
+
+ /* lock->recursive also tells us if the writer_thread
+ field is stale or active. As we are going to write
+ our own thread id in that field it must be that the
+ current writer_thread value is not active. */
+ ut_a(!lock->recursive);
+
+ /* Decrement occurred: we are writer or next-writer. */
+ rw_lock_set_writer_id_and_recursion_flag(lock,
+ pass ? FALSE : TRUE);
+
+ rw_lock_x_lock_wait(lock,
+#ifdef UNIV_SYNC_DEBUG
+ pass,
+#endif
+ file_name, line);
+
+ } else {
+ /* Decrement failed: relock or failed lock */
+ if (!pass && lock->recursive
+ && os_thread_eq(lock->writer_thread, curr_thread)) {
+ /* Relock */
+ lock->lock_word -= X_LOCK_DECR;
+ } else {
+ /* Another thread locked before us */
+ return(FALSE);
+ }
+ }
+#ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
+ file_name, line);
+#endif
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = (unsigned int) line;
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+ rw_lock_t* lock, /*!< in: pointer to rw-lock */
+ ulint pass, /*!< in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
+ const char* file_name,/*!< in: file name where lock requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint index; /*!< index of the reserved wait cell */
+ ulint i; /*!< spin round count */
+ ibool spinning = FALSE;
+
+ ut_ad(rw_lock_validate(lock));
+
+ i = 0;
+
+lock_loop:
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+ rw_x_spin_round_count += i;
+
+ return; /* Locking succeeded */
+
+ } else {
+
+ if (!spinning) {
+ spinning = TRUE;
+ rw_x_spin_wait_count++;
+ }
+
+ /* Spin waiting for the lock_word to become free */
+ while (i < SYNC_SPIN_ROUNDS
+ && lock->lock_word <= 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+ srv_spin_wait_delay));
+ }
+
+ i++;
+ }
+ if (i == SYNC_SPIN_ROUNDS) {
+ os_thread_yield();
+ } else {
+ goto lock_loop;
+ }
+ }
+
+ rw_x_spin_round_count += i;
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+ "Thread %lu spin wait rw-x-lock at %p"
+ " '%s' rnds %lu\n",
+ os_thread_pf(os_thread_get_curr_id()), (void*) lock,
+ lock->lock_name, (ulong) i);
+ }
+
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock,
+ RW_LOCK_EX,
+ file_name, line,
+ &index);
+
+ /* Waiters must be set before checking lock_word, to ensure signal
+ is sent. This could lead to a few unnecessary wake-up signals. */
+ rw_lock_set_waiter_flag(lock);
+
+ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+ sync_array_free_cell(sync_primary_wait_array, index);
+ return; /* Locking succeeded */
+ }
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+ "Thread %lu OS wait for rw-x-lock at %p"
+ " '%s'\n",
+ os_thread_pf(os_thread_get_curr_id()), (void*) lock,
+ lock->lock_name);
+ }
+
+ /* these stats may not be accurate */
+ lock->count_os_wait++;
+ rw_x_os_wait_count++;
+
+ sync_array_wait_event(sync_primary_wait_array, index);
+
+ i = 0;
+ goto lock_loop;
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void)
+/*==========================*/
+{
+loop:
+ if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
+ return;
+ }
+
+ os_event_reset(rw_lock_debug_event);
+
+ rw_lock_debug_waiters = TRUE;
+
+ if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
+ return;
+ }
+
+ os_event_wait(rw_lock_debug_event);
+
+ goto loop;
+}
+
+/******************************************************************//**
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void)
+/*==========================*/
+{
+ mutex_exit(&rw_lock_debug_mutex);
+
+ if (rw_lock_debug_waiters) {
+ rw_lock_debug_waiters = FALSE;
+ os_event_set(rw_lock_debug_event);
+ }
+}
+
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type, /*!< in: lock type */
+ const char* file_name, /*!< in: file where requested */
+ ulint line) /*!< in: line where requested */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+ ut_ad(file_name);
+
+ info = rw_lock_debug_create();
+
+ rw_lock_debug_mutex_enter();
+
+ info->file_name = file_name;
+ info->line = line;
+ info->lock_type = lock_type;
+ info->thread_id = os_thread_get_curr_id();
+ info->pass = pass;
+
+ UT_LIST_ADD_FIRST(list, lock->debug_list, info);
+
+ rw_lock_debug_mutex_exit();
+
+ if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) {
+ sync_thread_add_level(lock, lock->level);
+ }
+}
+
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint pass, /*!< in: pass value */
+ ulint lock_type) /*!< in: lock type */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+
+ if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) {
+ sync_thread_reset_level(lock);
+ }
+
+ rw_lock_debug_mutex_enter();
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+ while (info != NULL) {
+ if ((pass == info->pass)
+ && ((pass != 0)
+ || os_thread_eq(info->thread_id,
+ os_thread_get_curr_id()))
+ && (info->lock_type == lock_type)) {
+
+ /* Found! */
+ UT_LIST_REMOVE(list, lock->debug_list, info);
+ rw_lock_debug_mutex_exit();
+
+ rw_lock_debug_free(info);
+
+ return;
+ }
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+
+ ut_error;
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0.
+@return TRUE if locked */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+{
+ rw_lock_debug_t* info;
+
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+ rw_lock_debug_mutex_enter();
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+ while (info != NULL) {
+
+ if (os_thread_eq(info->thread_id, os_thread_get_curr_id())
+ && (info->pass == 0)
+ && (info->lock_type == lock_type)) {
+
+ rw_lock_debug_mutex_exit();
+ /* Found! */
+
+ return(TRUE);
+ }
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ rw_lock_debug_mutex_exit();
+
+ return(FALSE);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode.
+@return TRUE if locked */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+ rw_lock_t* lock, /*!< in: rw-lock */
+ ulint lock_type) /*!< in: lock type: RW_LOCK_SHARED,
+ RW_LOCK_EX */
+{
+ ibool ret = FALSE;
+
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+ if (lock_type == RW_LOCK_SHARED) {
+ if (rw_lock_get_reader_count(lock) > 0) {
+ ret = TRUE;
+ }
+ } else if (lock_type == RW_LOCK_EX) {
+ if (rw_lock_get_writer(lock) == RW_LOCK_EX) {
+ ret = TRUE;
+ }
+ } else {
+ ut_error;
+ }
+
+ return(ret);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+ FILE* file) /*!< in: file where to print */
+{
+ rw_lock_t* lock;
+ ulint count = 0;
+ rw_lock_debug_t* info;
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ fputs("-------------\n"
+ "RW-LATCH INFO\n"
+ "-------------\n", file);
+
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
+
+ count++;
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_enter(&(lock->mutex));
+#endif
+ if (lock->lock_word != X_LOCK_DECR) {
+
+ fprintf(file, "RW-LOCK: %p ", (void*) lock);
+
+ if (rw_lock_get_waiters(lock)) {
+ fputs(" Waiters for the lock exist\n", file);
+ } else {
+ putc('\n', file);
+ }
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+ while (info != NULL) {
+ rw_lock_debug_print(info);
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ }
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ mutex_exit(&(lock->mutex));
+#endif
+
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+ fprintf(file, "Total number of rw-locks %ld\n", count);
+ mutex_exit(&rw_lock_list_mutex);
+}
+
+/***************************************************************//**
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+ rw_lock_t* lock) /*!< in: rw-lock */
+{
+ rw_lock_debug_t* info;
+
+ fprintf(stderr,
+ "-------------\n"
+ "RW-LATCH INFO\n"
+ "RW-LATCH: %p ", (void*) lock);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+ /* We used to acquire lock->mutex here, but it would cause a
+ recursive call to sync_thread_add_level() if UNIV_SYNC_DEBUG
+ is defined. Since this function is only invoked from
+	sync_thread_levels_g(), let us choose the lesser evil:
+ performing dirty reads instead of causing bogus deadlocks or
+ assertion failures. */
+#endif
+ if (lock->lock_word != X_LOCK_DECR) {
+
+ if (rw_lock_get_waiters(lock)) {
+ fputs(" Waiters for the lock exist\n", stderr);
+ } else {
+ putc('\n', stderr);
+ }
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+ while (info != NULL) {
+ rw_lock_debug_print(info);
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ }
+}
+
+/*********************************************************************//**
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+ rw_lock_debug_t* info) /*!< in: debug struct */
+{
+ ulint rwt;
+
+ rwt = info->lock_type;
+
+ fprintf(stderr, "Locked: thread %ld file %s line %ld ",
+ (ulong) os_thread_pf(info->thread_id), info->file_name,
+ (ulong) info->line);
+ if (rwt == RW_LOCK_SHARED) {
+ fputs("S-LOCK", stderr);
+ } else if (rwt == RW_LOCK_EX) {
+ fputs("X-LOCK", stderr);
+ } else if (rwt == RW_LOCK_WAIT_EX) {
+ fputs("WAIT X-LOCK", stderr);
+ } else {
+ ut_error;
+ }
+ if (info->pass != 0) {
+ fprintf(stderr, " pass value %lu", (ulong) info->pass);
+ }
+ putc('\n', stderr);
+}
+
+/***************************************************************//**
+Returns the number of currently locked rw-locks. Works only in the debug
+version.
+@return number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void)
+/*==================*/
+{
+ rw_lock_t* lock;
+ ulint count = 0;
+
+ mutex_enter(&rw_lock_list_mutex);
+
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
+
+ if (lock->lock_word != X_LOCK_DECR) {
+ count++;
+ }
+
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+ mutex_exit(&rw_lock_list_mutex);
+
+ return(count);
+}
+#endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c
new file mode 100644
index 00000000000..225f28df78e
--- /dev/null
+++ b/storage/xtradb/sync/sync0sync.c
@@ -0,0 +1,1525 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0sync.c
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#ifdef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "buf0types.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+
+/*
+ REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX
+ ============================================
+
+Semaphore operations in operating systems are slow: Solaris on a 1993 Sparc
+takes 3 microseconds (us) for a lock-unlock pair and Windows NT on a 1995
+Pentium takes 20 microseconds for a lock-unlock pair. Therefore, we have to
+implement our own efficient spin lock mutex. Future operating systems may
+provide efficient spin locks, but we cannot count on that.
+
+Another reason for implementing a spin lock is that on multiprocessor systems
+it can be more efficient for a processor to run a loop waiting for the
+semaphore to be released than to switch to a different thread. A thread switch
+takes 25 us on both platforms mentioned above. See Gray and Reuter's book
+Transaction processing for background.
+
+How long should the spin loop last before suspending the thread? On a
+uniprocessor, spinning does not help at all, because if the thread owning the
+mutex is not executing, it cannot be released. Spinning actually wastes
+resources.
+
+On a multiprocessor, we do not know if the thread owning the mutex is
+executing or not. Thus it would make sense to spin as long as the operation
+guarded by the mutex would typically last assuming that the thread is
+executing. If the mutex is not released by that time, we may assume that the
+thread owning the mutex is not executing and suspend the waiting thread.
+
+A typical operation (where no i/o is involved) guarded by a mutex or a read-write
+lock may last 1 - 20 us on the current Pentium platform. The longest
+operations are the binary searches on an index node.
+
+We conclude that the best choice is to set the spin time at 20 us. Then the
+system should work well on a multiprocessor. On a uniprocessor we have to
+make sure that thread switches due to mutex collisions are not frequent,
+i.e., they do not happen every 100 us or so, because that wastes too much
+resources. If the thread switches are not frequent, the 20 us wasted in spin
+loop is not too much.
+
+Empirical studies on the effect of spin time should be done for different
+platforms.
+
+
+ IMPLEMENTATION OF THE MUTEX
+ ===========================
+
+For background, see Curt Schimmel's book on Unix implementation on modern
+architectures. The key points in the implementation are atomicity and
+serialization of memory accesses. The test-and-set instruction (XCHG in
+Pentium) must be atomic. As new processors may have weak memory models, also
+serialization of memory references may be necessary. The successor of Pentium,
+P6, has at least one mode where the memory model is weak. As far as we know,
+in Pentium all memory accesses are serialized in the program order and we do
+not have to worry about the memory model. On other processors there are
+special machine instructions called a fence, memory barrier, or storage
+barrier (STBAR in Sparc), which can be used to serialize the memory accesses
+to happen in program order relative to the fence instruction.
+
+Leslie Lamport has devised a "bakery algorithm" to implement a mutex without
+the atomic test-and-set, but his algorithm should be modified for weak memory
+models. We do not use Lamport's algorithm, because we guess it is slower than
+the atomic test-and-set.
+
+Our mutex implementation works as follows: we perform the atomic
+test-and-set instruction on the memory word. If the test returns zero, we
+know we got the lock first. If the test returns nonzero, some other thread
+was quicker and got the lock: then we spin in a loop reading the memory word,
+waiting for it to become zero. It is wise to just read the word in the loop, not
+perform numerous test-and-set instructions, because they generate memory
+traffic between the cache and the main memory. The read loop can just access
+the cache, saving bus bandwidth.
+
+If we cannot acquire the mutex lock in the specified time, we reserve a cell
+in the wait array and set the waiters byte in the mutex to 1. To avoid a race
+condition, after setting the waiters byte and before suspending the waiting
+thread, we still have to check that the mutex is reserved, because it may
+have happened that the thread which was holding the mutex has just released
+it and did not see the waiters byte set to 1, a case which would lead the
+other thread to an infinite wait.
+
+LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some
+=======
+thread will eventually call os_event_set() on that particular event.
+Thus no infinite wait is possible in this case.
+
+Proof: After making the reservation the thread sets the waiters field in the
+mutex to 1. Then it checks that the mutex is still reserved by some thread,
+or it reserves the mutex for itself. In any case, some thread (which may be
+also some earlier thread, not necessarily the one currently holding the mutex)
+will set the waiters field to 0 in mutex_exit, and then call
+os_event_set() with the mutex as an argument.
+Q.E.D.
+
+LEMMA 2: If an os_event_set() call is made after some thread has called
+=======
+os_event_reset() and before it starts to wait on that event, the call
+will not be lost to the second thread. This is true even if there is an
+intervening call to os_event_reset() by another thread.
+Thus no infinite wait is possible in this case.
+
+Proof (non-windows platforms): os_event_reset() returns a monotonically
+increasing value of signal_count. This value is increased at every
+call of os_event_set(). If thread A has called os_event_reset() followed
+by thread B calling os_event_set() and then some other thread C calling
+os_event_reset(), the is_set flag of the event will be set to FALSE;
+but now if thread A calls os_event_wait_low() with the signal_count
+value returned from the earlier call of os_event_reset(), it will
+return immediately without waiting.
+Q.E.D.
+
+Proof (windows): If there is a writer thread which is forced to wait for
+the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX.
+The design of rw_lock ensures that there is one and only one thread
+that is able to change the state to RW_LOCK_WAIT_EX and this thread is
+guaranteed to acquire the lock after it is released by the current
+holders and before any other waiter gets the lock.
+On Windows this thread waits on a separate event, i.e. wait_ex_event.
+Since only one thread can wait on this event there is no chance
+of this event getting reset before the writer starts to wait on it.
+Therefore, this thread is guaranteed to catch the os_event_set() call
+issued unconditionally at the release of the lock.
+Q.E.D. */
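+
+/* The protocol above, reduced to a minimal sketch. The names prefixed
+with my_ are illustrative placeholders, not the actual InnoDB
+primitives; the real code is mutex_spin_wait() below. */
+#if 0
+static
+void
+my_mutex_acquire(void)
+{
+	for (;;) {
+		ulint	i = 0;
+
+		/* 1. Spin, only reading the lock word: plain reads hit
+		the cache and generate no bus traffic. */
+		while (my_lock_word != 0 && i < MY_SPIN_ROUNDS) {
+			i++;
+		}
+
+		/* 2. One atomic test-and-set; zero means we got it. */
+		if (my_test_and_set(&my_lock_word) == 0) {
+			return;
+		}
+
+		/* 3. Reserve a wait cell FIRST, then set the waiters
+		flag, then re-check the lock word: this is the ordering
+		that the race discussion and LEMMA 1 / LEMMA 2 above
+		depend on to rule out a lost wakeup. */
+		my_wait_cell_reserve();
+		my_waiters = 1;
+
+		if (my_test_and_set(&my_lock_word) == 0) {
+			my_wait_cell_free();	/* raced and won */
+			return;
+		}
+
+		/* 4. Suspend until the holder resets waiters and sets
+		the event in its exit path. */
+		my_wait_cell_wait();
+	}
+}
+#endif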
+
+/* Number of spin waits on mutexes: for performance monitoring */
+
+/** The number of iterations in the mutex_spin_wait() spin loop.
+Intended for performance monitoring. */
+static ib_int64_t mutex_spin_round_count = 0;
+/** The number of mutex_spin_wait() calls. Intended for
+performance monitoring. */
+static ib_int64_t mutex_spin_wait_count = 0;
+/** The number of OS waits in mutex_spin_wait(). Intended for
+performance monitoring. */
+static ib_int64_t mutex_os_wait_count = 0;
+/** The number of mutex_exit() calls. Intended for performance
+monitoring. */
+UNIV_INTERN ib_int64_t mutex_exit_count = 0;
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+UNIV_INTERN sync_array_t* sync_primary_wait_array;
+
+/** This variable is set to TRUE when sync_init is called */
+UNIV_INTERN ibool sync_initialized = FALSE;
+
+/** An acquired mutex or rw-lock and its level in the latching order */
+typedef struct sync_level_struct sync_level_t;
+/** Mutexes or rw-locks held by a thread */
+typedef struct sync_thread_struct sync_thread_t;
+
+#ifdef UNIV_SYNC_DEBUG
+/** The latch levels currently owned by threads are stored in this data
+structure; the size of this array is OS_THREAD_MAX_N */
+
+UNIV_INTERN sync_thread_t* sync_thread_level_arrays;
+
+/** Mutex protecting sync_thread_level_arrays */
+UNIV_INTERN mutex_t sync_thread_mutex;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Global list of database mutexes (not OS mutexes) created. */
+UNIV_INTERN ut_list_base_node_t mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+UNIV_INTERN mutex_t mutex_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+UNIV_INTERN ibool sync_order_checks_on = FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Mutexes or rw-locks held by a thread */
+struct sync_thread_struct{
+ os_thread_id_t id; /*!< OS thread id */
+ sync_level_t* levels; /*!< level array for this thread; if
+ this is NULL this slot is unused */
+};
+
+/** Number of slots reserved for each OS thread in the sync level array */
+#define SYNC_THREAD_N_LEVELS 10000
+
+/** An acquired mutex or rw-lock and its level in the latching order */
+struct sync_level_struct{
+ void* latch; /*!< pointer to a mutex or an rw-lock; NULL means that
+ the slot is empty */
+ ulint level; /*!< level of the latch in the latching order */
+};
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+ mutex_t* mutex, /*!< in: pointer to memory */
+ const char* cmutex_name, /*!< in: mutex name */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+ ulint level, /*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+ const char* cfile_name, /*!< in: file name where created */
+ ulint cline) /*!< in: file line where created */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+ mutex_reset_lock_word(mutex);
+#else
+ os_fast_mutex_init(&(mutex->os_fast_mutex));
+ mutex->lock_word = 0;
+#endif
+ mutex->event = os_event_create(NULL);
+ mutex->waiters = 0;
+#ifdef UNIV_DEBUG
+ mutex->magic_n = MUTEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+ mutex->line = 0;
+ mutex->file_name = "not yet reserved";
+ mutex->level = level;
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_DEBUG
+ mutex->cfile_name = cfile_name;
+ mutex->cline = cline;
+#endif /* UNIV_DEBUG */
+ mutex->count_os_wait = 0;
+ mutex->cmutex_name= cmutex_name;
+#ifdef UNIV_DEBUG
+ mutex->count_using= 0;
+ mutex->mutex_type= 0;
+ mutex->lspent_time= 0;
+ mutex->lmax_spent_time= 0;
+ mutex->count_spin_loop= 0;
+ mutex->count_spin_rounds= 0;
+ mutex->count_os_yield= 0;
+#endif /* UNIV_DEBUG */
+
+ /* Check that lock_word is aligned; this is important on Intel */
+ ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0);
+
+ /* NOTE! The very first mutexes are not put to the mutex list */
+
+ if ((mutex == &mutex_list_mutex)
+#ifdef UNIV_SYNC_DEBUG
+ || (mutex == &sync_thread_mutex)
+#endif /* UNIV_SYNC_DEBUG */
+ ) {
+
+ return;
+ }
+
+ mutex_enter(&mutex_list_mutex);
+
+ ut_ad(UT_LIST_GET_LEN(mutex_list) == 0
+ || UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N);
+
+ UT_LIST_ADD_FIRST(list, mutex_list, mutex);
+
+ mutex_exit(&mutex_list_mutex);
+}
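+
+/* A typical lifecycle of a mutex managed by this module, as a minimal
+sketch; my_mutex is an illustrative name and SYNC_NO_ORDER_CHECK is
+just one possible latching-order level:
+
+	mutex_t	my_mutex;
+
+	mutex_create(&my_mutex, SYNC_NO_ORDER_CHECK);
+
+	mutex_enter(&my_mutex);
+	(critical section)
+	mutex_exit(&my_mutex);
+
+	mutex_free(&my_mutex);
+*/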
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free(
+/*=======*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex_validate(mutex));
+ ut_a(mutex_get_lock_word(mutex) == 0);
+ ut_a(mutex_get_waiters(mutex) == 0);
+
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ ut_ad(UT_LIST_GET_LEN(mutex_list) == 1);
+ ut_ad(UT_LIST_GET_FIRST(mutex_list) == &mem_hash_mutex);
+ UT_LIST_REMOVE(list, mutex_list, mutex);
+ goto func_exit;
+ }
+#endif /* UNIV_MEM_DEBUG */
+
+ if (mutex != &mutex_list_mutex
+#ifdef UNIV_SYNC_DEBUG
+ && mutex != &sync_thread_mutex
+#endif /* UNIV_SYNC_DEBUG */
+ ) {
+
+ mutex_enter(&mutex_list_mutex);
+
+ ut_ad(!UT_LIST_GET_PREV(list, mutex)
+ || UT_LIST_GET_PREV(list, mutex)->magic_n
+ == MUTEX_MAGIC_N);
+ ut_ad(!UT_LIST_GET_NEXT(list, mutex)
+ || UT_LIST_GET_NEXT(list, mutex)->magic_n
+ == MUTEX_MAGIC_N);
+
+ UT_LIST_REMOVE(list, mutex_list, mutex);
+
+ mutex_exit(&mutex_list_mutex);
+ }
+
+ os_event_free(mutex->event);
+#ifdef UNIV_MEM_DEBUG
+func_exit:
+#endif /* UNIV_MEM_DEBUG */
+#if !defined(HAVE_ATOMIC_BUILTINS)
+ os_fast_mutex_free(&(mutex->os_fast_mutex));
+#endif
+ /* If we free the mutex protecting the mutex list (freeing is
+ not necessary), we have to reset the magic number AFTER removing
+ it from the list. */
+#ifdef UNIV_DEBUG
+ mutex->magic_n = 0;
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return 0 if succeeded, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name __attribute__((unused)),
+ /*!< in: file name where mutex
+ requested */
+ ulint line __attribute__((unused)))
+ /*!< in: line where requested */
+{
+ ut_ad(mutex_validate(mutex));
+
+ if (!mutex_test_and_set(mutex)) {
+
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+ return(0); /* Succeeded! */
+ }
+
+ return(1);
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex has been initialized.
+@return TRUE */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+ const mutex_t* mutex) /*!< in: mutex */
+{
+ ut_a(mutex);
+ ut_a(mutex->magic_n == MUTEX_MAGIC_N);
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Checks that the current thread owns the mutex. Works only in the debug
+version.
+@return TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+ const mutex_t* mutex) /*!< in: mutex */
+{
+ ut_ad(mutex_validate(mutex));
+
+ return(mutex_get_lock_word(mutex) == 1
+ && os_thread_eq(mutex->thread_id, os_thread_get_curr_id()));
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+ mutex_t* mutex, /*!< in: mutex */
+ ulint n) /*!< in: value to set */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+ ut_ad(mutex);
+
+ if (n) {
+ os_compare_and_swap_ulint(&mutex->waiters, 0, 1);
+ } else {
+ os_compare_and_swap_ulint(&mutex->waiters, 1, 0);
+ }
+#else
+ volatile ulint* ptr; /* declared volatile to ensure that
+ the value is stored to memory */
+ ut_ad(mutex);
+
+ ptr = &(mutex->waiters);
+
+ *ptr = n; /* Here we assume that the write of a single
+ word in memory is atomic */
+#endif
+}
+
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+ mutex_t* mutex, /*!< in: pointer to mutex */
+ const char* file_name, /*!< in: file name where mutex
+ requested */
+ ulint line) /*!< in: line where requested */
+{
+ ulint index; /* index of the reserved wait cell */
+ ulint i; /* spin round count */
+#ifdef UNIV_DEBUG
+ ib_int64_t lstart_time = 0, lfinish_time; /* for timing os_wait */
+ ulint ltime_diff;
+ ulint sec;
+ ulint ms;
+ uint timer_started = 0;
+#endif /* UNIV_DEBUG */
+ ut_ad(mutex);
+
+ /* This update is not thread safe, but we don't mind if the count
+	isn't exact. Moved out of the ifdef that follows because we are willing
+	to pay the cost of counting this, as the data is valuable.
+ Count the number of calls to mutex_spin_wait. */
+ mutex_spin_wait_count++;
+
+mutex_loop:
+
+ i = 0;
+
+ /* Spin waiting for the lock word to become zero. Note that we do
+ not have to assume that the read access to the lock word is atomic,
+ as the actual locking is always committed with atomic test-and-set.
+ In reality, however, all processors probably have an atomic read of
+ a memory word. */
+
+spin_loop:
+ ut_d(mutex->count_spin_loop++);
+
+ while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+
+ i++;
+ }
+
+ if (i == SYNC_SPIN_ROUNDS) {
+#ifdef UNIV_DEBUG
+ mutex->count_os_yield++;
+#ifndef UNIV_HOTBACKUP
+ if (timed_mutexes && timer_started == 0) {
+ ut_usectime(&sec, &ms);
+ lstart_time= (ib_int64_t)sec * 1000000 + ms;
+ timer_started = 1;
+ }
+#endif /* UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+ os_thread_yield();
+ }
+
+#ifdef UNIV_SRV_PRINT_LATCH_WAITS
+ fprintf(stderr,
+ "Thread %lu spin wait mutex at %p"
+ " '%s' rnds %lu\n",
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex,
+ mutex->cmutex_name, (ulong) i);
+#endif
+
+ mutex_spin_round_count += i;
+
+ ut_d(mutex->count_spin_rounds += i);
+
+ if (mutex_test_and_set(mutex) == 0) {
+ /* Succeeded! */
+
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+ goto finish_timing;
+ }
+
+ /* We may end up with a situation where lock_word is 0 but the OS
+ fast mutex is still reserved. On FreeBSD the OS does not seem to
+ schedule a thread which is constantly calling pthread_mutex_trylock
+ (in mutex_test_and_set implementation). Then we could end up
+ spinning here indefinitely. The following 'i++' stops this infinite
+ spin. */
+
+ i++;
+
+ if (i < SYNC_SPIN_ROUNDS) {
+ goto spin_loop;
+ }
+
+ sync_array_reserve_cell(sync_primary_wait_array, mutex,
+ SYNC_MUTEX, file_name, line, &index);
+
+ /* The memory order of the array reservation and the change in the
+ waiters field is important: when we suspend a thread, we first
+ reserve the cell and then set waiters field to 1. When threads are
+ released in mutex_exit, the waiters field is first set to zero and
+ then the event is set to the signaled state. */
+
+ mutex_set_waiters(mutex, 1);
+
+	/* Try a few more times to reserve the mutex */
+ for (i = 0; i < 4; i++) {
+ if (mutex_test_and_set(mutex) == 0) {
+ /* Succeeded! Free the reserved wait cell */
+
+ sync_array_free_cell(sync_primary_wait_array, index);
+
+ ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+ mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+#ifdef UNIV_SRV_PRINT_LATCH_WAITS
+ fprintf(stderr, "Thread %lu spin wait succeeds at 2:"
+ " mutex at %p\n",
+ (ulong) os_thread_pf(os_thread_get_curr_id()),
+ (void*) mutex);
+#endif
+
+ goto finish_timing;
+
+ /* Note that in this case we leave the waiters field
+ set to 1. We cannot reset it to zero, as we do not
+ know if there are other waiters. */
+ }
+ }
+
+	/* Now we know that there has been some thread holding the mutex
+	after the changes to the wait array and the waiters field were made.
+ Now there is no risk of infinite wait on the event. */
+
+#ifdef UNIV_SRV_PRINT_LATCH_WAITS
+ fprintf(stderr,
+ "Thread %lu OS wait mutex at %p '%s' rnds %lu\n",
+ (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex,
+ mutex->cmutex_name, (ulong) i);
+#endif
+
+ mutex_os_wait_count++;
+
+ mutex->count_os_wait++;
+#ifdef UNIV_DEBUG
+ /* !!!!! Sometimes os_wait can be called without os_thread_yield */
+#ifndef UNIV_HOTBACKUP
+ if (timed_mutexes == 1 && timer_started == 0) {
+ ut_usectime(&sec, &ms);
+ lstart_time= (ib_int64_t)sec * 1000000 + ms;
+ timer_started = 1;
+ }
+#endif /* UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+
+ sync_array_wait_event(sync_primary_wait_array, index);
+ goto mutex_loop;
+
+finish_timing:
+#ifdef UNIV_DEBUG
+ if (timed_mutexes == 1 && timer_started==1) {
+ ut_usectime(&sec, &ms);
+ lfinish_time= (ib_int64_t)sec * 1000000 + ms;
+
+ ltime_diff= (ulint) (lfinish_time - lstart_time);
+ mutex->lspent_time += ltime_diff;
+
+ if (mutex->lmax_spent_time < ltime_diff) {
+ mutex->lmax_spent_time= ltime_diff;
+ }
+ }
+#endif /* UNIV_DEBUG */
+ return;
+}
+
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+ mutex_t* mutex) /*!< in: mutex */
+{
+ mutex_set_waiters(mutex, 0);
+
+ /* The memory order of resetting the waiters field and
+ signaling the object is important. See LEMMA 1 above. */
+ os_event_set(mutex->event);
+ sync_array_object_signalled(sync_primary_wait_array);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+ mutex_t* mutex, /*!< in: mutex */
+ const char* file_name, /*!< in: file where requested */
+ ulint line) /*!< in: line where requested */
+{
+ ut_ad(mutex);
+ ut_ad(file_name);
+
+ sync_thread_add_level(mutex, mutex->level);
+
+ mutex->file_name = file_name;
+ mutex->line = line;
+}
+
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+ mutex_t* mutex, /*!< in: mutex */
+ const char** file_name, /*!< out: file where requested */
+ ulint* line, /*!< out: line where requested */
+ os_thread_id_t* thread_id) /*!< out: id of the thread which owns
+ the mutex */
+{
+ ut_ad(mutex);
+
+ *file_name = mutex->file_name;
+ *line = mutex->line;
+ *thread_id = mutex->thread_id;
+}
+
+/******************************************************************//**
+Prints debug info of currently reserved mutexes. */
+static
+void
+mutex_list_print_info(
+/*==================*/
+ FILE* file) /*!< in: file where to print */
+{
+ mutex_t* mutex;
+ const char* file_name;
+ ulint line;
+ os_thread_id_t thread_id;
+ ulint count = 0;
+
+ fputs("----------\n"
+ "MUTEX INFO\n"
+ "----------\n", file);
+
+ mutex_enter(&mutex_list_mutex);
+
+ mutex = UT_LIST_GET_FIRST(mutex_list);
+
+ while (mutex != NULL) {
+ count++;
+
+ if (mutex_get_lock_word(mutex) != 0) {
+ mutex_get_debug_info(mutex, &file_name, &line,
+ &thread_id);
+ fprintf(file,
+ "Locked mutex: addr %p thread %ld"
+ " file %s line %ld\n",
+ (void*) mutex, os_thread_pf(thread_id),
+ file_name, line);
+ }
+
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ }
+
+ fprintf(file, "Total number of mutexes %ld\n", count);
+
+ mutex_exit(&mutex_list_mutex);
+}
+
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void)
+/*==================*/
+{
+ mutex_t* mutex;
+ ulint count = 0;
+
+ mutex_enter(&mutex_list_mutex);
+
+ mutex = UT_LIST_GET_FIRST(mutex_list);
+
+ while (mutex != NULL) {
+ if (mutex_get_lock_word(mutex) != 0) {
+
+ count++;
+ }
+
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ }
+
+ mutex_exit(&mutex_list_mutex);
+
+ ut_a(count >= 1);
+
+ return(count - 1); /* Subtract one, because this function itself
+ was holding one mutex (mutex_list_mutex) */
+}
+
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked. Works only in
+the debug version.
+@return TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void)
+/*================*/
+{
+ return(mutex_n_reserved() + rw_lock_n_locked() == 0);
+}
+
+/******************************************************************//**
+Gets the value in the nth slot in the thread level arrays.
+@return pointer to thread slot */
+static
+sync_thread_t*
+sync_thread_level_arrays_get_nth(
+/*=============================*/
+ ulint n) /*!< in: slot number */
+{
+ ut_ad(n < OS_THREAD_MAX_N);
+
+ return(sync_thread_level_arrays + n);
+}
+
+/******************************************************************//**
+Looks for the thread slot for the calling thread.
+@return pointer to thread slot, NULL if not found */
+static
+sync_thread_t*
+sync_thread_level_arrays_find_slot(void)
+/*====================================*/
+
+{
+ sync_thread_t* slot;
+ os_thread_id_t id;
+ ulint i;
+
+ id = os_thread_get_curr_id();
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = sync_thread_level_arrays_get_nth(i);
+
+ if (slot->levels && os_thread_eq(slot->id, id)) {
+
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Looks for an unused thread slot.
+@return pointer to thread slot */
+static
+sync_thread_t*
+sync_thread_level_arrays_find_free(void)
+/*====================================*/
+
+{
+ sync_thread_t* slot;
+ ulint i;
+
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ slot = sync_thread_level_arrays_get_nth(i);
+
+ if (slot->levels == NULL) {
+
+ return(slot);
+ }
+ }
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Gets the value in the nth slot in the thread level array.
+@return pointer to level slot */
+static
+sync_level_t*
+sync_thread_levels_get_nth(
+/*=======================*/
+ sync_level_t* arr, /*!< in: pointer to level array for an OS
+ thread */
+ ulint n) /*!< in: slot number */
+{
+ ut_ad(n < SYNC_THREAD_N_LEVELS);
+
+ return(arr + n);
+}
+
+/******************************************************************//**
+Checks if all the level values stored in the level array are greater than
+the given limit.
+@return TRUE if all greater */
+static
+ibool
+sync_thread_levels_g(
+/*=================*/
+ sync_level_t* arr, /*!< in: pointer to level array for an OS
+ thread */
+ ulint limit, /*!< in: level limit */
+ ulint warn) /*!< in: TRUE=display a diagnostic message */
+{
+ sync_level_t* slot;
+ rw_lock_t* lock;
+ mutex_t* mutex;
+ ulint i;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(arr, i);
+
+ if (slot->latch != NULL) {
+ if (slot->level <= limit) {
+
+ if (!warn) {
+
+ return(FALSE);
+ }
+
+ lock = slot->latch;
+ mutex = slot->latch;
+
+ fprintf(stderr,
+ "InnoDB: sync levels should be"
+ " > %lu but a level is %lu\n",
+ (ulong) limit, (ulong) slot->level);
+
+ if (mutex->magic_n == MUTEX_MAGIC_N) {
+ fprintf(stderr,
+ "Mutex '%s'\n",
+ mutex->cmutex_name);
+
+ if (mutex_get_lock_word(mutex) != 0) {
+ const char* file_name;
+ ulint line;
+ os_thread_id_t thread_id;
+
+ mutex_get_debug_info(
+ mutex, &file_name,
+ &line, &thread_id);
+
+ fprintf(stderr,
+ "InnoDB: Locked mutex:"
+ " addr %p thread %ld"
+ " file %s line %ld\n",
+ (void*) mutex,
+ os_thread_pf(
+ thread_id),
+ file_name,
+ (ulong) line);
+ } else {
+ fputs("Not locked\n", stderr);
+ }
+ } else {
+ rw_lock_print(lock);
+ }
+
+ return(FALSE);
+ }
+ }
+ }
+
+ return(TRUE);
+}
+
+/******************************************************************//**
+Checks if the level value is stored in the level array.
+@return TRUE if stored */
+static
+ibool
+sync_thread_levels_contain(
+/*=======================*/
+ sync_level_t* arr, /*!< in: pointer to level array for an OS
+ thread */
+ ulint level) /*!< in: level */
+{
+ sync_level_t* slot;
+ ulint i;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(arr, i);
+
+ if (slot->latch != NULL) {
+ if (slot->level == level) {
+
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+ ulint level) /*!< in: latching order level
+ (SYNC_DICT, ...)*/
+{
+ sync_level_t* arr;
+ sync_thread_t* thread_slot;
+ sync_level_t* slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(arr, i);
+
+ if (slot->latch != NULL && slot->level == level) {
+
+ mutex_exit(&sync_thread_mutex);
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return a latch, or NULL if the array is empty apart from the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
+ ibool dict_mutex_allowed) /*!< in: TRUE if dictionary mutex is
+ allowed to be owned by the thread,
+ also purge_is_running mutex is
+ allowed */
+{
+ sync_level_t* arr;
+ sync_thread_t* thread_slot;
+ sync_level_t* slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return(NULL);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+ }
+
+ arr = thread_slot->levels;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(arr, i);
+
+ if (slot->latch != NULL
+ && (!dict_mutex_allowed
+ || (slot->level != SYNC_DICT
+ && slot->level != SYNC_DICT_OPERATION))) {
+
+ mutex_exit(&sync_thread_mutex);
+ ut_error;
+
+ return(slot->latch);
+ }
+ }
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(NULL);
+}
+
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return TRUE if empty */
+UNIV_INTERN
+ibool
+sync_thread_levels_empty(void)
+/*==========================*/
+{
+ return(sync_thread_levels_empty_gen(FALSE));
+}
+
+/******************************************************************//**
+Adds a latch and its level to the thread level array. Allocates the memory
+for the array if called for the first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+ void* latch, /*!< in: pointer to a mutex or an rw-lock */
+ ulint level) /*!< in: level in the latching order; if
+ SYNC_LEVEL_VARYING, nothing is done */
+{
+ sync_level_t* array;
+ sync_level_t* slot;
+ sync_thread_t* thread_slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return;
+ }
+
+ if ((latch == (void*)&sync_thread_mutex)
+ || (latch == (void*)&mutex_list_mutex)
+ || (latch == (void*)&rw_lock_debug_mutex)
+ || (latch == (void*)&rw_lock_list_mutex)) {
+
+ return;
+ }
+
+ if (level == SYNC_LEVEL_VARYING) {
+
+ return;
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+ /* We have to allocate the level array for a new thread */
+ array = ut_malloc(sizeof(sync_level_t) * SYNC_THREAD_N_LEVELS);
+
+ thread_slot = sync_thread_level_arrays_find_free();
+
+ thread_slot->id = os_thread_get_curr_id();
+ thread_slot->levels = array;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(array, i);
+
+ slot->latch = NULL;
+ }
+ }
+
+ array = thread_slot->levels;
+
+ /* NOTE that there is a problem with _NODE and _LEAF levels: if the
+ B-tree height changes, then a leaf can change to an internal node
+ or the other way around. We do not know at present if this can cause
+ unnecessary assertion failures below. */
+
+ switch (level) {
+ case SYNC_NO_ORDER_CHECK:
+ case SYNC_EXTERN_STORAGE:
+ case SYNC_TREE_NODE_FROM_HASH:
+ /* Do no order checking */
+ break;
+ case SYNC_MEM_POOL:
+ case SYNC_MEM_HASH:
+ case SYNC_RECV:
+ case SYNC_WORK_QUEUE:
+ case SYNC_LOG:
+ case SYNC_THR_LOCAL:
+ case SYNC_ANY_LATCH:
+ case SYNC_TRX_SYS_HEADER:
+ case SYNC_FILE_FORMAT_TAG:
+ case SYNC_DOUBLEWRITE:
+ case SYNC_BUF_LRU_LIST:
+ case SYNC_BUF_FLUSH_LIST:
+ case SYNC_BUF_PAGE_HASH:
+ case SYNC_BUF_FREE_LIST:
+ case SYNC_BUF_ZIP_FREE:
+ case SYNC_BUF_ZIP_HASH:
+ case SYNC_BUF_POOL:
+ case SYNC_SEARCH_SYS:
+ case SYNC_SEARCH_SYS_CONF:
+ case SYNC_TRX_LOCK_HEAP:
+ case SYNC_KERNEL:
+ case SYNC_IBUF_BITMAP_MUTEX:
+ case SYNC_RSEG:
+ case SYNC_TRX_UNDO:
+ case SYNC_PURGE_LATCH:
+ case SYNC_PURGE_SYS:
+ case SYNC_DICT_AUTOINC_MUTEX:
+ case SYNC_DICT_OPERATION:
+ case SYNC_DICT_HEADER:
+ case SYNC_TRX_I_S_RWLOCK:
+ case SYNC_TRX_I_S_LAST_READ:
+ if (!sync_thread_levels_g(array, level, TRUE)) {
+ fprintf(stderr,
+ "InnoDB: sync_thread_levels_g(array, %lu)"
+ " does not hold!\n", level);
+ ut_error;
+ }
+ break;
+ case SYNC_BUF_BLOCK:
+ /* Either the thread must own the buffer pool mutex
+ (buf_pool_mutex), or it is allowed to latch only ONE
+ buffer block (block->mutex or buf_pool_zip_mutex). */
+ if (!sync_thread_levels_g(array, level, FALSE)) {
+ ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+ ut_a(sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST));
+ }
+ break;
+ case SYNC_REC_LOCK:
+ if (sync_thread_levels_contain(array, SYNC_KERNEL)) {
+ ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK - 1,
+ TRUE));
+ } else {
+ ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK, TRUE));
+ }
+ break;
+ case SYNC_IBUF_BITMAP:
+ /* Either the thread must own the master mutex to all
+ the bitmap pages, or it is allowed to latch only ONE
+ bitmap page. */
+ if (sync_thread_levels_contain(array,
+ SYNC_IBUF_BITMAP_MUTEX)) {
+ ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1,
+ TRUE));
+ } else {
+ ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP,
+ TRUE));
+ }
+ break;
+ case SYNC_FSP_PAGE:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP));
+ break;
+ case SYNC_FSP:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP)
+ || sync_thread_levels_g(array, SYNC_FSP, TRUE));
+ break;
+ case SYNC_TRX_UNDO_PAGE:
+ ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
+ || sync_thread_levels_contain(array, SYNC_RSEG)
+ || sync_thread_levels_contain(array, SYNC_PURGE_SYS)
+ || sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE, TRUE));
+ break;
+ case SYNC_RSEG_HEADER:
+ ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
+ break;
+ case SYNC_RSEG_HEADER_NEW:
+ ut_a(sync_thread_levels_contain(array, SYNC_KERNEL)
+ && sync_thread_levels_contain(array, SYNC_FSP_PAGE));
+ break;
+ case SYNC_TREE_NODE:
+ ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE)
+ || sync_thread_levels_contain(array, SYNC_DICT_OPERATION)
+ || sync_thread_levels_g(array, SYNC_TREE_NODE - 1, TRUE));
+ break;
+ case SYNC_TREE_NODE_NEW:
+ ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE)
+ || sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+ break;
+ case SYNC_INDEX_TREE:
+ if (sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)
+ && sync_thread_levels_contain(array, SYNC_FSP)) {
+ ut_a(sync_thread_levels_g(array, SYNC_FSP_PAGE - 1,
+ TRUE));
+ } else {
+ ut_a(sync_thread_levels_g(array, SYNC_TREE_NODE - 1,
+ TRUE));
+ }
+ break;
+ case SYNC_IBUF_MUTEX:
+ ut_a(sync_thread_levels_g(array, SYNC_FSP_PAGE - 1, TRUE));
+ break;
+ case SYNC_IBUF_PESS_INSERT_MUTEX:
+ ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE));
+ ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+ break;
+ case SYNC_IBUF_HEADER:
+ ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE));
+ ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+ ut_a(!sync_thread_levels_contain(array,
+ SYNC_IBUF_PESS_INSERT_MUTEX));
+ break;
+ case SYNC_DICT:
+#ifdef UNIV_DEBUG
+ ut_a(buf_debug_prints
+ || sync_thread_levels_g(array, SYNC_DICT, TRUE));
+#else /* UNIV_DEBUG */
+ ut_a(sync_thread_levels_g(array, SYNC_DICT, TRUE));
+#endif /* UNIV_DEBUG */
+ break;
+ default:
+ ut_error;
+ }
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(array, i);
+
+ if (slot->latch == NULL) {
+ slot->latch = latch;
+ slot->level = level;
+
+ break;
+ }
+ }
+
+ ut_a(i < SYNC_THREAD_N_LEVELS);
+
+ mutex_exit(&sync_thread_mutex);
+}
+
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array; it is no error if the latch is
+not found, as we presently are not able to determine the level for
+every latch reservation the program does */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+ void* latch) /*!< in: pointer to a mutex or an rw-lock */
+{
+ sync_level_t* array;
+ sync_level_t* slot;
+ sync_thread_t* thread_slot;
+ ulint i;
+
+ if (!sync_order_checks_on) {
+
+ return(FALSE);
+ }
+
+ if ((latch == (void*)&sync_thread_mutex)
+ || (latch == (void*)&mutex_list_mutex)
+ || (latch == (void*)&rw_lock_debug_mutex)
+ || (latch == (void*)&rw_lock_list_mutex)) {
+
+ return(FALSE);
+ }
+
+ mutex_enter(&sync_thread_mutex);
+
+ thread_slot = sync_thread_level_arrays_find_slot();
+
+ if (thread_slot == NULL) {
+
+ ut_error;
+
+ mutex_exit(&sync_thread_mutex);
+ return(FALSE);
+ }
+
+ array = thread_slot->levels;
+
+ for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+ slot = sync_thread_levels_get_nth(array, i);
+
+ if (slot->latch == latch) {
+ slot->latch = NULL;
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(TRUE);
+ }
+ }
+
+ if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) {
+ rw_lock_t* rw_lock;
+
+ rw_lock = (rw_lock_t*) latch;
+
+ if (rw_lock->level == SYNC_LEVEL_VARYING) {
+ mutex_exit(&sync_thread_mutex);
+
+ return(TRUE);
+ }
+ }
+
+ ut_error;
+
+ mutex_exit(&sync_thread_mutex);
+
+ return(FALSE);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void)
+/*===========*/
+{
+#ifdef UNIV_SYNC_DEBUG
+ sync_thread_t* thread_slot;
+ ulint i;
+#endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(sync_initialized == FALSE);
+
+ sync_initialized = TRUE;
+
+ /* Create the primary system wait array which is protected by an OS
+ mutex */
+
+ sync_primary_wait_array = sync_array_create(OS_THREAD_MAX_N,
+ SYNC_ARRAY_OS_MUTEX);
+#ifdef UNIV_SYNC_DEBUG
+ /* Create the thread latch level array where the latch levels
+ are stored for each OS thread */
+
+ sync_thread_level_arrays = ut_malloc(OS_THREAD_MAX_N
+ * sizeof(sync_thread_t));
+ for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+ thread_slot = sync_thread_level_arrays_get_nth(i);
+ thread_slot->levels = NULL;
+ }
+#endif /* UNIV_SYNC_DEBUG */
+ /* Init the mutex list and create the mutex to protect it. */
+
+ UT_LIST_INIT(mutex_list);
+ mutex_create(&mutex_list_mutex, SYNC_NO_ORDER_CHECK);
+#ifdef UNIV_SYNC_DEBUG
+ mutex_create(&sync_thread_mutex, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+ /* Init the rw-lock list and create the mutex to protect it. */
+
+ UT_LIST_INIT(rw_lock_list);
+ mutex_create(&rw_lock_list_mutex, SYNC_NO_ORDER_CHECK);
+
+#ifdef UNIV_SYNC_DEBUG
+ mutex_create(&rw_lock_debug_mutex, SYNC_NO_ORDER_CHECK);
+
+ rw_lock_debug_event = os_event_create(NULL);
+ rw_lock_debug_waiters = FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/******************************************************************//**
+Frees the resources in InnoDB's own synchronization data structures. Use
+os_sync_free() after calling this. */
+UNIV_INTERN
+void
+sync_close(void)
+/*===========*/
+{
+ mutex_t* mutex;
+
+ sync_array_free(sync_primary_wait_array);
+
+ mutex = UT_LIST_GET_FIRST(mutex_list);
+
+ while (mutex) {
+#ifdef UNIV_MEM_DEBUG
+ if (mutex == &mem_hash_mutex) {
+ mutex = UT_LIST_GET_NEXT(list, mutex);
+ continue;
+ }
+#endif /* UNIV_MEM_DEBUG */
+ mutex_free(mutex);
+ mutex = UT_LIST_GET_FIRST(mutex_list);
+ }
+
+ mutex_free(&mutex_list_mutex);
+#ifdef UNIV_SYNC_DEBUG
+ mutex_free(&sync_thread_mutex);
+
+	/* Switch latching order checks off in sync0sync.c */
+ sync_order_checks_on = FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+
+ sync_initialized = FALSE;
+}
+
+/*******************************************************************//**
+Prints wait info of the sync system. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+ fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n",
+ mutex_exit_count, rw_s_exit_count, rw_x_exit_count);
+#endif
+
+ fprintf(file,
+ "Mutex spin waits %llu, rounds %llu, OS waits %llu\n"
+ "RW-shared spins %llu, OS waits %llu;"
+ " RW-excl spins %llu, OS waits %llu\n",
+ mutex_spin_wait_count,
+ mutex_spin_round_count,
+ mutex_os_wait_count,
+ rw_s_spin_wait_count,
+ rw_s_os_wait_count,
+ rw_x_spin_wait_count,
+ rw_x_os_wait_count);
+
+ fprintf(file,
+ "Spin rounds per wait: %.2f mutex, %.2f RW-shared, "
+ "%.2f RW-excl\n",
+ (double) mutex_spin_round_count /
+ (mutex_spin_wait_count ? mutex_spin_wait_count : 1),
+ (double) rw_s_spin_round_count /
+ (rw_s_spin_wait_count ? rw_s_spin_wait_count : 1),
+ (double) rw_x_spin_round_count /
+ (rw_x_spin_wait_count ? rw_x_spin_wait_count : 1));
+}
+
+/*******************************************************************//**
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+ mutex_list_print_info(file);
+
+ rw_lock_list_print_info(file);
+#endif /* UNIV_SYNC_DEBUG */
+
+ sync_array_print_info(file, sync_primary_wait_array);
+
+ sync_print_wait_info(file);
+}
diff --git a/storage/xtradb/thr/thr0loc.c b/storage/xtradb/thr/thr0loc.c
new file mode 100644
index 00000000000..5b9e83be920
--- /dev/null
+++ b/storage/xtradb/thr/thr0loc.c
@@ -0,0 +1,308 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file thr/thr0loc.c
+The thread local storage
+
+Created 10/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "thr0loc.h"
+#ifdef UNIV_NONINL
+#include "thr0loc.ic"
+#endif
+
+#include "sync0sync.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "srv0srv.h"
+
+/*
+ IMPLEMENTATION OF THREAD LOCAL STORAGE
+ ======================================
+
+The threads sometimes need private data which depends on the thread id.
+This is implemented as a hash table, where the hash value is calculated
+from the thread id, to prepare for a large number of threads. The hash table
+is protected by a mutex. If you need to modify the program and put new data into
+the thread local storage, just add it to struct thr_local_struct in the
+header file. */
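+
+/* A minimal usage sketch of this module from one thread's point of
+view; the slot number 7 below is an arbitrary illustrative value:
+
+	thr_local_create();
+	thr_local_set_slot_no(os_thread_get_curr_id(), 7);
+
+	slot_no = thr_local_get_slot_no(os_thread_get_curr_id());
+	in_ibuf = thr_local_get_in_ibuf_field();
+
+	thr_local_free(os_thread_get_curr_id());
+*/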
+
+/** Mutex protecting thr_local_hash */
+static mutex_t thr_local_mutex;
+
+/** The hash table. The module is not yet initialized when it is NULL. */
+static hash_table_t* thr_local_hash = NULL;
+ulint thr_local_hash_n_nodes = 0;
+
+/** Thread local data */
+typedef struct thr_local_struct thr_local_t;
+
+/** @brief Thread local data.
+The private data for each thread should be put to
+the structure below and the accessor functions written
+for the field. */
+struct thr_local_struct{
+ os_thread_id_t id; /*!< id of the thread which owns this struct */
+ os_thread_t handle; /*!< operating system handle to the thread */
+ ulint slot_no;/*!< the index of the slot in the thread table
+ for this thread */
+ ibool in_ibuf;/*!< TRUE if the thread is doing an ibuf
+ operation */
+ hash_node_t hash; /*!< hash chain node */
+ ulint magic_n;/*!< magic number (THR_LOCAL_MAGIC_N) */
+};
+
+/** The value of thr_local_struct::magic_n */
+#define THR_LOCAL_MAGIC_N 1231234
+
+/*******************************************************************//**
+Returns the local storage struct for a thread.
+@return local storage */
+static
+thr_local_t*
+thr_local_get(
+/*==========*/
+ os_thread_id_t id) /*!< in: thread id of the thread */
+{
+ thr_local_t* local;
+
+try_again:
+ ut_ad(thr_local_hash);
+ ut_ad(mutex_own(&thr_local_mutex));
+
+ /* Look for the local struct in the hash table */
+
+ local = NULL;
+
+ HASH_SEARCH(hash, thr_local_hash, os_thread_pf(id),
+ thr_local_t*, local,, os_thread_eq(local->id, id));
+ if (local == NULL) {
+ mutex_exit(&thr_local_mutex);
+
+ thr_local_create();
+
+ mutex_enter(&thr_local_mutex);
+
+ goto try_again;
+ }
+
+ ut_ad(local->magic_n == THR_LOCAL_MAGIC_N);
+
+ return(local);
+}
+
+/*******************************************************************//**
+Gets the slot number in the thread table of a thread.
+@return slot number */
+UNIV_INTERN
+ulint
+thr_local_get_slot_no(
+/*==================*/
+ os_thread_id_t id) /*!< in: thread id of the thread */
+{
+ ulint slot_no;
+ thr_local_t* local;
+
+ mutex_enter(&thr_local_mutex);
+
+ local = thr_local_get(id);
+
+ slot_no = local->slot_no;
+
+ mutex_exit(&thr_local_mutex);
+
+ return(slot_no);
+}
+
+/*******************************************************************//**
+Sets the slot number in the thread table of a thread. */
+UNIV_INTERN
+void
+thr_local_set_slot_no(
+/*==================*/
+ os_thread_id_t id, /*!< in: thread id of the thread */
+ ulint slot_no)/*!< in: slot number */
+{
+ thr_local_t* local;
+
+ mutex_enter(&thr_local_mutex);
+
+ local = thr_local_get(id);
+
+ local->slot_no = slot_no;
+
+ mutex_exit(&thr_local_mutex);
+}
+
+/*******************************************************************//**
+Returns pointer to the 'in_ibuf' field within the current thread local
+storage.
+@return pointer to the in_ibuf field */
+UNIV_INTERN
+ibool*
+thr_local_get_in_ibuf_field(void)
+/*=============================*/
+{
+ thr_local_t* local;
+
+ mutex_enter(&thr_local_mutex);
+
+ local = thr_local_get(os_thread_get_curr_id());
+
+ mutex_exit(&thr_local_mutex);
+
+ return(&(local->in_ibuf));
+}
+
+/*******************************************************************//**
+Creates a local storage struct for the calling new thread. */
+UNIV_INTERN
+void
+thr_local_create(void)
+/*==================*/
+{
+ thr_local_t* local;
+
+ if (thr_local_hash == NULL) {
+ thr_local_init();
+ }
+
+ local = mem_alloc(sizeof(thr_local_t));
+
+ local->id = os_thread_get_curr_id();
+ local->handle = os_thread_get_curr();
+ local->magic_n = THR_LOCAL_MAGIC_N;
+
+ local->in_ibuf = FALSE;
+
+ mutex_enter(&thr_local_mutex);
+
+ HASH_INSERT(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(os_thread_get_curr_id()),
+ local);
+
+ thr_local_hash_n_nodes++;
+ mutex_exit(&thr_local_mutex);
+}
+
+/*******************************************************************//**
+Frees the local storage struct for the specified thread. */
+UNIV_INTERN
+void
+thr_local_free(
+/*===========*/
+ os_thread_id_t id) /*!< in: thread id */
+{
+ thr_local_t* local;
+
+ mutex_enter(&thr_local_mutex);
+
+ /* Look for the local struct in the hash table */
+
+ HASH_SEARCH(hash, thr_local_hash, os_thread_pf(id),
+ thr_local_t*, local,, os_thread_eq(local->id, id));
+ if (local == NULL) {
+ mutex_exit(&thr_local_mutex);
+
+ return;
+ }
+
+ HASH_DELETE(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(id), local);
+ thr_local_hash_n_nodes--;
+
+ mutex_exit(&thr_local_mutex);
+
+ ut_a(local->magic_n == THR_LOCAL_MAGIC_N);
+
+ mem_free(local);
+}
+
+/****************************************************************//**
+Initializes the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_init(void)
+/*================*/
+{
+
+ ut_a(thr_local_hash == NULL);
+
+ thr_local_hash = hash_create(OS_THREAD_MAX_N + 100);
+
+ mutex_create(&thr_local_mutex, SYNC_THR_LOCAL);
+}
+
+/********************************************************************
+Close the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_close(void)
+/*=================*/
+{
+ ulint i;
+
+ ut_a(thr_local_hash != NULL);
+
+ /* Free the hash elements. We don't remove them from the table
+ because we are going to destroy the table anyway. */
+ for (i = 0; i < hash_get_n_cells(thr_local_hash); i++) {
+ thr_local_t* local;
+
+ local = HASH_GET_FIRST(thr_local_hash, i);
+
+ while (local) {
+ thr_local_t* prev_local = local;
+
+ local = HASH_GET_NEXT(hash, prev_local);
+ ut_a(prev_local->magic_n == THR_LOCAL_MAGIC_N);
+ mem_free(prev_local);
+ }
+ }
+
+ hash_table_free(thr_local_hash);
+ thr_local_hash = NULL;
+}
+
+/*************************************************************************
+Returns local hash table information. */
+
+ulint
+thr_local_hash_cells(void)
+/*======================*/
+{
+ if (thr_local_hash) {
+ return (thr_local_hash->n_cells);
+ } else {
+ return 0;
+ }
+}
+
+ulint
+thr_local_hash_nodes(void)
+/*======================*/
+{
+ if (thr_local_hash) {
+ return (thr_local_hash_n_nodes
+ * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE));
+ } else {
+ return 0;
+ }
+}
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c
new file mode 100644
index 00000000000..5bc8302d0c0
--- /dev/null
+++ b/storage/xtradb/trx/trx0i_s.c
@@ -0,0 +1,1481 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.c
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels:
+ The includes "univ.i" -> "my_global.h" cause a different path
+ to be taken further down with pthread functions and types,
+ so they must come first.
+ From the symptoms, this is related to bug#46587 in the MySQL bug DB.
+*/
+#include "univ.i"
+
+#include <mysql/plugin.h>
+
+#include "mysql_addons.h"
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "ha_prototypes.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "sync0types.h"
+#include "trx0i_s.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "ut0mem.h"
+#include "ut0ut.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM 1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent one is N/2 where N is the number of rows we have allocated so
+far, then the 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE 39
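+
+/** A rough, illustrative sketch (not used by the cache code itself) of
+the chunk growth described above: with integer division the per-chunk
+allocations start as 1024, 512, 768, 1152, 1728, ... so the total number
+of allocatable rows grows by roughly a factor of 1.5 with each new
+chunk. */
+#if 0
+#include <stdio.h>
+int
+main(void)
+{
+	unsigned long long	total = 0;
+	int			i;
+
+	for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+		unsigned long long	chunk = (i == 0)
+			? TABLE_CACHE_INITIAL_ROWSNUM
+			: total / 2;
+
+		total += chunk;
+		printf("chunk %d: %llu rows, %llu rows in total\n",
+		       i, chunk, total);
+	}
+
+	return(0);
+}
+#endif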
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different,
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because a different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that the lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be a noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table; duplicates may appear
+if this is enabled. Also, if this is enabled, searching the hash is a
+noop because it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_STORAGE(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache hash storage
+@return maximum allowed allocation size */
+#define MAX_ALLOWED_FOR_ALLOC(cache) \
+ (TRX_I_S_MEM_LIMIT \
+ - (cache)->mem_allocd \
+ - ha_storage_get_size((cache)->storage))
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+typedef struct i_s_mem_chunk_struct {
+ ulint offset; /*!< offset, in number of rows */
+ ulint rows_allocd; /*!< the size of this chunk, in number
+ of rows */
+ void* base; /*!< start of the chunk */
+} i_s_mem_chunk_t;
+
+/** This represents one table's cache. */
+typedef struct i_s_table_cache_struct {
+ ulint rows_used; /*!< number of used rows */
+ ulint rows_allocd; /*!< number of allocated rows */
+ ulint row_size; /*!< size of a single row */
+ i_s_mem_chunk_t chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+ memory chunks that stores the
+ rows */
+} i_s_table_cache_t;
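+
+/** Example of how the chunks form one flat array of rows (a worked
+illustration with made-up sizes): if chunk 0 holds 1024 rows and chunk 1
+holds 512 rows, then chunk[0].offset == 0 and chunk[1].offset == 1024,
+so the row with global index 1200 lives in chunk 1 at local index
+1200 - 1024 == 176. This is exactly the lookup that
+trx_i_s_cache_get_nth_row() below performs. */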
+
+/** This structure describes the intermediate buffer */
+struct trx_i_s_cache_struct {
+ rw_lock_t rw_lock; /*!< read-write lock protecting
+ the rest of this structure */
+ ullint last_read; /*!< last time the cache was read;
+ measured in microseconds since
+ epoch */
+ mutex_t last_read_mutex;/*!< mutex protecting the
+ last_read member - it is updated
+ inside a shared lock of the
+ rw_lock member */
+ i_s_table_cache_t innodb_trx; /*!< innodb_trx table */
+ i_s_table_cache_t innodb_locks; /*!< innodb_locks table */
+ i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM 10000
+ hash_table_t* locks_hash; /*!< hash table used to eliminate
+ duplicate entries in the
+ innodb_locks table */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE 1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS 2048
+ ha_storage_t* storage; /*!< storage for external volatile
+ data that can possibly not be
+ available later, when we release
+ the kernel mutex */
+ ulint mem_allocd; /*!< the amount of memory
+ allocated with mem_alloc*() */
+ ibool is_truncated; /*!< this is TRUE if the memory
+ limit was hit and thus the data
+ in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t trx_i_s_cache_static;
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+UNIV_INTERN trx_i_s_cache_t* trx_i_s_cache = &trx_i_s_cache_static;
+
+/*******************************************************************//**
+For a record lock that is in the waiting state retrieves the only bit
+that is set; for a table lock returns ULINT_UNDEFINED.
+@return record number within the heap */
+static
+ulint
+wait_lock_get_heap_no(
+/*==================*/
+ const lock_t* lock) /*!< in: lock */
+{
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ret = lock_rec_find_set_bit(lock);
+ ut_a(ret != ULINT_UNDEFINED);
+ break;
+ case LOCK_TABLE:
+ ret = ULINT_UNDEFINED;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Initializes the members of a table cache. */
+static
+void
+table_cache_init(
+/*=============*/
+ i_s_table_cache_t* table_cache, /*!< out: table cache */
+ size_t row_size) /*!< in: the size of a
+ row */
+{
+ ulint i;
+
+ table_cache->rows_used = 0;
+ table_cache->rows_allocd = 0;
+ table_cache->row_size = row_size;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ table_cache->chunks[i].base = NULL;
+ }
+}
+
+/*******************************************************************//**
+Frees a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+ i_s_table_cache_t* table_cache) /*!< in/out: table cache */
+{
+ ulint i;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ /* the memory is actually allocated in
+ table_cache_create_empty_row() */
+ if (table_cache->chunks[i].base) {
+ mem_free(table_cache->chunks[i].base);
+ table_cache->chunks[i].base = NULL;
+ }
+ }
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. The row is allocated if no more
+empty rows are available. The number of used rows is incremented.
+If the memory limit is hit then NULL is returned and nothing is
+allocated.
+@return empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+ i_s_table_cache_t* table_cache, /*!< in/out: table cache */
+ trx_i_s_cache_t* cache) /*!< in/out: cache to record
+ how many bytes are
+ allocated */
+{
+ ulint i;
+ void* row;
+
+ ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+ if (table_cache->rows_used == table_cache->rows_allocd) {
+
+ /* rows_used == rows_allocd means that new chunk needs
+ to be allocated: either no more empty rows in the
+ last allocated chunk or nothing has been allocated yet
+		(rows_used == rows_allocd == 0); */
+
+ i_s_mem_chunk_t* chunk;
+ ulint req_bytes;
+ ulint got_bytes;
+ ulint req_rows;
+ ulint got_rows;
+
+ /* find the first not allocated chunk */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].base == NULL) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ have been allocated :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ /* allocate the chunk we just found */
+
+ if (i == 0) {
+
+ /* first chunk, nothing is allocated yet */
+ req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+ } else {
+
+			/* Memory is increased by the formula
+			new = old + old / 2; we deliberately avoid the
+			more aggressive common choice new = old * 2,
+			because the allocated memory will not be freed
+			until InnoDB exits (it is reused). So it is
+			better to allocate the memory in more, smaller
+			steps and waste less of it, than to allocate in
+			fewer steps (allocation happens only once in a
+			lifetime anyway) but end up with lots of
+			unused/wasted memory. */
+ req_rows = table_cache->rows_allocd / 2;
+ }
+ req_bytes = req_rows * table_cache->row_size;
+
+ if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+
+ return(NULL);
+ }
+
+ chunk = &table_cache->chunks[i];
+
+ chunk->base = mem_alloc2(req_bytes, &got_bytes);
+
+ got_rows = got_bytes / table_cache->row_size;
+
+ cache->mem_allocd += got_bytes;
+
+#if 0
+ printf("allocating chunk %d req bytes=%lu, got bytes=%lu, "
+ "row size=%lu, "
+ "req rows=%lu, got rows=%lu\n",
+ i, req_bytes, got_bytes,
+ table_cache->row_size,
+ req_rows, got_rows);
+#endif
+
+ chunk->rows_allocd = got_rows;
+
+ table_cache->rows_allocd += got_rows;
+
+ /* adjust the offset of the next chunk */
+ if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+ table_cache->chunks[i + 1].offset
+ = chunk->offset + chunk->rows_allocd;
+ }
+
+ /* return the first empty row in the newly allocated
+ chunk */
+ row = chunk->base;
+ } else {
+
+ char* chunk_start;
+ ulint offset;
+
+ /* there is an empty row, no need to allocate new
+ chunks */
+
+ /* find the first chunk that contains allocated but
+ empty/unused rows */
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd
+ > table_cache->rows_used) {
+
+ break;
+ }
+ }
+
+ /* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+ are full, but
+ table_cache->rows_used != table_cache->rows_allocd means
+ exactly the opposite - there are allocated but
+ empty/unused rows :-X */
+ ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+ chunk_start = (char*) table_cache->chunks[i].base;
+ offset = table_cache->rows_used
+ - table_cache->chunks[i].offset;
+
+ row = chunk_start + offset * table_cache->row_size;
+ }
+
+ table_cache->rows_used++;
+
+ return(row);
+}
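+
+/** Typical caller pattern for the function above (a condensed sketch of
+what add_lock_to_cache() and fetch_data_into_cache() below do): request
+a row, treat NULL as "memory limit hit", and give back the reserved row
+by decrementing rows_used if filling it fails afterwards:
+
+	row = table_cache_create_empty_row(&cache->innodb_locks, cache);
+	if (row == NULL) {
+		cache->is_truncated = TRUE;
+	} else if (!fill_locks_row(row, lock, heap_no, cache)) {
+		cache->innodb_locks.rows_used--;
+	}
+*/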
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+ i_s_trx_row_t* row, /*!< out: result object
+ that's filled */
+ const trx_t* trx, /*!< in: transaction to
+ get data from */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ corresponding row in
+ innodb_locks if trx is
+ waiting or NULL if trx
+ is not waiting */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into
+ which to copy volatile
+ strings */
+{
+ const char* stmt;
+ size_t stmt_len;
+
+ row->trx_id = trx_get_id(trx);
+ row->trx_started = (ib_time_t) trx->start_time;
+ row->trx_state = trx_get_que_state_str(trx);
+
+ if (trx->wait_lock != NULL) {
+
+ ut_a(requested_lock_row != NULL);
+
+ row->requested_lock_row = requested_lock_row;
+ row->trx_wait_started = (ib_time_t) trx->wait_started;
+ } else {
+
+ ut_a(requested_lock_row == NULL);
+
+ row->requested_lock_row = NULL;
+ row->trx_wait_started = 0;
+ }
+
+ row->trx_weight = (ullint) ut_conv_dulint_to_longlong(TRX_WEIGHT(trx));
+
+ if (trx->mysql_thd == NULL) {
+ /* For internal transactions e.g., purge and transactions
+ being recovered at startup there is no associated MySQL
+ thread data structure. */
+ row->trx_mysql_thread_id = 0;
+ row->trx_query = NULL;
+ return(TRUE);
+ }
+
+ row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+ stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
+
+ if (stmt != NULL) {
+
+ char query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+
+ if (stmt_len > TRX_I_S_TRX_QUERY_MAX_LEN) {
+ stmt_len = TRX_I_S_TRX_QUERY_MAX_LEN;
+ }
+
+ memcpy(query, stmt, stmt_len);
+ query[stmt_len] = '\0';
+
+ row->trx_query = ha_storage_put_memlim(
+			cache->storage, query, stmt_len + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (row->trx_query == NULL) {
+
+ return(FALSE);
+ }
+ } else {
+
+ row->trx_query = NULL;
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Format the nth field of "rec" and put it in "buf". The result is always
+NUL-terminated. Returns the number of bytes that were written to "buf"
+(including the terminating NUL).
+@return number of bytes written, including the terminating NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+ char* buf, /*!< out: buffer */
+ ulint buf_size,/*!< in: buffer size in bytes */
+ ulint n, /*!< in: number of field */
+ const dict_index_t* index, /*!< in: index */
+ const rec_t* rec, /*!< in: record */
+ const ulint* offsets)/*!< in: record offsets, returned
+ by rec_get_offsets() */
+{
+ const byte* data;
+ ulint data_len;
+ dict_field_t* dict_field;
+ ulint ret;
+
+ ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+ if (buf_size == 0) {
+
+ return(0);
+ }
+
+ ret = 0;
+
+ if (n > 0) {
+ /* we must append ", " before the actual data */
+
+ if (buf_size < 3) {
+
+ buf[0] = '\0';
+ return(1);
+ }
+
+ memcpy(buf, ", ", 3);
+
+ buf += 2;
+ buf_size -= 2;
+ ret += 2;
+ }
+
+ /* now buf_size >= 1 */
+
+ data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+ dict_field = dict_index_get_nth_field(index, n);
+
+ ret += row_raw_format((const char*) data, data_len,
+ dict_field, buf, buf_size);
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+ const char** lock_data,/*!< out: "lock_data" to fill */
+ const lock_t* lock, /*!< in: lock used to find the data */
+ ulint heap_no,/*!< in: rec num used to find the data */
+ trx_i_s_cache_t* cache) /*!< in/out: cache where to store
+ volatile data */
+{
+ mtr_t mtr;
+
+ const buf_block_t* block;
+ const page_t* page;
+ const rec_t* rec;
+
+ ut_a(lock_get_type(lock) == LOCK_REC);
+
+ mtr_start(&mtr);
+
+ block = buf_page_try_get(lock_rec_get_space_id(lock),
+ lock_rec_get_page_no(lock),
+ &mtr);
+
+ if (block == NULL) {
+
+ *lock_data = NULL;
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+ }
+
+ page = (const page_t*) buf_block_get_frame(block);
+
+ rec = page_find_rec_with_heap_no(page, heap_no);
+
+ if (page_rec_is_infimum(rec)) {
+
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage, "infimum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ } else if (page_rec_is_supremum(rec)) {
+
+ *lock_data = ha_storage_put_str_memlim(
+ cache->storage, "supremum pseudo-record",
+ MAX_ALLOWED_FOR_STORAGE(cache));
+ } else {
+
+ const dict_index_t* index;
+ ulint n_fields;
+ mem_heap_t* heap;
+ ulint offsets_onstack[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets;
+ char buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+ ulint buf_used;
+ ulint i;
+
+ rec_offs_init(offsets_onstack);
+ offsets = offsets_onstack;
+
+ index = lock_rec_get_index(lock);
+
+ n_fields = dict_index_get_n_unique(index);
+
+ ut_a(n_fields > 0);
+
+ heap = NULL;
+ offsets = rec_get_offsets(rec, index, offsets, n_fields,
+ &heap);
+
+ /* format and store the data */
+
+ buf_used = 0;
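+		/* Each put_nth_field() call returns the byte count
+		including the terminating NUL it wrote; subtracting 1
+		below makes the next field start on top of that NUL, so
+		the fields end up concatenated as one "v1, v2, ..."
+		string. */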
+ for (i = 0; i < n_fields; i++) {
+
+ buf_used += put_nth_field(
+ buf + buf_used, sizeof(buf) - buf_used,
+ i, index, rec, offsets) - 1;
+ }
+
+ *lock_data = (const char*) ha_storage_put_memlim(
+ cache->storage, buf, buf_used + 1,
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ if (UNIV_UNLIKELY(heap != NULL)) {
+
+ /* this means that rec_get_offsets() has created a new
+ heap and has stored offsets in it; check that this is
+ really the case and free the heap */
+ ut_a(offsets != offsets_onstack);
+ mem_heap_free(heap);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ if (*lock_data == NULL) {
+
+ return(FALSE);
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_locks_row_t object.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+fill_locks_row(
+/*===========*/
+ i_s_locks_row_t* row, /*!< out: result object that's filled */
+ const lock_t* lock, /*!< in: lock to get data from */
+ ulint heap_no,/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+ trx_i_s_cache_t* cache) /*!< in/out: cache into which to copy
+ volatile strings */
+{
+ row->lock_trx_id = lock_get_trx_id(lock);
+ row->lock_mode = lock_get_mode_str(lock);
+ row->lock_type = lock_get_type_str(lock);
+
+ row->lock_table = ha_storage_put_str_memlim(
+ cache->storage, lock_get_table_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_table == NULL) {
+
+ return(FALSE);
+ }
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ row->lock_index = ha_storage_put_str_memlim(
+ cache->storage, lock_rec_get_index_name(lock),
+ MAX_ALLOWED_FOR_STORAGE(cache));
+
+ /* memory could not be allocated */
+ if (row->lock_index == NULL) {
+
+ return(FALSE);
+ }
+
+ row->lock_space = lock_rec_get_space_id(lock);
+ row->lock_page = lock_rec_get_page_no(lock);
+ row->lock_rec = heap_no;
+
+ if (!fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+
+ break;
+ case LOCK_TABLE:
+ row->lock_index = NULL;
+
+ row->lock_space = ULINT_UNDEFINED;
+ row->lock_page = ULINT_UNDEFINED;
+ row->lock_rec = ULINT_UNDEFINED;
+
+ row->lock_data = NULL;
+
+ break;
+ default:
+ ut_error;
+ }
+
+ row->lock_table_id = lock_get_table_id(lock);
+
+ row->hash_chain.value = row;
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Fills i_s_lock_waits_row_t object. Returns its first argument.
+@return result object that's filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+ i_s_lock_waits_row_t* row, /*!< out: result object
+ that's filled */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ row->requested_lock_row = requested_lock_row;
+ row->blocking_lock_row = blocking_lock_row;
+
+ return(row);
+}
+
+/*******************************************************************//**
+Calculates a hash fold for a lock. For a record lock the fold is
+calculated from 4 elements, which uniquely identify a lock at a given
+point in time: transaction id, space id, page number, record number.
+For a table lock the fold is table's id.
+@return fold */
+static
+ulint
+fold_lock(
+/*======*/
+ const lock_t* lock, /*!< in: lock object to fold */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+ static ulint fold = 0;
+
+ return(fold++);
+#else
+ ulint ret;
+
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ ret = ut_fold_ulint_pair((ulint) lock_get_trx_id(lock),
+ lock_rec_get_space_id(lock));
+
+ ret = ut_fold_ulint_pair(ret,
+ lock_rec_get_page_no(lock));
+
+ ret = ut_fold_ulint_pair(ret, heap_no);
+
+ break;
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == ULINT_UNDEFINED);
+
+ ret = (ulint) lock_get_table_id(lock);
+
+ break;
+ default:
+ ut_error;
+ }
+
+ return(ret);
+#endif
+}
+
+/*******************************************************************//**
+Checks whether i_s_locks_row_t object represents a lock_t object.
+@return TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ const lock_t* lock, /*!< in: lock object */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+ return(0);
+#else
+ switch (lock_get_type(lock)) {
+ case LOCK_REC:
+ ut_a(heap_no != ULINT_UNDEFINED);
+
+ return(row->lock_trx_id == lock_get_trx_id(lock)
+ && row->lock_space == lock_rec_get_space_id(lock)
+ && row->lock_page == lock_rec_get_page_no(lock)
+ && row->lock_rec == heap_no);
+
+ case LOCK_TABLE:
+ /* this check is actually not necessary for continuing
+ correct operation, but something must have gone wrong if
+ it fails. */
+ ut_a(heap_no == ULINT_UNDEFINED);
+
+ return(row->lock_trx_id == lock_get_trx_id(lock)
+ && row->lock_table_id == lock_get_table_id(lock));
+
+ default:
+ ut_error;
+ return(FALSE);
+ }
+#endif
+}
+
+/*******************************************************************//**
+Searches for a row in the innodb_locks cache that has a specified id.
+This happens in O(1) time since a hash table is used. Returns pointer to
+the row or NULL if none is found.
+@return row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ const lock_t* lock, /*!< in: lock to search for */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ i_s_hash_chain_t* hash_chain;
+
+ HASH_SEARCH(
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* the type of the next variable */
+ i_s_hash_chain_t*,
+ /* auxiliary variable */
+ hash_chain,
+ /* assertion on every traversed item */
+ ,
+ /* this determines if we have found the lock */
+ locks_row_eq_lock(hash_chain->value, lock, heap_no));
+
+ if (hash_chain == NULL) {
+
+ return(NULL);
+ }
+ /* else */
+
+ return(hash_chain->value);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If row can not be allocated then NULL is returned.
+@return row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const lock_t* lock, /*!< in: the element to add */
+ ulint heap_no)/*!< in: lock's record number
+ or ULINT_UNDEFINED if the lock
+ is a table lock */
+{
+ i_s_locks_row_t* dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ ulint i;
+ for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+ /* quit if this lock is already present */
+ dst_row = search_innodb_locks(cache, lock, heap_no);
+ if (dst_row != NULL) {
+
+ return(dst_row);
+ }
+#endif
+
+ dst_row = (i_s_locks_row_t*)
+ table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(NULL);
+ }
+
+ if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_locks.rows_used--;
+ return(NULL);
+ }
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+ HASH_INSERT(
+ /* the type used in the hash chain */
+ i_s_hash_chain_t,
+ /* hash_chain->"next" */
+ next,
+ /* the hash table */
+ cache->locks_hash,
+ /* fold */
+ fold_lock(lock, heap_no),
+ /* add this data to the hash */
+ &dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+ } /* for()-loop */
+#endif
+
+ return(dst_row);
+}
+
+/*******************************************************************//**
+Adds new pair of locks to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const i_s_locks_row_t* requested_lock_row,/*!< in: pointer to the
+ relevant requested lock
+ row in innodb_locks */
+ const i_s_locks_row_t* blocking_lock_row)/*!< in: pointer to the
+ relevant blocking lock
+ row in innodb_locks */
+{
+ i_s_lock_waits_row_t* dst_row;
+
+ dst_row = (i_s_lock_waits_row_t*)
+ table_cache_create_empty_row(&cache->innodb_lock_waits,
+ cache);
+
+ /* memory could not be allocated */
+ if (dst_row == NULL) {
+
+ return(FALSE);
+ }
+
+ fill_lock_waits_row(dst_row, requested_lock_row, blocking_lock_row);
+
+ return(TRUE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+If the transaction is waiting, then the wait lock is added to
+innodb_locks and a pointer to the added row is returned in
+requested_lock_row, otherwise requested_lock_row is set to NULL.
+If rows can not be allocated then FALSE is returned and the value of
+requested_lock_row is undefined.
+@return FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+ trx_i_s_cache_t* cache, /*!< in/out: cache */
+ const trx_t* trx, /*!< in: transaction */
+ i_s_locks_row_t** requested_lock_row)/*!< out: pointer to the
+ requested lock row, or NULL or
+ undefined */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+	/* If the transaction is waiting we add the wait lock and all locks
+	from other transactions that are blocking the wait lock. */
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+ const lock_t* curr_lock;
+ ulint wait_lock_heap_no;
+ i_s_locks_row_t* blocking_lock_row;
+ lock_queue_iterator_t iter;
+
+ ut_a(trx->wait_lock != NULL);
+
+ wait_lock_heap_no
+ = wait_lock_get_heap_no(trx->wait_lock);
+
+ /* add the requested lock */
+ *requested_lock_row
+ = add_lock_to_cache(cache, trx->wait_lock,
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (*requested_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* then iterate over the locks before the wait lock and
+ add the ones that are blocking it */
+
+ lock_queue_iterator_reset(&iter, trx->wait_lock,
+ ULINT_UNDEFINED);
+
+ curr_lock = lock_queue_iterator_get_prev(&iter);
+ while (curr_lock != NULL) {
+
+ if (lock_has_to_wait(trx->wait_lock,
+ curr_lock)) {
+
+ /* add the lock that is
+ blocking trx->wait_lock */
+ blocking_lock_row
+ = add_lock_to_cache(
+ cache, curr_lock,
+ /* heap_no is the same
+ for the wait and waited
+ locks */
+ wait_lock_heap_no);
+
+ /* memory could not be allocated */
+ if (blocking_lock_row == NULL) {
+
+ return(FALSE);
+ }
+
+ /* add the relation between both locks
+ to innodb_lock_waits */
+ if (!add_lock_wait_to_cache(
+ cache, *requested_lock_row,
+ blocking_lock_row)) {
+
+ /* memory could not be allocated */
+ return(FALSE);
+ }
+ }
+
+ curr_lock = lock_queue_iterator_get_prev(&iter);
+ }
+ } else {
+
+ *requested_lock_row = NULL;
+ }
+
+ return(TRUE);
+}
+
+/** The minimum time that a cache must not be updated after it has been
+read for the last time; measured in microseconds. We use this technique
+to ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_US 100000 /* 0.1 sec */
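+
+/** A worked illustration of the rule above (timestamps are made up): if
+the cache was last read at time T, a refresh attempt at T + 50000 us is
+rejected by can_cache_be_updated() and the reader sees the existing
+snapshot, e.g. for a join of INNODB_TRX with INNODB_LOCKS; a refresh
+attempt at T + 150000 us is allowed and may rebuild the cache. */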
+
+/*******************************************************************//**
+Checks if the cache can safely be updated.
+@return TRUE if can be updated */
+static
+ibool
+can_cache_be_updated(
+/*=================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ullint now;
+
+ /* Here we read cache->last_read without acquiring its mutex
+ because last_read is only updated when a shared rw lock on the
+ whole cache is being held (see trx_i_s_cache_end_read()) and
+ we are currently holding an exclusive rw lock on the cache.
+ So it is not possible for last_read to be updated while we are
+ reading it. */
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ now = ut_time_us(NULL);
+ if (now - cache->last_read > CACHE_MIN_IDLE_TIME_US) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*******************************************************************//**
+Declares a cache empty, preparing it to be filled up. Not all resources
+are freed because they can be reused. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+ trx_i_s_cache_t* cache) /*!< out: cache to clear */
+{
+ cache->innodb_trx.rows_used = 0;
+ cache->innodb_locks.rows_used = 0;
+ cache->innodb_lock_waits.rows_used = 0;
+
+ hash_table_clear(cache->locks_hash);
+
+ ha_storage_empty(&cache->storage);
+}
+
+/*******************************************************************//**
+Fetches the data needed to fill the 3 INFORMATION SCHEMA tables into the
+table cache buffer. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache(
+/*==================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ trx_t* trx;
+ i_s_trx_row_t* trx_row;
+ i_s_locks_row_t* requested_lock_row;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx_i_s_cache_clear(cache);
+
+ /* We iterate over the list of all transactions and add each one
+ to innodb_trx's cache. We also add all locks that are relevant
+ to each transaction into innodb_locks' and innodb_lock_waits'
+ caches. */
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+ trx != NULL;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+ if (!add_trx_relevant_locks_to_cache(cache, trx,
+ &requested_lock_row)) {
+
+ cache->is_truncated = TRUE;
+ return;
+ }
+
+ trx_row = (i_s_trx_row_t*)
+ table_cache_create_empty_row(&cache->innodb_trx,
+ cache);
+
+ /* memory could not be allocated */
+ if (trx_row == NULL) {
+
+ cache->is_truncated = TRUE;
+ return;
+ }
+
+ if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) {
+
+ /* memory could not be allocated */
+ cache->innodb_trx.rows_used--;
+ cache->is_truncated = TRUE;
+ return;
+ }
+ }
+
+ cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+Called from handler/i_s.cc.
+@return 0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+ trx_i_s_cache_t* cache) /*!< in/out: cache */
+{
+ if (!can_cache_be_updated(cache)) {
+
+ return(1);
+ }
+
+ /* We need to read trx_sys and record/table lock queues */
+ mutex_enter(&kernel_mutex);
+
+ fetch_data_into_cache(cache);
+
+ mutex_exit(&kernel_mutex);
+
+ return(0);
+}
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< out: cache to init */
+{
+ /* The latching is done in the following order:
+ acquire trx_i_s_cache_t::rw_lock, X
+ acquire kernel_mutex
+ release kernel_mutex
+ release trx_i_s_cache_t::rw_lock
+ acquire trx_i_s_cache_t::rw_lock, S
+ acquire trx_i_s_cache_t::last_read_mutex
+ release trx_i_s_cache_t::last_read_mutex
+ release trx_i_s_cache_t::rw_lock */
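+
+	/* In terms of the public functions in this file, the order above
+	corresponds to a writer doing trx_i_s_cache_start_write(),
+	trx_i_s_possibly_fetch_data_into_cache() and
+	trx_i_s_cache_end_write(), and to a reader doing
+	trx_i_s_cache_start_read(), trx_i_s_cache_get_nth_row() calls
+	and trx_i_s_cache_end_read(). */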
+
+ rw_lock_create(&cache->rw_lock, SYNC_TRX_I_S_RWLOCK);
+
+ cache->last_read = 0;
+
+ mutex_create(&cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ);
+
+ table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+ table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+ table_cache_init(&cache->innodb_lock_waits,
+ sizeof(i_s_lock_waits_row_t));
+
+ cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM);
+
+ cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+ CACHE_STORAGE_HASH_CELLS);
+
+ cache->mem_allocd = 0;
+
+ cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+ trx_i_s_cache_t* cache) /*!< in, own: cache to free */
+{
+ hash_table_free(cache->locks_hash);
+ ha_storage_free(cache->storage);
+ table_cache_free(&cache->innodb_trx);
+ table_cache_free(&cache->innodb_locks);
+ table_cache_free(&cache->innodb_lock_waits);
+ memset(cache, 0, sizeof *cache);
+}
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ ullint now;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED));
+#endif
+
+ /* update cache last read time */
+ now = ut_time_us(NULL);
+ mutex_enter(&cache->last_read_mutex);
+ cache->last_read = now;
+ mutex_exit(&cache->last_read_mutex);
+
+ rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+ rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+ trx_i_s_cache_t* cache) /*!< in: cache */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects an INFORMATION SCHEMA table cache from the whole cache.
+@return table cache */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+ trx_i_s_cache_t* cache, /*!< in: whole cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+#ifdef UNIV_SYNC_DEBUG
+ ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)
+ || rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+ switch (table) {
+ case I_S_INNODB_TRX:
+ table_cache = &cache->innodb_trx;
+ break;
+ case I_S_INNODB_LOCKS:
+ table_cache = &cache->innodb_locks;
+ break;
+ case I_S_INNODB_LOCK_WAITS:
+ table_cache = &cache->innodb_lock_waits;
+ break;
+ default:
+ ut_error;
+ }
+
+ return(table_cache);
+}
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table) /*!< in: which table */
+{
+ i_s_table_cache_t* table_cache;
+
+ table_cache = cache_select_table(cache, table);
+
+ return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given
+INFORMATION SCHEMA table.
+@return row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+ trx_i_s_cache_t* cache, /*!< in: cache */
+ enum i_s_table table, /*!< in: which table */
+ ulint n) /*!< in: row number */
+{
+ i_s_table_cache_t* table_cache;
+ ulint i;
+ void* row;
+
+ table_cache = cache_select_table(cache, table);
+
+ ut_a(n < table_cache->rows_used);
+
+ row = NULL;
+
+ for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+ if (table_cache->chunks[i].offset
+ + table_cache->chunks[i].rows_allocd > n) {
+
+ row = (char*) table_cache->chunks[i].base
+ + (n - table_cache->chunks[i].offset)
+ * table_cache->row_size;
+ break;
+ }
+ }
+
+ ut_a(row != NULL);
+
+ return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+ const i_s_locks_row_t* row, /*!< in: innodb_locks row */
+ char* lock_id,/*!< out: resulting lock_id */
+ ulint lock_id_size)/*!< in: size of the lock id
+ buffer */
+{
+ int res_len;
+
+ /* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
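+	/* For illustration (the values are made up): a record lock might
+	produce a lock id such as "1234:5:3:42", i.e.
+	trx_id:space:page:heap_no, while a table lock might produce
+	"1234:567", i.e. trx_id:table_id. */
+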
+ if (row->lock_space != ULINT_UNDEFINED) {
+ /* record lock */
+ res_len = ut_snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT ":%lu:%lu:%lu",
+ row->lock_trx_id, row->lock_space,
+ row->lock_page, row->lock_rec);
+ } else {
+ /* table lock */
+ res_len = ut_snprintf(lock_id, lock_id_size,
+ TRX_ID_FMT ":%llu",
+ row->lock_trx_id,
+ row->lock_table_id);
+ }
+
+ /* the typecast is safe because snprintf(3) never returns
+	a negative result */
+ ut_a(res_len >= 0);
+ ut_a((ulint) res_len < lock_id_size);
+
+ return(lock_id);
+}
diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.c
new file mode 100644
index 00000000000..1c317665878
--- /dev/null
+++ b/storage/xtradb/trx/trx0purge.c
@@ -0,0 +1,1288 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.c
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+
+#ifdef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "read0read.h"
+#include "fut0fut.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "srv0que.h"
+#include "os0thread.h"
+
+/** The global data structure coordinating a purge */
+UNIV_INTERN trx_purge_t* purge_sys = NULL;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+UNIV_INTERN trx_undo_rec_t trx_purge_dummy_rec;
+
+/*****************************************************************//**
+Checks if trx_id is >= purge_view: then it is guaranteed that its update
+undo log still exists in the system.
+@return TRUE if it is sure that it is preserved; even if the function
+returns FALSE, it is possible that the undo log still exists in the
+system */
+UNIV_INTERN
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+ trx_id_t trx_id) /*!< in: transaction id */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!read_view_sees_trx_id(purge_sys->view, trx_id)) {
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/*=================== PURGE RECORD ARRAY =============================*/
+
+/*******************************************************************//**
+Stores info of an undo log record during a purge.
+@return pointer to the storage cell */
+static
+trx_undo_inf_t*
+trx_purge_arr_store_info(
+/*=====================*/
+ trx_id_t trx_no, /*!< in: transaction number */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_undo_arr_t* arr;
+ ulint i;
+
+ arr = purge_sys->arr;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!(cell->in_use)) {
+ /* Not in use, we may store here */
+ cell->undo_no = undo_no;
+ cell->trx_no = trx_no;
+ cell->in_use = TRUE;
+
+ arr->n_used++;
+
+ return(cell);
+ }
+ }
+}
+
+/*******************************************************************//**
+Removes info of an undo log record during a purge. */
+UNIV_INLINE
+void
+trx_purge_arr_remove_info(
+/*======================*/
+ trx_undo_inf_t* cell) /*!< in: pointer to the storage cell */
+{
+ trx_undo_arr_t* arr;
+
+ arr = purge_sys->arr;
+
+ cell->in_use = FALSE;
+
+ ut_ad(arr->n_used > 0);
+
+ arr->n_used--;
+}
+
+/*******************************************************************//**
+Gets the biggest pair of a trx number and an undo number in a purge array. */
+static
+void
+trx_purge_arr_get_biggest(
+/*======================*/
+ trx_undo_arr_t* arr, /*!< in: purge array */
+ trx_id_t* trx_no, /*!< out: transaction number: ut_dulint_zero
+ if array is empty */
+ undo_no_t* undo_no)/*!< out: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_id_t pair_trx_no;
+ undo_no_t pair_undo_no;
+ int trx_cmp;
+ ulint n_used;
+ ulint i;
+ ulint n;
+
+ n = 0;
+ n_used = arr->n_used;
+ pair_trx_no = ut_dulint_zero;
+ pair_undo_no = ut_dulint_zero;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (cell->in_use) {
+ n++;
+ trx_cmp = ut_dulint_cmp(cell->trx_no, pair_trx_no);
+
+ if ((trx_cmp > 0)
+ || ((trx_cmp == 0)
+ && (ut_dulint_cmp(cell->undo_no,
+ pair_undo_no) >= 0))) {
+
+ pair_trx_no = cell->trx_no;
+ pair_undo_no = cell->undo_no;
+ }
+ }
+
+ if (n == n_used) {
+ *trx_no = pair_trx_no;
+ *undo_no = pair_undo_no;
+
+ return;
+ }
+ }
+}
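+
+/* Example for the function above (made-up numbers): with the cells
+(trx_no 5, undo_no 3), (trx_no 9, undo_no 1) and (trx_no 5, undo_no 7)
+in use, the biggest pair is (9, 1): the transaction number is compared
+first and the undo number only breaks ties. */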
+
+/****************************************************************//**
+Builds a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return own: the query graph */
+static
+que_t*
+trx_purge_graph_build(
+/*=======================*/
+ trx_t* trx)
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ thr->child = row_purge_node_create(thr, heap);
+
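+	/* The resulting graph is: fork (QUE_FORK_PURGE) -> one query
+	thread -> row purge node; the purge itself is performed by
+	executing this graph. */
+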
+ /* thr2 = que_thr_create(fork, fork, heap);
+
+ thr2->child = row_purge_node_create(fork, thr2, heap); */
+
+ return(fork);
+}
+
+/********************************************************************//**
+Creates the global purge system control structure and inits the history
+mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(void)
+/*======================*/
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ purge_sys = mem_alloc(sizeof(trx_purge_t));
+
+ purge_sys->state = TRX_STOP_PURGE;
+
+ purge_sys->n_pages_handled = 0;
+
+ purge_sys->purge_trx_no = ut_dulint_zero;
+ purge_sys->purge_undo_no = ut_dulint_zero;
+ purge_sys->next_stored = FALSE;
+
+ rw_lock_create(&purge_sys->latch, SYNC_PURGE_LATCH);
+
+ mutex_create(&purge_sys->mutex, SYNC_PURGE_SYS);
+
+ purge_sys->heap = mem_heap_create(256);
+
+ purge_sys->arr = trx_undo_arr_create();
+
+ purge_sys->sess = sess_open();
+
+ purge_sys->trx = purge_sys->sess->trx;
+
+ purge_sys->trx->is_purge = 1;
+
+ ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED));
+
+ purge_sys->query = trx_purge_graph_build(purge_sys->trx);
+
+ purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero,
+ purge_sys->heap);
+
+ purge_sys->n_worker = 0;
+ if (srv_use_purge_thread > 1) {
+ /* Use worker threads */
+ ulint i;
+
+ purge_sys->n_worker = srv_use_purge_thread - 1;
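+		/* e.g. with srv_use_purge_thread == 4 the main purge
+		session above is complemented by 3 worker sessions that
+		are set up below */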
+
+ purge_sys->sess_arr = mem_alloc(sizeof(sess_t*) * purge_sys->n_worker);
+ purge_sys->trx_arr = mem_alloc(sizeof(trx_t*) * purge_sys->n_worker);
+ purge_sys->query_arr = mem_alloc(sizeof(que_t*) * purge_sys->n_worker);
+
+ purge_sys->worker_event = os_event_create(NULL);
+ os_event_reset(purge_sys->worker_event);
+
+ for (i = 0; i < purge_sys->n_worker; i++) {
+ purge_sys->sess_arr[i] = sess_open();
+
+ purge_sys->trx_arr[i] = purge_sys->sess_arr[i]->trx;
+ purge_sys->trx_arr[i]->is_purge = 1;
+ ut_a(trx_start_low(purge_sys->trx_arr[i], ULINT_UNDEFINED));
+
+ purge_sys->query_arr[i] = trx_purge_graph_build(purge_sys->trx_arr[i]);
+ }
+ }
+}
+
+/********************************************************************//**
+Frees the global purge system control structure. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void)
+/*======================*/
+{
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ que_graph_free(purge_sys->query);
+
+ ut_a(purge_sys->sess->trx->is_purge);
+ purge_sys->sess->trx->conc_state = TRX_NOT_STARTED;
+ sess_close(purge_sys->sess);
+ purge_sys->sess = NULL;
+
+ if (purge_sys->view != NULL) {
+		/* Acquiring the kernel mutex is a pre-condition of
+		read_view_close(); we do not really need the mutex here. */
+ mutex_enter(&kernel_mutex);
+
+ read_view_close(purge_sys->view);
+ purge_sys->view = NULL;
+
+ mutex_exit(&kernel_mutex);
+ }
+
+ trx_undo_arr_free(purge_sys->arr);
+
+ rw_lock_free(&purge_sys->latch);
+ mutex_free(&purge_sys->mutex);
+
+ mem_heap_free(purge_sys->heap);
+ mem_free(purge_sys);
+
+ purge_sys = NULL;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/********************************************************************//**
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+ trx_t* trx, /*!< in: transaction */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_t* undo;
+ trx_rseg_t* rseg;
+ trx_rsegf_t* rseg_header;
+ trx_usegf_t* seg_header;
+ trx_ulogf_t* undo_header;
+ trx_upagef_t* page_header;
+ ulint hist_size;
+
+ undo = trx->update_undo;
+
+ ut_ad(undo);
+
+ rseg = undo->rseg;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, mtr);
+
+ undo_header = undo_page + undo->hdr_offset;
+ seg_header = undo_page + TRX_UNDO_SEG_HDR;
+ page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+ if (undo->state != TRX_UNDO_CACHED) {
+ /* The undo log segment will not be reused */
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ ut_error;
+ }
+
+ trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
+
+ hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr);
+ ut_ad(undo->size == flst_get_len(
+ seg_header + TRX_UNDO_PAGE_LIST, mtr));
+
+ mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size + undo->size, MLOG_4BYTES, mtr);
+ }
+
+ /* Add the log as the first in the history list */
+ flst_add_first(rseg_header + TRX_RSEG_HISTORY,
+ undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+ mutex_enter(&kernel_mutex);
+ trx_sys->rseg_history_len++;
+ mutex_exit(&kernel_mutex);
+
+ /* Write the trx number to the undo log header */
+ mlog_write_dulint(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
+ /* Write information about delete markings to the undo log header */
+
+ if (!undo->del_marks) {
+ mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, mtr);
+ }
+
+ if (rseg->last_page_no == FIL_NULL) {
+
+ rseg->last_page_no = undo->hdr_page_no;
+ rseg->last_offset = undo->hdr_offset;
+ rseg->last_trx_no = trx->no;
+ rseg->last_del_marks = undo->del_marks;
+ }
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is in the history list. Cuts the end of the
+history list at the youngest undo log in this segment. */
+static
+void
+trx_purge_free_segment(
+/*===================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ fil_addr_t hdr_addr, /*!< in: the file address of log_hdr */
+ ulint n_removed_logs) /*!< in: count of how many undo logs we
+ will cut off from the end of the
+ history list */
+{
+ page_t* undo_page;
+ trx_rsegf_t* rseg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ ibool freed;
+ ulint seg_size;
+ ulint hist_size;
+ ibool marked = FALSE;
+ mtr_t mtr;
+
+ /* fputs("Freeing an update undo log segment\n", stderr); */
+
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+loop:
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ hdr_addr.page, &mtr);
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ /* Mark the last undo log totally purged, so that if the system
+ crashes, the tail of the undo log will not get accessed again. The
+ list of pages in the undo log tail gets inconsistent during the
+ freeing of the segment, and therefore purge should not try to access
+ them again. */
+
+ if (!marked) {
+ mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+ MLOG_2BYTES, &mtr);
+ marked = TRUE;
+ }
+
+ freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
+ &mtr);
+ if (!freed) {
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ goto loop;
+ }
+
+ /* The page list may now be inconsistent, but the length field
+ stored in the list base node tells us how big it was before we
+ started the freeing. */
+
+ seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr);
+
+ /* We may free the undo log segment header page; it must be freed
+ within the same mtr as the undo log header is removed from the
+ history list: otherwise, in case of a database crash, the segment
+ could become inaccessible garbage in the file space. */
+
+ flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
+
+ mutex_enter(&kernel_mutex);
+ ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&kernel_mutex);
+
+ freed = FALSE;
+
+ while (!freed) {
+ /* Here we assume that a file segment with just the header
+ page can be freed in a few steps, so that the buffer pool
+ is not flooded with bufferfixed pages: see the note in
+ fsp0fsp.c. */
+
+ freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
+ &mtr);
+ }
+
+ hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, &mtr);
+ ut_ad(hist_size >= seg_size);
+
+ mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+ hist_size - seg_size, MLOG_4BYTES, &mtr);
+
+ ut_ad(rseg->curr_size >= seg_size);
+
+ rseg->curr_size -= seg_size;
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Removes unnecessary history data from a rollback segment. */
+static
+void
+trx_purge_truncate_rseg_history(
+/*============================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ trx_id_t limit_trx_no, /*!< in: remove update undo logs whose
+ trx number is < limit_trx_no */
+ undo_no_t limit_undo_no) /*!< in: if transaction number is equal
+ to limit_trx_no, truncate undo records
+ with undo number < limit_undo_no */
+{
+ fil_addr_t hdr_addr;
+ fil_addr_t prev_hdr_addr;
+ trx_rsegf_t* rseg_hdr;
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ int cmp;
+ ulint n_removed_logs = 0;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
+loop:
+ if (hdr_addr.page == FIL_NULL) {
+
+ mutex_exit(&(rseg->mutex));
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ hdr_addr.page, &mtr);
+
+ log_hdr = undo_page + hdr_addr.boffset;
+
+ cmp = ut_dulint_cmp(mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO),
+ limit_trx_no);
+ if (cmp == 0) {
+ trx_undo_truncate_start(rseg, rseg->space, hdr_addr.page,
+ hdr_addr.boffset, limit_undo_no);
+ }
+
+ if (cmp >= 0) {
+ mutex_enter(&kernel_mutex);
+ ut_a(trx_sys->rseg_history_len >= n_removed_logs);
+ trx_sys->rseg_history_len -= n_removed_logs;
+ mutex_exit(&kernel_mutex);
+
+ flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
+ log_hdr + TRX_UNDO_HISTORY_NODE,
+ n_removed_logs, &mtr);
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ prev_hdr_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+ n_removed_logs++;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
+ && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {
+
+ /* We can free the whole log segment */
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ trx_purge_free_segment(rseg, hdr_addr, n_removed_logs);
+
+ n_removed_logs = 0;
+ } else {
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ }
+
+ mtr_start(&mtr);
+ mutex_enter(&(rseg->mutex));
+
+ rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ hdr_addr = prev_hdr_addr;
+
+ goto loop;
+}
+
+/********************************************************************//**
+Removes unnecessary history data from rollback segments. NOTE that when this
+function is called, the caller must not have any latches on undo log pages! */
+static
+void
+trx_purge_truncate_history(void)
+/*============================*/
+{
+ trx_rseg_t* rseg;
+ trx_id_t limit_trx_no;
+ undo_no_t limit_undo_no;
+
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+
+ trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no,
+ &limit_undo_no);
+
+ if (ut_dulint_is_zero(limit_trx_no)) {
+
+ limit_trx_no = purge_sys->purge_trx_no;
+ limit_undo_no = purge_sys->purge_undo_no;
+ }
+
+ /* We play safe and set the truncate limit at most to the purge view
+ low_limit number, though this is not necessary */
+
+ if (ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) >= 0) {
+ limit_trx_no = purge_sys->view->low_limit_no;
+ limit_undo_no = ut_dulint_zero;
+ }
+
+ ut_ad((ut_dulint_cmp(limit_trx_no,
+ purge_sys->view->low_limit_no) <= 0));
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ while (rseg) {
+ trx_purge_truncate_rseg_history(rseg, limit_trx_no,
+ limit_undo_no);
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ }
+}
+
+/********************************************************************//**
+Does a truncate if the purge array is empty. NOTE that when this function is
+called, the caller must not have any latches on undo log pages!
+@return TRUE if array empty */
+UNIV_INLINE
+ibool
+trx_purge_truncate_if_arr_empty(void)
+/*=================================*/
+{
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+
+ if (purge_sys->arr->n_used == 0) {
+
+ trx_purge_truncate_history();
+
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Also advances purge_sys->purge_trx_no past the purged log. */
+static
+void
+trx_purge_rseg_get_next_history_log(
+/*================================*/
+ trx_rseg_t* rseg) /*!< in: rollback segment */
+{
+ page_t* undo_page;
+ trx_ulogf_t* log_hdr;
+ trx_usegf_t* seg_hdr;
+ fil_addr_t prev_log_addr;
+ trx_id_t trx_no;
+ ibool del_marks;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+
+ mutex_enter(&(rseg->mutex));
+
+ ut_a(rseg->last_page_no != FIL_NULL);
+
+ purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1);
+ purge_sys->purge_undo_no = ut_dulint_zero;
+ purge_sys->next_stored = FALSE;
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ rseg->last_page_no, &mtr);
+ log_hdr = undo_page + rseg->last_offset;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ /* Increase the purge page count by one for every handled log */
+
+ purge_sys->n_pages_handled++;
+
+ prev_log_addr = trx_purge_get_log_from_hist(
+ flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+ if (prev_log_addr.page == FIL_NULL) {
+ /* No logs left in the history list */
+
+ rseg->last_page_no = FIL_NULL;
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ mutex_enter(&kernel_mutex);
+
+ /* Add debug code to track history list corruption reported
+ on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
+ file-based list was corrupt. The prev node pointer was
+ FIL_NULL, even though the list length was over 8 million nodes!
+		We assume that purge truncates the history list in moderate
+		size pieces, and if we reach the head of the list here, the
+		list cannot be longer than 20 000 undo logs now. */
+
+ if (trx_sys->rseg_history_len > 20000) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: purge reached the"
+ " head of the history list,\n"
+ "InnoDB: but its length is still"
+ " reported as %lu! Make a detailed bug\n"
+ "InnoDB: report, and submit it"
+ " to http://bugs.mysql.com\n",
+ (ulong) trx_sys->rseg_history_len);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return;
+ }
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+ /* Read the trx number and del marks from the previous log header */
+ mtr_start(&mtr);
+
+ log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ prev_log_addr.page, &mtr)
+ + prev_log_addr.boffset;
+
+ trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+ del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);
+
+ mtr_commit(&mtr);
+
+ mutex_enter(&(rseg->mutex));
+
+ rseg->last_page_no = prev_log_addr.page;
+ rseg->last_offset = prev_log_addr.boffset;
+ rseg->last_trx_no = trx_no;
+ rseg->last_del_marks = del_marks;
+
+ mutex_exit(&(rseg->mutex));
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+ trx_undo_rec_t* rec;
+ trx_rseg_t* rseg;
+ trx_rseg_t* min_rseg;
+ trx_id_t min_trx_no;
+	ulint		space = 0;	/* initialized only to silence a compiler warning */
+	ulint		zip_size = 0;
+	ulint		page_no = 0;	/* initialized only to silence a compiler warning */
+	ulint		offset = 0;	/* initialized only to silence a compiler warning */
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+ ut_ad(purge_sys->next_stored == FALSE);
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ min_trx_no = ut_dulint_max;
+
+ min_rseg = NULL;
+
+ while (rseg) {
+ mutex_enter(&(rseg->mutex));
+
+ if (rseg->last_page_no != FIL_NULL) {
+
+ if ((min_rseg == NULL)
+ || (ut_dulint_cmp(min_trx_no,
+ rseg->last_trx_no) > 0)) {
+
+ min_rseg = rseg;
+ min_trx_no = rseg->last_trx_no;
+ space = rseg->space;
+ zip_size = rseg->zip_size;
+ ut_a(space == 0); /* We assume in purge of
+ externally stored fields
+ that space id == 0 */
+ page_no = rseg->last_page_no;
+ offset = rseg->last_offset;
+ }
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ }
+
+ if (min_rseg == NULL) {
+
+ return;
+ }
+
+ mtr_start(&mtr);
+
+ if (!min_rseg->last_del_marks) {
+ /* No need to purge this log */
+
+ rec = &trx_purge_dummy_rec;
+ } else {
+ rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
+ RW_S_LATCH, &mtr);
+ if (rec == NULL) {
+ /* Undo log empty */
+
+ rec = &trx_purge_dummy_rec;
+ }
+ }
+
+ purge_sys->next_stored = TRUE;
+ purge_sys->rseg = min_rseg;
+
+ purge_sys->hdr_page_no = page_no;
+ purge_sys->hdr_offset = offset;
+
+ purge_sys->purge_trx_no = min_trx_no;
+
+ if (rec == &trx_purge_dummy_rec) {
+
+ purge_sys->purge_undo_no = ut_dulint_zero;
+ purge_sys->page_no = page_no;
+ purge_sys->offset = 0;
+ } else {
+ purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec);
+
+ purge_sys->page_no = page_get_page_no(page_align(rec));
+ purge_sys->offset = page_offset(rec);
+ }
+
+ mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Gets the next record to purge and updates the info in the purge system.
+@return copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* rec_copy;
+ trx_undo_rec_t* rec2;
+ trx_undo_rec_t* next_rec;
+ page_t* undo_page;
+ page_t* page;
+ ulint offset;
+ ulint page_no;
+ ulint space;
+ ulint zip_size;
+ ulint type;
+ ulint cmpl_info;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(purge_sys->mutex)));
+ ut_ad(purge_sys->next_stored);
+
+ space = purge_sys->rseg->space;
+ zip_size = purge_sys->rseg->zip_size;
+ page_no = purge_sys->page_no;
+ offset = purge_sys->offset;
+
+ if (offset == 0) {
+ /* It is the dummy undo log record, which means that there is
+ no need to purge this undo log */
+
+ trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ return(&trx_purge_dummy_rec);
+ }
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, zip_size,
+ page_no, &mtr);
+ rec = undo_page + offset;
+
+ rec2 = rec;
+
+ for (;;) {
+ /* Try first to find the next record which requires a purge
+ operation from the same page of the same undo log */
+
+ next_rec = trx_undo_page_get_next_rec(rec2,
+ purge_sys->hdr_page_no,
+ purge_sys->hdr_offset);
+ if (next_rec == NULL) {
+ rec2 = trx_undo_get_next_rec(
+ rec2, purge_sys->hdr_page_no,
+ purge_sys->hdr_offset, &mtr);
+ break;
+ }
+
+ rec2 = next_rec;
+
+ type = trx_undo_rec_get_type(rec2);
+
+ if (type == TRX_UNDO_DEL_MARK_REC) {
+
+ break;
+ }
+
+ cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
+
+ if (trx_undo_rec_get_extern_storage(rec2)) {
+ break;
+ }
+
+ if ((type == TRX_UNDO_UPD_EXIST_REC)
+ && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ break;
+ }
+ }
+
+ if (rec2 == NULL) {
+ mtr_commit(&mtr);
+
+ trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+ /* Look for the next undo log and record to purge */
+
+ trx_purge_choose_next_log();
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(space, zip_size,
+ page_no, &mtr);
+
+ rec = undo_page + offset;
+ } else {
+ page = page_align(rec2);
+
+ purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
+ purge_sys->page_no = page_get_page_no(page);
+ purge_sys->offset = rec2 - page;
+
+ if (undo_page != page) {
+ /* We advance to a new page of the undo log: */
+ purge_sys->n_pages_handled++;
+ }
+ }
+
+ rec_copy = trx_undo_rec_copy(rec, heap);
+
+ mtr_commit(&mtr);
+
+ return(rec_copy);
+}
+
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. The storage
+cell reserved for the record must be released with trx_purge_rec_release().
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ trx_undo_inf_t** cell, /*!< out: storage cell for the record in the
+ purge array */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* undo_rec;
+
+ mutex_enter(&(purge_sys->mutex));
+
+ if (purge_sys->state == TRX_STOP_PURGE) {
+ trx_purge_truncate_if_arr_empty();
+
+ mutex_exit(&(purge_sys->mutex));
+
+ return(NULL);
+ }
+
+ if (!purge_sys->next_stored) {
+ trx_purge_choose_next_log();
+
+ if (!purge_sys->next_stored) {
+ purge_sys->state = TRX_STOP_PURGE;
+
+ trx_purge_truncate_if_arr_empty();
+
+ if (srv_print_thread_releases) {
+ fprintf(stderr,
+ "Purge: No logs left in the"
+ " history list; pages handled %lu\n",
+ (ulong) purge_sys->n_pages_handled);
+ }
+
+ mutex_exit(&(purge_sys->mutex));
+
+ return(NULL);
+ }
+ }
+
+ if (purge_sys->n_pages_handled >= purge_sys->handle_limit) {
+
+ purge_sys->state = TRX_STOP_PURGE;
+
+ trx_purge_truncate_if_arr_empty();
+
+ mutex_exit(&(purge_sys->mutex));
+
+ return(NULL);
+ }
+
+ if (ut_dulint_cmp(purge_sys->purge_trx_no,
+ purge_sys->view->low_limit_no) >= 0) {
+ purge_sys->state = TRX_STOP_PURGE;
+
+ trx_purge_truncate_if_arr_empty();
+
+ mutex_exit(&(purge_sys->mutex));
+
+ return(NULL);
+ }
+
+ /* fprintf(stderr, "Thread %lu purging trx %lu undo record %lu\n",
+ os_thread_get_curr_id(),
+ ut_dulint_get_low(purge_sys->purge_trx_no),
+ ut_dulint_get_low(purge_sys->purge_undo_no)); */
+
+ *roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id,
+ purge_sys->page_no,
+ purge_sys->offset);
+
+ *cell = trx_purge_arr_store_info(purge_sys->purge_trx_no,
+ purge_sys->purge_undo_no);
+
+ ut_ad(ut_dulint_cmp(purge_sys->purge_trx_no,
+ (purge_sys->view)->low_limit_no) < 0);
+
+ /* The following call will advance the stored values of purge_trx_no
+ and purge_undo_no, therefore we had to store them first */
+
+ undo_rec = trx_purge_get_next_rec(heap);
+
+ mutex_exit(&(purge_sys->mutex));
+
+ return(undo_rec);
+}
+
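+/* Sketch of the intended caller protocol (not copied from any caller): every
+non-NULL return from trx_purge_fetch_next_rec(), including the dummy record,
+has a purge array cell reserved for it by trx_purge_arr_store_info() above,
+and that cell must eventually be handed back to trx_purge_rec_release()
+below once the record has been processed. */
+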
+/*******************************************************************//**
+Releases a reserved purge undo record. */
+UNIV_INTERN
+void
+trx_purge_rec_release(
+/*==================*/
+ trx_undo_inf_t* cell) /*!< in: storage cell */
+{
+ trx_undo_arr_t* arr;
+
+ mutex_enter(&(purge_sys->mutex));
+
+ arr = purge_sys->arr;
+
+ trx_purge_arr_remove_info(cell);
+
+ mutex_exit(&(purge_sys->mutex));
+}
+
+/*******************************************************************//**
+This function runs a purge batch.
+@return number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(void)
+/*===========*/
+{
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+ ulint old_pages_handled;
+
+ mutex_enter(&(purge_sys->mutex));
+
+ if (purge_sys->trx->n_active_thrs > 0) {
+
+ mutex_exit(&(purge_sys->mutex));
+
+ /* Should not happen */
+
+ ut_error;
+
+ return(0);
+ }
+
+ rw_lock_x_lock(&(purge_sys->latch));
+
+ mutex_enter(&kernel_mutex);
+
+ /* Close and free the old purge view */
+
+ read_view_close(purge_sys->view);
+ purge_sys->view = NULL;
+ mem_heap_empty(purge_sys->heap);
+
+	/* Determine for how long data manipulation language (DML)
+	statements need to be delayed in order to reduce the
+	lag of the purge thread. */
+ srv_dml_needed_delay = 0; /* in microseconds; default: no delay */
+
+ /* If we cannot advance the 'purge view' because of an old
+ 'consistent read view', then the DML statements cannot be delayed.
+ Also, srv_max_purge_lag <= 0 means 'infinity'. */
+ if (srv_max_purge_lag > 0) {
+ float ratio = (float) trx_sys->rseg_history_len
+ / srv_max_purge_lag;
+ if (ratio > ULINT_MAX / 10000) {
+ /* Avoid overflow: maximum delay is 4295 seconds */
+ srv_dml_needed_delay = ULINT_MAX;
+ } else if (ratio > 1) {
+			/* If the history list length exceeds
+			innodb_max_purge_lag, data manipulation
+			statements are delayed by at least 5000
+			microseconds. */
+ srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
+ }
+ }
+
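+	/* Illustrative example (numbers are hypothetical; only the
+	formula above is authoritative): with innodb_max_purge_lag set
+	to 100000 and a history list of 200000 undo logs, ratio = 2.0
+	and srv_dml_needed_delay = (2.0 - 0.5) * 10000 = 15000
+	microseconds; a ratio just above 1 gives a delay just above
+	5000 microseconds, the minimum delay mentioned above. */
+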
+ purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero,
+ purge_sys->heap);
+ mutex_exit(&kernel_mutex);
+
+ rw_lock_x_unlock(&(purge_sys->latch));
+
+ purge_sys->state = TRX_PURGE_ON;
+
+	/* Handle at most 20 undo log pages per purge thread in one purge batch */
+
+ purge_sys->handle_limit = purge_sys->n_pages_handled + 20 * (srv_use_purge_thread + 1);
+
+ old_pages_handled = purge_sys->n_pages_handled;
+
+ mutex_exit(&(purge_sys->mutex));
+
+ mutex_enter(&kernel_mutex);
+
+ thr = que_fork_start_command(purge_sys->query);
+
+ ut_ad(thr);
+
+ /* thr2 = que_fork_start_command(purge_sys->query);
+
+ ut_ad(thr2); */
+
+
+ mutex_exit(&kernel_mutex);
+
+ if (purge_sys->n_worker)
+ os_event_set(purge_sys->worker_event);
+
+ /* srv_que_task_enqueue(thr2); */
+
+ if (srv_print_thread_releases) {
+
+ fputs("Starting purge\n", stderr);
+ }
+
+ que_run_threads(thr);
+
+ if (purge_sys->n_worker)
+ os_event_reset(purge_sys->worker_event);
+
+ if (srv_print_thread_releases) {
+
+ fprintf(stderr,
+ "Purge ends; pages handled %lu\n",
+ (ulong) purge_sys->n_pages_handled);
+ }
+
+ return(purge_sys->n_pages_handled - old_pages_handled);
+}
+
+/**********************************************************************//**
+This function runs a purge worker batch */
+UNIV_INTERN
+void
+trx_purge_worker(
+/*=============*/
+ ulint worker_id)
+{
+ que_thr_t* thr;
+
+ mutex_enter(&kernel_mutex);
+
+ thr = que_fork_start_command(purge_sys->query_arr[worker_id]);
+
+ ut_ad(thr);
+
+ mutex_exit(&kernel_mutex);
+
+ que_run_threads(thr);
+
+ if (purge_sys->state == TRX_STOP_PURGE) { /* optimistic */
+ os_event_reset(purge_sys->worker_event);
+ }
+}
+
+/**********************************************************************//**
+This function waits for the purge worker batch event */
+UNIV_INTERN
+void
+trx_purge_worker_wait(void)
+/*=======================*/
+{
+ os_event_wait(purge_sys->worker_event);
+}
+
+/**********************************************************************//**
+This function wakes the waiting purge worker threads */
+UNIV_INTERN
+void
+trx_purge_worker_wake(void)
+/*=======================*/
+{
+ if (purge_sys->n_worker)
+ os_event_set(purge_sys->worker_event);
+}
+
+/******************************************************************//**
+Prints information of the purge system to stderr. */
+UNIV_INTERN
+void
+trx_purge_sys_print(void)
+/*=====================*/
+{
+ fprintf(stderr, "InnoDB: Purge system view:\n");
+ read_view_print(purge_sys->view);
+
+ fprintf(stderr, "InnoDB: Purge trx n:o " TRX_ID_FMT
+ ", undo n:o " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(purge_sys->purge_trx_no),
+ TRX_ID_PREP_PRINTF(purge_sys->purge_undo_no));
+ fprintf(stderr,
+ "InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
+ "InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
+ (ulong) purge_sys->next_stored,
+ (ulong) purge_sys->page_no,
+ (ulong) purge_sys->offset,
+ (ulong) purge_sys->hdr_page_no,
+ (ulong) purge_sys->hdr_offset);
+}
diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c
new file mode 100644
index 00000000000..f50e10ed756
--- /dev/null
+++ b/storage/xtradb/trx/trx0rec.c
@@ -0,0 +1,1611 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.c
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of the inserted undo log record on the undo log
+page. */
+UNIV_INLINE
+void
+trx_undof_page_add_undo_rec_log(
+/*============================*/
+ page_t* undo_page, /*!< in: undo log page */
+ ulint old_free, /*!< in: start offset of the inserted entry */
+ ulint new_free, /*!< in: end offset of the entry */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ byte* log_ptr;
+ const byte* log_end;
+ ulint len;
+
+ log_ptr = mlog_open(mtr, 11 + 13 + MLOG_BUF_MARGIN);
+
+ if (log_ptr == NULL) {
+
+ return;
+ }
+
+ log_end = &log_ptr[11 + 13 + MLOG_BUF_MARGIN];
+ log_ptr = mlog_write_initial_log_record_fast(
+ undo_page, MLOG_UNDO_INSERT, log_ptr, mtr);
+ len = new_free - old_free - 4;
+
+ mach_write_to_2(log_ptr, len);
+ log_ptr += 2;
+
+ if (log_ptr + len <= log_end) {
+ memcpy(log_ptr, undo_page + old_free + 2, len);
+ mlog_close(mtr, log_ptr + len);
+ } else {
+ mlog_close(mtr, log_ptr);
+ mlog_catenate_string(mtr, undo_page + old_free + 2, len);
+ }
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page) /*!< in: page or NULL */
+{
+ ulint len;
+ byte* rec;
+ ulint first_free;
+
+ if (end_ptr < ptr + 2) {
+
+ return(NULL);
+ }
+
+ len = mach_read_from_2(ptr);
+ ptr += 2;
+
+ if (end_ptr < ptr + len) {
+
+ return(NULL);
+ }
+
+ if (page == NULL) {
+
+ return(ptr + len);
+ }
+
+ first_free = mach_read_from_2(page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ rec = page + first_free;
+
+ mach_write_to_2(rec, first_free + 4 + len);
+ mach_write_to_2(rec + 2 + len, first_free);
+
+ mach_write_to_2(page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+ first_free + 4 + len);
+ ut_memcpy(rec + 2, ptr, len);
+
+ return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Calculates the free space left for extending an undo log record.
+@return bytes left */
+UNIV_INLINE
+ulint
+trx_undo_left(
+/*==========*/
+ const page_t* page, /*!< in: undo log page */
+ const byte* ptr) /*!< in: pointer to page */
+{
+ /* The '- 10' is a safety margin, in case we have some small
+ calculation error below */
+
+ return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END);
+}
+
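+/* Worked example (illustrative only; the formula in trx_undo_left() above is
+authoritative): with UNIV_PAGE_SIZE of 16384 bytes, a ptr that is 1000 bytes
+into the page leaves 16384 - 1000 - 10 - FIL_PAGE_DATA_END bytes of room for
+the undo record being written. */
+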
+/**********************************************************************//**
+Set the next and previous pointers in the undo page for the undo record
+that was written to ptr. Update the first free value by the number of bytes
+written for this undo record.
+@return offset of the inserted entry on the page if it succeeded, 0 on failure */
+static
+ulint
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+ page_t* undo_page, /*!< in/out: undo log page */
+ byte* ptr, /*!< in: ptr up to where data has been
+ written on this undo page. */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free; /*!< offset within undo_page */
+ ulint end_of_rec; /*!< offset within undo_page */
+ byte* ptr_to_first_free;
+ /* pointer within undo_page
+ that points to the next free
+ offset value within undo_page.*/
+
+ ut_ad(ptr > undo_page);
+ ut_ad(ptr < undo_page + UNIV_PAGE_SIZE);
+
+ if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) {
+
+ return(0);
+ }
+
+ ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;
+
+ first_free = mach_read_from_2(ptr_to_first_free);
+
+ /* Write offset of the previous undo log record */
+ mach_write_to_2(ptr, first_free);
+ ptr += 2;
+
+ end_of_rec = ptr - undo_page;
+
+ /* Write offset of the next undo log record */
+ mach_write_to_2(undo_page + first_free, end_of_rec);
+
+ /* Update the offset to first free undo record */
+ mach_write_to_2(ptr_to_first_free, end_of_rec);
+
+ /* Write this log entry to the UNDO log */
+ trx_undof_page_add_undo_rec_log(undo_page, first_free,
+ end_of_rec, mtr);
+
+ return(first_free);
+}
+
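+/* Resulting page layout, sketched from the code above: each undo record is
+stored as [2-byte offset of the next record][record body][2-byte offset of
+the start of this record], and the TRX_UNDO_PAGE_FREE field in the page
+header always points just past the last record, i.e. to where the next
+record will begin. */
+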
+/**********************************************************************//**
+Reports in the undo log of an insert of a clustered index record.
+@return offset of the inserted entry on the page on success, 0 on failure */
+static
+ulint
+trx_undo_page_report_insert(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: index entry which will be
+ inserted to the clustered index */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free;
+ byte* ptr;
+ ulint i;
+
+ ut_ad(dict_index_is_clust(index));
+ ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT);
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ ptr = undo_page + first_free;
+
+ ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+ if (trx_undo_left(undo_page, ptr) < 2 + 1 + 11 + 11) {
+
+ /* Not enough space for writing the general parameters */
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+ *ptr++ = TRX_UNDO_INSERT_REC;
+ ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no);
+ ptr += mach_dulint_write_much_compressed(ptr, index->table->id);
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the record
+ to be inserted in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ const dfield_t* field = dtuple_get_nth_field(clust_entry, i);
+ ulint flen = dfield_get_len(field);
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, dfield_get_data(field), flen);
+ ptr += flen;
+ }
+ }
+
+ return(trx_undo_page_set_next_prev_and_add(undo_page, ptr, mtr));
+}
+
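+/* Layout of an insert undo record, sketched from
+trx_undo_page_report_insert() above: [2-byte next-record offset]
+[1 byte TRX_UNDO_INSERT_REC][undo_no, much compressed][table id, much
+compressed] followed by, for each of the n_unique fields of the clustered
+index, a compressed length and (unless the field is SQL NULL) the field
+data, and finally the trailing 2-byte start offset written by
+trx_undo_page_set_next_prev_and_add(). */
+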
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+ trx_undo_rec_t* undo_rec, /*!< in: undo log record */
+ ulint* type, /*!< out: undo record type:
+ TRX_UNDO_INSERT_REC, ... */
+ ulint* cmpl_info, /*!< out: compiler info, relevant only
+ for update type records */
+ ibool* updated_extern, /*!< out: TRUE if we updated an
+					externally stored field */
+ undo_no_t* undo_no, /*!< out: undo log record number */
+ dulint* table_id) /*!< out: table id */
+{
+ byte* ptr;
+ ulint type_cmpl;
+
+ ptr = undo_rec + 2;
+
+ type_cmpl = mach_read_from_1(ptr);
+ ptr++;
+
+ if (type_cmpl & TRX_UNDO_UPD_EXTERN) {
+ *updated_extern = TRUE;
+ type_cmpl -= TRX_UNDO_UPD_EXTERN;
+ } else {
+ *updated_extern = FALSE;
+ }
+
+ *type = type_cmpl & (TRX_UNDO_CMPL_INFO_MULT - 1);
+ *cmpl_info = type_cmpl / TRX_UNDO_CMPL_INFO_MULT;
+
+ *undo_no = mach_dulint_read_much_compressed(ptr);
+ ptr += mach_dulint_get_much_compressed_size(*undo_no);
+
+ *table_id = mach_dulint_read_much_compressed(ptr);
+ ptr += mach_dulint_get_much_compressed_size(*table_id);
+
+ return(ptr);
+}
+
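+/* Decoding sketch (derived from trx_undo_rec_get_pars() above; constants
+kept symbolic on purpose): the byte following the 2-byte next-record offset
+packs the optional TRX_UNDO_UPD_EXTERN flag, the record type in the bits
+below TRX_UNDO_CMPL_INFO_MULT, and cmpl_info in the bits above it. For
+example, a byte equal to TRX_UNDO_UPD_EXIST_REC + 2 * TRX_UNDO_CMPL_INFO_MULT
+decodes to type == TRX_UNDO_UPD_EXIST_REC and cmpl_info == 2; if the
+TRX_UNDO_UPD_EXTERN bit is also set, updated_extern is returned as TRUE. */
+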
+/**********************************************************************//**
+Reads from an undo log record a stored column value.
+@return remaining part of undo log record after reading these values */
+static
+byte*
+trx_undo_rec_get_col_val(
+/*=====================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ byte** field, /*!< out: pointer to stored field */
+ ulint* len, /*!< out: length of the field, or UNIV_SQL_NULL */
+ ulint* orig_len)/*!< out: original length of the locally
+ stored part of an externally stored column, or 0 */
+{
+ *len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*len);
+
+ *orig_len = 0;
+
+ switch (*len) {
+ case UNIV_SQL_NULL:
+ *field = NULL;
+ break;
+ case UNIV_EXTERN_STORAGE_FIELD:
+ *orig_len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*orig_len);
+ *len = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*len);
+ *field = ptr;
+ ptr += *len;
+
+ ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+ ut_ad(*len > *orig_len);
+ /* @see dtuple_convert_big_rec() */
+ ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE * 2);
+ /* we do not have access to index->table here
+ ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP
+ || *len >= REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ */
+
+ *len += UNIV_EXTERN_STORAGE_FIELD;
+ break;
+ default:
+ *field = ptr;
+ if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+ ptr += *len - UNIV_EXTERN_STORAGE_FIELD;
+ } else {
+ ptr += *len;
+ }
+ }
+
+ return(ptr);
+}
+
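+/* The three encodings read above, sketched: a stored length equal to
+UNIV_SQL_NULL means no data bytes follow and *field is returned as NULL; a
+UNIV_EXTERN_STORAGE_FIELD marker is followed by the original length of the
+locally stored prefix, the length of the prefix actually stored in the undo
+log and the prefix bytes, and *len is returned with
+UNIV_EXTERN_STORAGE_FIELD added so that callers can recognize an externally
+stored column; any other value is a plain length, possibly with
+UNIV_EXTERN_STORAGE_FIELD added when no longer prefix was written (see
+trx_undo_page_report_modify_ext() below). */
+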
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+ byte* ptr, /*!< in: remaining part of a copy of an undo log
+ record, at the start of the row reference;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the row reference is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** ref, /*!< out, own: row reference */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr && ref && heap);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ *ref = dtuple_create(heap, ref_len);
+
+ dict_index_copy_types(*ref, index, ref_len);
+
+ for (i = 0; i < ref_len; i++) {
+ dfield_t* dfield;
+ byte* field;
+ ulint len;
+ ulint orig_len;
+
+ dfield = dtuple_get_nth_field(*ref, i);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield_set_data(dfield, field, len);
+ }
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, at the start of the row reference */
+ dict_index_t* index) /*!< in: clustered index */
+{
+ ulint ref_len;
+ ulint i;
+
+ ut_ad(index && ptr);
+ ut_a(dict_index_is_clust(index));
+
+ ref_len = dict_index_get_n_unique(index);
+
+ for (i = 0; i < ref_len; i++) {
+ byte* field;
+ ulint len;
+ ulint orig_len;
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Fetch a prefix of an externally stored column, for writing to the undo log
+of an update or delete marking of a clustered index record.
+@return ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+/*====================*/
+ byte* ext_buf, /*!< in: a buffer of
+ REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE */
+ ulint zip_size, /*!< compressed page size in bytes,
+ or 0 for uncompressed BLOB */
+ const byte* field, /*!< in: an externally stored column */
+ ulint* len) /*!< in: length of field;
+ out: used length of ext_buf */
+{
+ /* Fetch the BLOB. */
+ ulint ext_len = btr_copy_externally_stored_field_prefix(
+ ext_buf, REC_MAX_INDEX_COL_LEN, zip_size, field, *len);
+ /* BLOBs should always be nonempty. */
+ ut_a(ext_len);
+ /* Append the BLOB pointer to the prefix. */
+ memcpy(ext_buf + ext_len,
+ field + *len - BTR_EXTERN_FIELD_REF_SIZE,
+ BTR_EXTERN_FIELD_REF_SIZE);
+ *len = ext_len + BTR_EXTERN_FIELD_REF_SIZE;
+ return(ext_buf);
+}
+
+/**********************************************************************//**
+Writes to the undo log a prefix of an externally stored column.
+@return undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+/*============================*/
+ byte* ptr, /*!< in: undo log position,
+ at least 15 bytes must be available */
+ byte* ext_buf, /*!< in: a buffer of
+ REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE,
+					or NULL when a longer prefix
+					should not be fetched */
+ ulint zip_size, /*!< compressed page size in bytes,
+ or 0 for uncompressed BLOB */
+ const byte** field, /*!< in/out: the locally stored part of
+ the externally stored column */
+ ulint* len) /*!< in/out: length of field, in bytes */
+{
+ if (ext_buf) {
+ /* If an ordering column is externally stored, we will
+ have to store a longer prefix of the field. In this
+ case, write to the log a marker followed by the
+ original length and the real length of the field. */
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+
+ ptr += mach_write_compressed(ptr, *len);
+
+ *field = trx_undo_page_fetch_ext(ext_buf, zip_size,
+ *field, len);
+
+ ptr += mach_write_compressed(ptr, *len);
+ } else {
+ ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD
+ + *len);
+ }
+
+ return(ptr);
+}
+
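+/* Encoding written above, sketched: when ext_buf is given, the column is
+logged as the UNIV_EXTERN_STORAGE_FIELD marker, the original field length,
+and the length of the fetched prefix (the prefix bytes themselves, including
+the appended BLOB pointer, are copied by the caller); when ext_buf is NULL,
+a single compressed value equal to UNIV_EXTERN_STORAGE_FIELD + *len is
+written and the caller copies only the locally stored part. This is the
+write-side counterpart of trx_undo_rec_get_col_val() above. */
+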
+/**********************************************************************//**
+Reports in the undo log of an update or delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page on
+success, 0 on failure */
+static
+ulint
+trx_undo_page_report_modify(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log page */
+ trx_t* trx, /*!< in: transaction */
+ dict_index_t* index, /*!< in: clustered index where update or
+ delete marking is done */
+ const rec_t* rec, /*!< in: clustered index record which
+ has NOT yet been modified */
+ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */
+ const upd_t* update, /*!< in: update vector which tells the
+ columns to be updated; in the case of
+ a delete, this should be set to NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ dict_table_t* table;
+ ulint first_free;
+ byte* ptr;
+ const byte* field;
+ ulint flen;
+ ulint col_no;
+ ulint type_cmpl;
+ byte* type_cmpl_ptr;
+ ulint i;
+ trx_id_t trx_id;
+ ibool ignore_prefix = FALSE;
+ byte ext_buf[REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE];
+
+ ut_a(dict_index_is_clust(index));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+ ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
+ table = index->table;
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ ptr = undo_page + first_free;
+
+ ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+ if (trx_undo_left(undo_page, ptr) < 50) {
+
+ /* NOTE: the value 50 must be big enough so that the general
+ fields written below fit on the undo log page */
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes for the pointer to the next undo log record */
+ ptr += 2;
+
+ /* Store first some general parameters to the undo log */
+
+ if (!update) {
+ type_cmpl = TRX_UNDO_DEL_MARK_REC;
+ } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+ type_cmpl = TRX_UNDO_UPD_DEL_REC;
+ /* We are about to update a delete marked record.
+ We don't typically need the prefix in this case unless
+ the delete marking is done by the same transaction
+ (which we check below). */
+ ignore_prefix = TRUE;
+ } else {
+ type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+ }
+
+ type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+ type_cmpl_ptr = ptr;
+
+ *ptr++ = (byte) type_cmpl;
+ ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no);
+
+ ptr += mach_dulint_write_much_compressed(ptr, table->id);
+
+ /*----------------------------------------*/
+ /* Store the state of the info bits */
+
+ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+ /* Store the values of the system columns */
+ field = rec_get_nth_field(rec, offsets,
+ dict_index_get_sys_col_pos(
+ index, DATA_TRX_ID), &flen);
+ ut_ad(flen == DATA_TRX_ID_LEN);
+
+ trx_id = trx_read_trx_id(field);
+
+	/* If this is an update of a delete marked record, we may
+	ignore the blob prefixes when the delete marking was done by
+	some other trx, since that trx must have committed by now for
+	the overwrite to be allowed. */
+ if (ignore_prefix) {
+ ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0;
+ }
+ ptr += mach_dulint_write_compressed(ptr, trx_id);
+
+ field = rec_get_nth_field(rec, offsets,
+ dict_index_get_sys_col_pos(
+ index, DATA_ROLL_PTR), &flen);
+ ut_ad(flen == DATA_ROLL_PTR_LEN);
+
+ ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field));
+
+ /*----------------------------------------*/
+ /* Store then the fields required to uniquely determine the
+ record which will be modified in the clustered index */
+
+ for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+ field = rec_get_nth_field(rec, offsets, i, &flen);
+
+ /* The ordering columns must not be stored externally. */
+ ut_ad(!rec_offs_nth_extern(offsets, i));
+ ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, flen);
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+
+ /*----------------------------------------*/
+ /* Save to the undo log the old values of the columns to be updated. */
+
+ if (update) {
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, upd_get_n_fields(update));
+
+ for (i = 0; i < upd_get_n_fields(update); i++) {
+
+ ulint pos = upd_get_nth_field(update, i)->field_no;
+
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_field(rec, offsets, pos, &flen);
+
+ if (trx_undo_left(undo_page, ptr) < 15) {
+
+ return(0);
+ }
+
+ if (rec_offs_nth_extern(offsets, pos)) {
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ dict_index_get_nth_col(index, pos)
+ ->ord_part
+ && !ignore_prefix
+ && flen < REC_MAX_INDEX_COL_LEN
+ ? ext_buf : NULL,
+ dict_table_zip_size(table),
+ &field, &flen);
+
+ /* Notify purge that it eventually has to
+ free the old externally stored field */
+
+ trx->update_undo->del_marks = TRUE;
+
+ *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+ } else {
+ ptr += mach_write_compressed(ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr) < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ /*----------------------------------------*/
+	/* In the case of a delete marking, and also in the case of an update
+	where any ordering field of any index changes, store the values of all
+	columns which occur as ordering fields in any index. This info is used
+	in the purge of old versions, where we use it to build and search the
+	delete marked index records, to check whether we can remove them from
+	the index tree. Note that starting from 4.0.14 externally stored fields
+	can also be ordering fields in some index. Starting from 5.2, we no
+	longer store the first REC_MAX_INDEX_COL_LEN bytes to the undo log
+	record, but we can construct the column prefix fields in the index by
+	fetching the first page of the BLOB that is pointed to by the
+	clustered index. This works also in crash recovery, because all pages
+	(including BLOBs) are recovered before anything is rolled back. */
+
+ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+ byte* old_ptr = ptr;
+
+ trx->update_undo->del_marks = TRUE;
+
+ if (trx_undo_left(undo_page, ptr) < 5) {
+
+ return(0);
+ }
+
+ /* Reserve 2 bytes to write the number of bytes the stored
+ fields take in this undo record */
+
+ ptr += 2;
+
+ for (col_no = 0; col_no < dict_table_get_n_cols(table);
+ col_no++) {
+
+ const dict_col_t* col
+ = dict_table_get_nth_col(table, col_no);
+
+ if (col->ord_part) {
+ ulint pos;
+
+ /* Write field number to undo log */
+ if (trx_undo_left(undo_page, ptr) < 5 + 15) {
+
+ return(0);
+ }
+
+ pos = dict_index_get_nth_col_pos(index,
+ col_no);
+ ptr += mach_write_compressed(ptr, pos);
+
+ /* Save the old value of field */
+ field = rec_get_nth_field(rec, offsets, pos,
+ &flen);
+
+ if (rec_offs_nth_extern(offsets, pos)) {
+ ptr = trx_undo_page_report_modify_ext(
+ ptr,
+ flen < REC_MAX_INDEX_COL_LEN
+ && !ignore_prefix
+ ? ext_buf : NULL,
+ dict_table_zip_size(table),
+ &field, &flen);
+ } else {
+ ptr += mach_write_compressed(
+ ptr, flen);
+ }
+
+ if (flen != UNIV_SQL_NULL) {
+ if (trx_undo_left(undo_page, ptr)
+ < flen) {
+
+ return(0);
+ }
+
+ ut_memcpy(ptr, field, flen);
+ ptr += flen;
+ }
+ }
+ }
+
+ mach_write_to_2(old_ptr, ptr - old_ptr);
+ }
+
+ /*----------------------------------------*/
+ /* Write pointers to the previous and the next undo log records */
+ if (trx_undo_left(undo_page, ptr) < 2) {
+
+ return(0);
+ }
+
+ mach_write_to_2(ptr, first_free);
+ ptr += 2;
+ mach_write_to_2(undo_page + first_free, ptr - undo_page);
+
+ mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+ ptr - undo_page);
+
+ /* Write to the REDO log about this change in the UNDO log */
+
+ trx_undof_page_add_undo_rec_log(undo_page, first_free,
+ ptr - undo_page, mtr);
+ return(first_free);
+}
+
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+ byte* ptr, /*!< in: remaining part of undo
+ log record after reading
+ general parameters */
+ trx_id_t* trx_id, /*!< out: trx id */
+ roll_ptr_t* roll_ptr, /*!< out: roll ptr */
+ ulint* info_bits) /*!< out: info bits state */
+{
+ /* Read the state of the info bits */
+ *info_bits = mach_read_from_1(ptr);
+ ptr += 1;
+
+ /* Read the values of the system columns */
+
+ *trx_id = mach_dulint_read_compressed(ptr);
+ ptr += mach_dulint_get_compressed_size(*trx_id);
+
+ *roll_ptr = mach_dulint_read_compressed(ptr);
+ ptr += mach_dulint_get_compressed_size(*roll_ptr);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an update undo log record the number of updated fields.
+@return remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_n_upd_fields(
+/*=================================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ ulint* n) /*!< out: number of fields */
+{
+ *n = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*n);
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+Reads from an update undo log record a stored field number.
+@return remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_field_no(
+/*=============================*/
+ byte* ptr, /*!< in: pointer to remaining part of undo log record */
+ ulint* field_no)/*!< out: field number */
+{
+ *field_no = mach_read_compressed(ptr);
+ ptr += mach_get_compressed_size(*field_no);
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, or NULL if an error is detected,
+which means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record, after reading the row reference
+ NOTE that this copy of the undo log record must
+ be preserved as long as the update vector is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint type, /*!< in: TRX_UNDO_UPD_EXIST_REC,
+ TRX_UNDO_UPD_DEL_REC, or
+ TRX_UNDO_DEL_MARK_REC; in the last case,
+ only trx id and roll ptr fields are added to
+ the update vector */
+ trx_id_t trx_id, /*!< in: transaction id from this undo record */
+ roll_ptr_t roll_ptr,/*!< in: roll pointer from this undo record */
+ ulint info_bits,/*!< in: info bits from this undo record */
+ trx_t* trx, /*!< in: transaction */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ upd_t** upd) /*!< out, own: update vector */
+{
+ upd_field_t* upd_field;
+ upd_t* update;
+ ulint n_fields;
+ byte* buf;
+ ulint i;
+
+ ut_a(dict_index_is_clust(index));
+
+ if (type != TRX_UNDO_DEL_MARK_REC) {
+ ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields);
+ } else {
+ n_fields = 0;
+ }
+
+ update = upd_create(n_fields + 2, heap);
+
+ update->info_bits = info_bits;
+
+ /* Store first trx id and roll ptr to update vector */
+
+ upd_field = upd_get_nth_field(update, n_fields);
+ buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+ trx_write_trx_id(buf, trx_id);
+
+ upd_field_set_field_no(upd_field,
+ dict_index_get_sys_col_pos(index, DATA_TRX_ID),
+ index, trx);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+ upd_field = upd_get_nth_field(update, n_fields + 1);
+ buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(buf, roll_ptr);
+
+ upd_field_set_field_no(
+ upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR),
+ index, trx);
+ dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+ /* Store then the updated ordinary columns to the update vector */
+
+ for (i = 0; i < n_fields; i++) {
+
+ byte* field;
+ ulint len;
+ ulint field_no;
+ ulint orig_len;
+
+ ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+ if (field_no >= dict_index_get_n_fields(index)) {
+ fprintf(stderr,
+ "InnoDB: Error: trying to access"
+ " update undo rec field %lu in ",
+ (ulong) field_no);
+ dict_index_name_print(stderr, trx, index);
+ fprintf(stderr, "\n"
+ "InnoDB: but index has only %lu fields\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n"
+ "InnoDB: Run also CHECK TABLE ",
+ (ulong) dict_index_get_n_fields(index));
+ ut_print_name(stderr, trx, TRUE, index->table_name);
+ fprintf(stderr, "\n"
+ "InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
+ (ulong) n_fields, (ulong) i, ptr);
+ *upd = NULL;
+ return(NULL);
+ }
+
+ upd_field = upd_get_nth_field(update, i);
+
+ upd_field_set_field_no(upd_field, field_no, index, trx);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ upd_field->orig_len = orig_len;
+
+ if (len == UNIV_SQL_NULL) {
+ dfield_set_null(&upd_field->new_val);
+ } else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_data(&upd_field->new_val, field, len);
+ } else {
+ len -= UNIV_EXTERN_STORAGE_FIELD;
+
+ dfield_set_data(&upd_field->new_val, field, len);
+ dfield_set_ext(&upd_field->new_val);
+ }
+ }
+
+ *upd = update;
+
+ return(ptr);
+}
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering fields in any index of the table.
+@return pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+ byte* ptr, /*!< in: remaining part in update undo log
+ record of a suitable type, at the start of
+ the stored index columns;
+ NOTE that this copy of the undo log record must
+ be preserved as long as the partial row is
+ used, as we do NOT copy the data in the
+ record! */
+ dict_index_t* index, /*!< in: clustered index */
+ dtuple_t** row, /*!< out, own: partial row */
+ ibool ignore_prefix, /*!< in: flag to indicate if we
+ expect blob prefixes in undo. Used
+ only in the assertion. */
+ mem_heap_t* heap) /*!< in: memory heap from which the memory
+ needed is allocated */
+{
+ const byte* end_ptr;
+ ulint row_len;
+
+ ut_ad(index);
+ ut_ad(ptr);
+ ut_ad(row);
+ ut_ad(heap);
+ ut_ad(dict_index_is_clust(index));
+
+ row_len = dict_table_get_n_cols(index->table);
+
+ *row = dtuple_create(heap, row_len);
+
+ dict_table_copy_types(*row, index->table);
+
+ end_ptr = ptr + mach_read_from_2(ptr);
+ ptr += 2;
+
+ while (ptr != end_ptr) {
+ dfield_t* dfield;
+ byte* field;
+ ulint field_no;
+ const dict_col_t* col;
+ ulint col_no;
+ ulint len;
+ ulint orig_len;
+
+ ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+ col = dict_index_get_nth_col(index, field_no);
+ col_no = dict_col_get_no(col);
+
+ ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+ dfield = dtuple_get_nth_field(*row, col_no);
+
+ dfield_set_data(dfield, field, len);
+
+ if (len != UNIV_SQL_NULL
+ && len >= UNIV_EXTERN_STORAGE_FIELD) {
+ dfield_set_len(dfield,
+ len - UNIV_EXTERN_STORAGE_FIELD);
+ dfield_set_ext(dfield);
+ /* If the prefix of this column is indexed,
+ ensure that enough prefix is stored in the
+ undo log record. */
+ if (!ignore_prefix && col->ord_part) {
+ ut_a(dfield_get_len(dfield)
+ >= 2 * BTR_EXTERN_FIELD_REF_SIZE);
+ ut_a(dict_table_get_format(index->table)
+ >= DICT_TF_FORMAT_ZIP
+ || dfield_get_len(dfield)
+ >= REC_MAX_INDEX_COL_LEN
+ + BTR_EXTERN_FIELD_REF_SIZE);
+ }
+ }
+ }
+
+ return(ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Erases the unused undo log page end. */
+static
+void
+trx_undo_erase_page_end(
+/*====================*/
+ page_t* undo_page, /*!< in: undo page whose end to erase */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint first_free;
+
+ first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE);
+ memset(undo_page + first_free, 0xff,
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) - first_free);
+
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+}
+
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(ptr && end_ptr);
+
+ if (page == NULL) {
+
+ return(ptr);
+ }
+
+ trx_undo_erase_page_end(page, mtr);
+
+ return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+ ulint flags, /*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+ set, does nothing */
+ ulint op_type, /*!< in: TRX_UNDO_INSERT_OP or
+ TRX_UNDO_MODIFY_OP */
+ que_thr_t* thr, /*!< in: query thread */
+ dict_index_t* index, /*!< in: clustered index */
+ const dtuple_t* clust_entry, /*!< in: in the case of an insert,
+ index entry to insert into the
+ clustered index, otherwise NULL */
+ const upd_t* update, /*!< in: in the case of an update,
+ the update vector, otherwise NULL */
+ ulint cmpl_info, /*!< in: compiler info on secondary
+ index updates */
+ const rec_t* rec, /*!< in: in case of an update or delete
+ marking, the record in the clustered
+ index, otherwise NULL */
+ roll_ptr_t* roll_ptr) /*!< out: rollback pointer to the
+ inserted undo log record,
+ ut_dulint_zero if BTR_NO_UNDO_LOG
+ flag was specified */
+{
+ trx_t* trx;
+ trx_undo_t* undo;
+ ulint page_no;
+ trx_rseg_t* rseg;
+ mtr_t mtr;
+ ulint err = DB_SUCCESS;
+ mem_heap_t* heap = NULL;
+ ulint offsets_[REC_OFFS_NORMAL_SIZE];
+ ulint* offsets = offsets_;
+ rec_offs_init(offsets_);
+
+ ut_a(dict_index_is_clust(index));
+
+ if (flags & BTR_NO_UNDO_LOG_FLAG) {
+
+ *roll_ptr = ut_dulint_zero;
+
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(thr);
+ ut_ad((op_type != TRX_UNDO_INSERT_OP)
+ || (clust_entry && !update && !rec));
+
+ trx = thr_get_trx(thr);
+ rseg = trx->rseg;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ /* If the undo log is not assigned yet, assign one */
+
+ if (op_type == TRX_UNDO_INSERT_OP) {
+
+ if (trx->insert_undo == NULL) {
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
+ }
+
+ undo = trx->insert_undo;
+
+ if (UNIV_UNLIKELY(!undo)) {
+ /* Did not succeed */
+ mutex_exit(&(trx->undo_mutex));
+
+ return(err);
+ }
+ } else {
+ ut_ad(op_type == TRX_UNDO_MODIFY_OP);
+
+ if (trx->update_undo == NULL) {
+
+ err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+ }
+
+ undo = trx->update_undo;
+
+ if (UNIV_UNLIKELY(!undo)) {
+ /* Did not succeed */
+ mutex_exit(&(trx->undo_mutex));
+ return(err);
+ }
+
+ offsets = rec_get_offsets(rec, index, offsets,
+ ULINT_UNDEFINED, &heap);
+ }
+
+ page_no = undo->last_page_no;
+
+ mtr_start(&mtr);
+
+ for (;;) {
+ buf_block_t* undo_block;
+ page_t* undo_page;
+ ulint offset;
+
+ undo_block = buf_page_get_gen(undo->space, undo->zip_size,
+ page_no, RW_X_LATCH,
+ undo->guess_block, BUF_GET,
+ __FILE__, __LINE__, &mtr);
+ buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+
+ undo_page = buf_block_get_frame(undo_block);
+
+ if (op_type == TRX_UNDO_INSERT_OP) {
+ offset = trx_undo_page_report_insert(
+ undo_page, trx, index, clust_entry, &mtr);
+ } else {
+ offset = trx_undo_page_report_modify(
+ undo_page, trx, index, rec, offsets, update,
+ cmpl_info, &mtr);
+ }
+
+ if (UNIV_UNLIKELY(offset == 0)) {
+ /* The record did not fit on the page. We erase the
+ end segment of the undo log page and write a log
+			record of it: this is to ensure that in the debug
+			version the replica page constructed from the log
+			records stays identical to the original page */
+
+ trx_undo_erase_page_end(undo_page, &mtr);
+ mtr_commit(&mtr);
+ } else {
+ /* Success */
+
+ mtr_commit(&mtr);
+
+ undo->empty = FALSE;
+ undo->top_page_no = page_no;
+ undo->top_offset = offset;
+ undo->top_undo_no = trx->undo_no;
+ undo->guess_block = undo_block;
+
+ UT_DULINT_INC(trx->undo_no);
+
+ mutex_exit(&trx->undo_mutex);
+
+ *roll_ptr = trx_undo_build_roll_ptr(
+ op_type == TRX_UNDO_INSERT_OP,
+ rseg->id, page_no, offset);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(DB_SUCCESS);
+ }
+
+ ut_ad(page_no == undo->last_page_no);
+
+ /* We have to extend the undo log by one page */
+
+ mtr_start(&mtr);
+
+ /* When we add a page to an undo log, this is analogous to
+ a pessimistic insert in a B-tree, and we must reserve the
+ counterpart of the tree latch, which is the rseg mutex. */
+
+ mutex_enter(&(rseg->mutex));
+
+ page_no = trx_undo_add_page(trx, undo, &mtr);
+
+ mutex_exit(&(rseg->mutex));
+
+ if (UNIV_UNLIKELY(page_no == FIL_NULL)) {
+ /* Did not succeed: out of space */
+
+ mutex_exit(&(trx->undo_mutex));
+ mtr_commit(&mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+ mem_heap_free(heap);
+ }
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+ }
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_rec_t* undo_rec;
+ ulint rseg_id;
+ ulint page_no;
+ ulint offset;
+ const page_t* undo_page;
+ trx_rseg_t* rseg;
+ ibool is_insert;
+ mtr_t mtr;
+
+ trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id, &page_no,
+ &offset);
+ rseg = trx_rseg_get_on_id(rseg_id);
+
+ mtr_start(&mtr);
+
+ undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+ page_no, &mtr);
+
+ undo_rec = trx_undo_rec_copy(undo_page + offset, heap);
+
+ mtr_commit(&mtr);
+
+ return(undo_rec);
+}
+
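+/* For orientation (sketch only; the authoritative definitions are in the
+trx0rec/trx0undo headers): a roll pointer packs the is_insert flag, the
+rollback segment id, the undo page number and the byte offset within that
+page. These are exactly the components that trx_undo_decode_roll_ptr()
+extracts above and that trx_undo_build_roll_ptr() packs in
+trx_purge_fetch_next_rec() and trx_undo_report_row_operation(). */
+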
+/******************************************************************//**
+Copies an undo record to heap.
+
+NOTE: the caller must have latches on the clustered index page and
+purge_view.
+
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
+truncated and we cannot fetch the old version */
+UNIV_INTERN
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+ roll_ptr_t roll_ptr, /*!< in: roll pointer to record */
+ trx_id_t trx_id, /*!< in: id of the trx that generated
+ the roll pointer: it points to an
+ undo log of this transaction */
+ trx_undo_rec_t** undo_rec, /*!< out, own: copy of the record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+ if (!trx_purge_update_undo_must_exist(trx_id)) {
+
+ /* It may be that the necessary undo log has already been
+ deleted */
+
+ return(DB_MISSING_HISTORY);
+ }
+
+ *undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked all the way down to the purge_view.
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
+earlier than purge_view, which means that it may have been removed,
+DB_ERROR if corrupted record */
+UNIV_INTERN
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+ const rec_t* index_rec,/*!< in: clustered index record in the
+ index tree */
+ mtr_t* index_mtr __attribute__((unused)),
+ /*!< in: mtr which contains the latch to
+ index_rec page and purge_view */
+ const rec_t* rec, /*!< in: version of a clustered index record */
+ dict_index_t* index, /*!< in: clustered index */
+ ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
+ mem_heap_t* heap, /*!< in: memory heap from which the memory
+ needed is allocated */
+ rec_t** old_vers)/*!< out, own: previous version, or NULL if
+ rec is the first inserted version, or if
+ history data has been deleted (an error),
+ or if the purge COULD have removed the version
+ though it has not yet done so */
+{
+ trx_undo_rec_t* undo_rec = NULL;
+ dtuple_t* entry;
+ trx_id_t rec_trx_id;
+ ulint type;
+ undo_no_t undo_no;
+ dulint table_id;
+ trx_id_t trx_id;
+ roll_ptr_t roll_ptr;
+ roll_ptr_t old_roll_ptr;
+ upd_t* update;
+ byte* ptr;
+ ulint info_bits;
+ ulint cmpl_info;
+ ibool dummy_extern;
+ byte* buf;
+ ulint err;
+#ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+ ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX)
+ || mtr_memo_contains_page(index_mtr, index_rec,
+ MTR_MEMO_PAGE_X_FIX));
+ ut_ad(rec_offs_validate(rec, index, offsets));
+
+ if (!dict_index_is_clust(index)) {
+ fprintf(stderr, "InnoDB: Error: trying to access"
+ " update undo rec for non-clustered index %s\n"
+ "InnoDB: Submit a detailed bug report to"
+ " http://bugs.mysql.com\n"
+ "InnoDB: index record ", index->name);
+ rec_print(stderr, index_rec, index);
+ fputs("\n"
+ "InnoDB: record version ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ putc('\n', stderr);
+ return(DB_ERROR);
+ }
+
+ roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+ old_roll_ptr = roll_ptr;
+
+ *old_vers = NULL;
+
+ if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+ /* The record rec is the first inserted version */
+
+ return(DB_SUCCESS);
+ }
+
+ rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+ err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap);
+
+ if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+ /* The undo record may already have been purged.
+ This should never happen in InnoDB. */
+
+ return(err);
+ }
+
+ ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+ &dummy_extern, &undo_no, &table_id);
+
+ ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+ &info_bits);
+
+ /* (a) If a clustered index record version is such that the
+ trx id stamp in it is bigger than purge_sys->view, then the
+ BLOBs in that version are known to exist (the purge has not
+ progressed that far);
+
+ (b) if the version is the first version such that trx id in it
+ is less than purge_sys->view, and it is not delete-marked,
+ then the BLOBs in that version are known to exist (the purge
+ cannot have purged the BLOBs referenced by that version
+ yet).
+
+ This function does not fetch any BLOBs. The callers might, by
+ possibly invoking row_ext_create() via row_build(). However,
+ they should have all needed information in the *old_vers
+ returned by this function. This is because *old_vers is based
+ on the transaction undo log records. The function
+ trx_undo_page_fetch_ext() will write BLOB prefixes to the
+ transaction undo log that are at least as long as the longest
+ possible column prefix in a secondary index. Thus, secondary
+ index entries for *old_vers can be constructed without
+ dereferencing any BLOB pointers. */
+
+ ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+ ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+ roll_ptr, info_bits,
+ NULL, heap, &update);
+
+ if (ut_dulint_cmp(table_id, index->table->id) != 0) {
+ ptr = NULL;
+
+ fprintf(stderr,
+ "InnoDB: Error: trying to access update undo rec"
+ " for table %s\n"
+ "InnoDB: but the table id in the"
+ " undo record is wrong\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com\n"
+ "InnoDB: Run also CHECK TABLE %s\n",
+ index->table_name, index->table_name);
+ }
+
+ if (ptr == NULL) {
+ /* The record was corrupted, return an error; these printfs
+ should catch an elusive bug in row_vers_old_has_index_entry */
+
+ fprintf(stderr,
+ "InnoDB: table %s, index %s, n_uniq %lu\n"
+ "InnoDB: undo rec address %p, type %lu cmpl_info %lu\n"
+ "InnoDB: undo rec table id %lu %lu,"
+ " index table id %lu %lu\n"
+ "InnoDB: dump of 150 bytes in undo rec: ",
+ index->table_name, index->name,
+ (ulong) dict_index_get_n_unique(index),
+ undo_rec, (ulong) type, (ulong) cmpl_info,
+ (ulong) ut_dulint_get_high(table_id),
+ (ulong) ut_dulint_get_low(table_id),
+ (ulong) ut_dulint_get_high(index->table->id),
+ (ulong) ut_dulint_get_low(index->table->id));
+ ut_print_buf(stderr, undo_rec, 150);
+ fputs("\n"
+ "InnoDB: index record ", stderr);
+ rec_print(stderr, index_rec, index);
+ fputs("\n"
+ "InnoDB: record version ", stderr);
+ rec_print_new(stderr, rec, offsets);
+ fprintf(stderr, "\n"
+ "InnoDB: Record trx id " TRX_ID_FMT
+ ", update rec trx id " TRX_ID_FMT "\n"
+ "InnoDB: Roll ptr in rec %lu %lu, in update rec"
+ " %lu %lu\n",
+ TRX_ID_PREP_PRINTF(rec_trx_id),
+ TRX_ID_PREP_PRINTF(trx_id),
+ (ulong) ut_dulint_get_high(old_roll_ptr),
+ (ulong) ut_dulint_get_low(old_roll_ptr),
+ (ulong) ut_dulint_get_high(roll_ptr),
+ (ulong) ut_dulint_get_low(roll_ptr));
+
+ trx_purge_sys_print();
+ return(DB_ERROR);
+ }
+
+ if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+ ulint n_ext;
+
+ /* We have to set the appropriate extern storage bits in the
+ old version of the record: the extern bits in rec for those
+ fields that update does NOT update, as well as the bits for
+ those fields that update updates to become externally stored
+ fields. Store the info: */
+
+ entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index,
+ offsets, &n_ext, heap);
+ n_ext += btr_push_update_extern_fields(entry, update, heap);
+ /* The page containing the clustered index record
+ corresponding to entry is latched in mtr. Thus the
+ following call is safe. */
+ row_upd_index_replace_new_col_vals(entry, index, update, heap);
+
+ buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry,
+ n_ext));
+
+ *old_vers = rec_convert_dtuple_to_rec(buf, index,
+ entry, n_ext);
+ } else {
+ buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+ *old_vers = rec_copy(buf, rec, offsets);
+ rec_offs_make_valid(*old_vers, index, offsets);
+ row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
+ }
+
+ return(DB_SUCCESS);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.c
new file mode 100644
index 00000000000..c925478cdf4
--- /dev/null
+++ b/storage/xtradb/trx/trx0roll.c
@@ -0,0 +1,1366 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.c
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#ifdef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "usr0sess.h"
+#include "srv0que.h"
+#include "srv0start.h"
+#include "row0undo.h"
+#include "row0mysql.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+
+/** This many pages must be undone before a truncate is tried within
+rollback */
+#define TRX_ROLL_TRUNC_THRESHOLD 1
+
+/** In crash recovery, the current trx to be rolled back */
+static trx_t* trx_roll_crash_recv_trx = NULL;
+
+/** In crash recovery we set this to the undo n:o of the current trx to be
+rolled back. Then we can print the percentage of the rollback already done. */
+static ib_int64_t trx_roll_max_undo_no;
+
+/** Auxiliary variable which tells the previous progress % we printed */
+static ulint trx_roll_progress_printed_pct;
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_savept_t* savept) /*!< in: pointer to savepoint undo number, if
+ partial rollback requested, or NULL for
+ complete rollback */
+{
+ mem_heap_t* heap;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+
+ /* Tell Innobase server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ trx_start_if_not_started(trx);
+
+ heap = mem_heap_create(512);
+
+ roll_node = roll_node_create(heap);
+
+ if (savept) {
+ roll_node->partial = TRUE;
+ roll_node->savept = *savept;
+ }
+
+ trx->error_state = DB_SUCCESS;
+
+ thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+
+ ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+ que_run_threads(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ while (trx->que_state != TRX_QUE_RUNNING) {
+
+ mutex_exit(&kernel_mutex);
+
+ os_thread_sleep(100000);
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ mem_heap_free(heap);
+
+ ut_a(trx->error_state == DB_SUCCESS);
+
+ /* Tell Innobase server that there might be work for
+ utility threads: */
+
+ srv_active_wake_master_thread();
+
+ return((int) trx->error_state);
+}
+
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ int err;
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "rollback";
+
+ /* If we are doing the XA recovery of prepared transactions, then
+ the transaction object does not have an InnoDB session object, and we
+ set a dummy session that we use for all MySQL transactions. */
+
+ err = trx_general_rollback_for_mysql(trx, NULL);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ int err;
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ return(DB_SUCCESS);
+ }
+
+ trx->op_info = "rollback of SQL statement";
+
+ err = trx_general_rollback_for_mysql(trx, &trx->last_sql_stat_start);
+ /* The following call should not be needed, but we play safe: */
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: savepoint to free */
+{
+ ut_a(savep != NULL);
+ ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0);
+
+ UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+ mem_free(savep->name);
+ mem_free(savep);
+}
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep; if savep == NULL, then
+all savepoints of the transaction are freed. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ trx_named_savept_t* savep) /*!< in: free all savepoints > this one;
+ if this is NULL, free all savepoints
+ of trx */
+{
+ trx_named_savept_t* next_savep;
+
+ if (savep == NULL) {
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+ } else {
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ while (savep != NULL) {
+ next_savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+
+ trx_roll_savepoint_free(trx, savep);
+
+ savep = next_savep;
+ }
+}
+
+/*******************************************************************//**
+Rolls back a transaction to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t* mysql_binlog_cache_pos) /*!< out: the MySQL binlog cache
+ position corresponding to this
+ savepoint; MySQL needs this
+ information to remove the
+ binlog entries of the queries
+ executed after the savepoint */
+{
+ trx_named_savept_t* savep;
+ ulint err;
+
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+ while (savep != NULL) {
+ if (0 == ut_strcmp(savep->name, savepoint_name)) {
+ /* Found */
+ break;
+ }
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ if (savep == NULL) {
+
+ return(DB_NO_SAVEPOINT);
+ }
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: transaction has a savepoint ", stderr);
+ ut_print_name(stderr, trx, FALSE, savep->name);
+ fputs(" though it is not started\n", stderr);
+ return(DB_ERROR);
+ }
+
+ /* We can now free all savepoints strictly later than this one */
+
+ trx_roll_savepoints_free(trx, savep);
+
+ *mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+ trx->op_info = "rollback to a savepoint";
+
+ err = trx_general_rollback_for_mysql(trx, &savep->savept);
+
+ /* Store the current undo_no of the transaction so that we know where
+ to roll back if we have to roll back the next SQL statement: */
+
+ trx_mark_sql_stat_end(trx);
+
+ trx->op_info = "";
+
+ return(err);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted at transaction
+commit or rollback.
+@return always DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name, /*!< in: savepoint name */
+ ib_int64_t binlog_cache_pos) /*!< in: MySQL binlog cache
+ position corresponding to this
+ connection at the time of the
+ savepoint */
+{
+ trx_named_savept_t* savep;
+
+ ut_a(trx);
+ ut_a(savepoint_name);
+
+ trx_start_if_not_started(trx);
+
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+ while (savep != NULL) {
+ if (0 == ut_strcmp(savep->name, savepoint_name)) {
+ /* Found */
+ break;
+ }
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ if (savep) {
+ /* There is a savepoint with the same name: free that */
+
+ UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+
+ mem_free(savep->name);
+ mem_free(savep);
+ }
+
+ /* Create a new savepoint and add it as the last in the list */
+
+ savep = mem_alloc(sizeof(trx_named_savept_t));
+
+ savep->name = mem_strdup(savepoint_name);
+
+ savep->savept = trx_savept_take(trx);
+
+ savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+ UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep);
+
+ return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Releases only the named savepoint. Savepoints which were set after this
+savepoint are left as is.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+ trx_t* trx, /*!< in: transaction handle */
+ const char* savepoint_name) /*!< in: savepoint name */
+{
+ trx_named_savept_t* savep;
+
+ savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+ /* Search for the savepoint by name and free if found. */
+ while (savep != NULL) {
+ if (0 == ut_strcmp(savep->name, savepoint_name)) {
+ trx_roll_savepoint_free(trx, savep);
+ return(DB_SUCCESS);
+ }
+ savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+ }
+
+ return(DB_NO_SAVEPOINT);
+}
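+
+/* Illustrative sketch only (not part of the original source): a typical
+call sequence combining the three savepoint functions above, roughly as a
+caller in the MySQL handler layer would use them. The variables trx and
+binlog_cache_pos are assumed to be supplied by that caller; error handling
+is reduced to the bare minimum. */
+#if 0
+	ib_int64_t	binlog_pos;
+	ulint		err;
+
+	/* Create (or replace) the savepoint "sp1". */
+	err = trx_savepoint_for_mysql(trx, "sp1", binlog_cache_pos);
+	ut_a(err == DB_SUCCESS);
+
+	/* ... some statements are executed, then undone: ... */
+	err = trx_rollback_to_savepoint_for_mysql(trx, "sp1", &binlog_pos);
+
+	if (err == DB_NO_SAVEPOINT) {
+		/* The savepoint was never set, or was already released. */
+	}
+
+	/* Drop the savepoint without rolling anything back. */
+	err = trx_release_savepoint_for_mysql(trx, "sp1");
+#endif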
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+ const trx_t* trx) /*!< in: transaction */
+{
+ return(trx == trx_roll_crash_recv_trx);
+}
+
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_savept_t savept;
+
+ savept.least_undo_no = trx->undo_no;
+
+ return(savept);
+}
+
+/*******************************************************************//**
+Roll back an active transaction. */
+static
+void
+trx_rollback_active(
+/*================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ roll_node_t* roll_node;
+ dict_table_t* table;
+ ib_int64_t rows_to_undo;
+ const char* unit = "";
+ ibool dictionary_locked = FALSE;
+
+ heap = mem_heap_create(512);
+
+ fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+
+ roll_node = roll_node_create(heap);
+
+ thr->child = roll_node;
+ roll_node->common.parent = thr;
+
+ mutex_enter(&kernel_mutex);
+
+ trx->graph = fork;
+
+ ut_a(thr == que_fork_start_command(fork));
+
+ trx_roll_crash_recv_trx = trx;
+ trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no);
+ trx_roll_progress_printed_pct = 0;
+ rows_to_undo = trx_roll_max_undo_no;
+
+ if (rows_to_undo > 1000000000) {
+ rows_to_undo = rows_to_undo / 1000000;
+ unit = "M";
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
+ " rows to undo\n",
+ TRX_ID_PREP_PRINTF(trx->id),
+ (ulong) rows_to_undo, unit);
+ mutex_exit(&kernel_mutex);
+
+ trx->mysql_thread_id = os_thread_get_curr_id();
+
+ trx->mysql_process_no = os_proc_get_number();
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ row_mysql_lock_data_dictionary(trx);
+ dictionary_locked = TRUE;
+ }
+
+ que_run_threads(thr);
+
+ mutex_enter(&kernel_mutex);
+
+ while (trx->que_state != TRX_QUE_RUNNING) {
+
+ mutex_exit(&kernel_mutex);
+
+ fprintf(stderr,
+ "InnoDB: Waiting for rollback of trx id %lu to end\n",
+ (ulong) ut_dulint_get_low(trx->id));
+ os_thread_sleep(100000);
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+ && !ut_dulint_is_zero(trx->table_id)) {
+
+ /* If the transaction was for a dictionary operation, we
+ drop the relevant table, if it still exists */
+
+ fprintf(stderr,
+ "InnoDB: Dropping table with id %lu %lu"
+ " in recovery if it exists\n",
+ (ulong) ut_dulint_get_high(trx->table_id),
+ (ulong) ut_dulint_get_low(trx->table_id));
+
+ table = dict_table_get_on_id_low(trx->table_id);
+
+ if (table) {
+ ulint err;
+
+ fputs("InnoDB: Table found: dropping table ", stderr);
+ ut_print_name(stderr, trx, TRUE, table->name);
+ fputs(" in recovery\n", stderr);
+
+ err = row_drop_table_for_mysql(table->name, trx, TRUE);
+ trx_commit_for_mysql(trx);
+
+ ut_a(err == (int) DB_SUCCESS);
+ }
+ }
+
+ if (dictionary_locked) {
+ row_mysql_unlock_data_dictionary(trx);
+ }
+
+ fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT
+ " completed\n",
+ TRX_ID_PREP_PRINTF(trx->id));
+ mem_heap_free(heap);
+
+ trx_roll_crash_recv_trx = NULL;
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+ ibool all) /*!< in: FALSE=roll back dictionary transactions;
+ TRUE=roll back all non-PREPARED transactions */
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+
+ if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) {
+ goto leave_function;
+ }
+
+ if (all) {
+ fprintf(stderr,
+ "InnoDB: Starting in background the rollback"
+ " of uncommitted transactions\n");
+ }
+
+ mutex_exit(&kernel_mutex);
+
+loop:
+ mutex_enter(&kernel_mutex);
+
+ for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx;
+ trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+ if (!trx->is_recovered) {
+ continue;
+ }
+
+ switch (trx->conc_state) {
+ case TRX_NOT_STARTED:
+ case TRX_PREPARED:
+ continue;
+
+ case TRX_COMMITTED_IN_MEMORY:
+ mutex_exit(&kernel_mutex);
+ fprintf(stderr,
+ "InnoDB: Cleaning up trx with id "
+ TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(trx->id));
+ trx_cleanup_at_db_startup(trx);
+ goto loop;
+
+ case TRX_ACTIVE:
+ if (all || trx_get_dict_operation(trx)
+ != TRX_DICT_OP_NONE) {
+ mutex_exit(&kernel_mutex);
+ trx_rollback_active(trx);
+ goto loop;
+ }
+ }
+ }
+
+ if (all) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Rollback of non-prepared"
+ " transactions completed\n");
+ }
+
+leave_function:
+ mutex_exit(&kernel_mutex);
+}
+
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery. If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+ void* arg __attribute__((unused)))
+ /*!< in: a dummy parameter required by
+ os_thread_create */
+{
+ trx_rollback_or_clean_recovered(TRUE);
+
+ /* We count the number of threads in os_thread_exit(). A created
+ thread should always use that to exit and not use return() to exit. */
+
+ os_thread_exit(NULL);
+
+ OS_THREAD_DUMMY_RETURN;
+}
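+
+/* Illustrative sketch only: the function above is meant to run in a thread
+of its own; the startup code is expected to spawn it roughly as follows,
+assuming the os_thread_create() interface declared in os0thread.h. */
+#if 0
+	os_thread_create(trx_rollback_or_clean_all_recovered, NULL, NULL);
+#endif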
+
+/*******************************************************************//**
+Creates an undo number array.
+@return own: undo number array */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void)
+/*=====================*/
+{
+ trx_undo_arr_t* arr;
+ mem_heap_t* heap;
+ ulint i;
+
+ heap = mem_heap_create(1024);
+
+ arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t));
+
+ arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t)
+ * UNIV_MAX_PARALLELISM);
+ arr->n_cells = UNIV_MAX_PARALLELISM;
+ arr->n_used = 0;
+
+ arr->heap = heap;
+
+ for (i = 0; i < UNIV_MAX_PARALLELISM; i++) {
+
+ (trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE;
+ }
+
+ return(arr);
+}
+
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+ trx_undo_arr_t* arr) /*!< in: undo number array */
+{
+ ut_ad(arr->n_used == 0);
+
+ mem_heap_free(arr->heap);
+}
+
+/*******************************************************************//**
+Stores info of an undo log record in the array if it is not stored yet.
+@return FALSE if the record already existed in the array */
+static
+ibool
+trx_undo_arr_store_info(
+/*====================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_inf_t* cell;
+ trx_undo_inf_t* stored_here;
+ trx_undo_arr_t* arr;
+ ulint n_used;
+ ulint n;
+ ulint i;
+
+ n = 0;
+ arr = trx->undo_no_arr;
+ n_used = arr->n_used;
+ stored_here = NULL;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (!cell->in_use) {
+ if (!stored_here) {
+ /* Not in use, we may store here */
+ cell->undo_no = undo_no;
+ cell->in_use = TRUE;
+
+ arr->n_used++;
+
+ stored_here = cell;
+ }
+ } else {
+ n++;
+
+ if (0 == ut_dulint_cmp(cell->undo_no, undo_no)) {
+
+ if (stored_here) {
+ stored_here->in_use = FALSE;
+ ut_ad(arr->n_used > 0);
+ arr->n_used--;
+ }
+
+ ut_ad(arr->n_used == n_used);
+
+ return(FALSE);
+ }
+ }
+
+ if (n == n_used && stored_here) {
+
+ ut_ad(arr->n_used == 1 + n_used);
+
+ return(TRUE);
+ }
+ }
+}
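+
+/* Illustrative note: the array holds at most UNIV_MAX_PARALLELISM undo
+numbers that are being processed concurrently. A second attempt to store an
+undo number that is already present returns FALSE, which the caller (see
+trx_roll_pop_top_rec_of_trx() below) uses to detect that another query
+thread is already undoing that record. */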
+
+/*******************************************************************//**
+Removes an undo number from the array. */
+static
+void
+trx_undo_arr_remove_info(
+/*=====================*/
+ trx_undo_arr_t* arr, /*!< in: undo number array */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_inf_t* cell;
+ ulint n_used;
+ ulint n;
+ ulint i;
+
+ n_used = arr->n_used;
+ n = 0;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (cell->in_use
+ && 0 == ut_dulint_cmp(cell->undo_no, undo_no)) {
+
+ cell->in_use = FALSE;
+
+ ut_ad(arr->n_used > 0);
+
+ arr->n_used--;
+
+ return;
+ }
+ }
+}
+
+/*******************************************************************//**
+Gets the biggest undo number in an array.
+@return biggest value, ut_dulint_zero if the array is empty */
+static
+undo_no_t
+trx_undo_arr_get_biggest(
+/*=====================*/
+ trx_undo_arr_t* arr) /*!< in: undo number array */
+{
+ trx_undo_inf_t* cell;
+ ulint n_used;
+ undo_no_t biggest;
+ ulint n;
+ ulint i;
+
+ n = 0;
+ n_used = arr->n_used;
+ biggest = ut_dulint_zero;
+
+ for (i = 0;; i++) {
+ cell = trx_undo_arr_get_nth_info(arr, i);
+
+ if (cell->in_use) {
+ n++;
+ if (ut_dulint_cmp(cell->undo_no, biggest) > 0) {
+
+ biggest = cell->undo_no;
+ }
+ }
+
+ if (n == n_used) {
+ return(biggest);
+ }
+ }
+}
+
+/***********************************************************************//**
+Tries to truncate the undo logs. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+ trx_t* trx) /*!< in/out: transaction */
+{
+ trx_undo_arr_t* arr;
+ undo_no_t limit;
+ undo_no_t biggest;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&((trx->rseg)->mutex)));
+
+ trx->pages_undone = 0;
+
+ arr = trx->undo_no_arr;
+
+ limit = trx->undo_no;
+
+ if (arr->n_used > 0) {
+ biggest = trx_undo_arr_get_biggest(arr);
+
+ if (ut_dulint_cmp(biggest, limit) >= 0) {
+
+ limit = ut_dulint_add(biggest, 1);
+ }
+ }
+
+ if (trx->insert_undo) {
+ trx_undo_truncate_end(trx, trx->insert_undo, limit);
+ }
+
+ if (trx->update_undo) {
+ trx_undo_truncate_end(trx, trx->update_undo, limit);
+ }
+}
+
+/***********************************************************************//**
+Pops the topmost undo log record in a single undo log and updates the info
+about the topmost record in the undo log memory struct.
+@return undo log record, the page s-latched */
+static
+trx_undo_rec_t*
+trx_roll_pop_top_rec(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ ulint offset;
+ trx_undo_rec_t* prev_rec;
+ page_t* prev_rec_page;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+
+ undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size,
+ undo->top_page_no, mtr);
+ offset = undo->top_offset;
+
+ /* fprintf(stderr, "Thread %lu undoing trx %lu undo record %lu\n",
+ os_thread_get_curr_id(), ut_dulint_get_low(trx->id),
+ ut_dulint_get_low(undo->top_undo_no)); */
+
+ prev_rec = trx_undo_get_prev_rec(undo_page + offset,
+ undo->hdr_page_no, undo->hdr_offset,
+ mtr);
+ if (prev_rec == NULL) {
+
+ undo->empty = TRUE;
+ } else {
+ prev_rec_page = page_align(prev_rec);
+
+ if (prev_rec_page != undo_page) {
+
+ trx->pages_undone++;
+ }
+
+ undo->top_page_no = page_get_page_no(prev_rec_page);
+ undo->top_offset = prev_rec - prev_rec_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+ }
+
+ return(undo_page + offset);
+}
+
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record into the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+ trx_t* trx, /*!< in: transaction */
+ undo_no_t limit, /*!< in: least undo number we need */
+ roll_ptr_t* roll_ptr,/*!< out: roll pointer to undo record */
+ mem_heap_t* heap) /*!< in: memory heap where copied */
+{
+ trx_undo_t* undo;
+ trx_undo_t* ins_undo;
+ trx_undo_t* upd_undo;
+ trx_undo_rec_t* undo_rec;
+ trx_undo_rec_t* undo_rec_copy;
+ undo_no_t undo_no;
+ ibool is_insert;
+ trx_rseg_t* rseg;
+ ulint progress_pct;
+ mtr_t mtr;
+
+ rseg = trx->rseg;
+try_again:
+ mutex_enter(&(trx->undo_mutex));
+
+ if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ ins_undo = trx->insert_undo;
+ upd_undo = trx->update_undo;
+
+ if (!ins_undo || ins_undo->empty) {
+ undo = upd_undo;
+ } else if (!upd_undo || upd_undo->empty) {
+ undo = ins_undo;
+ } else if (ut_dulint_cmp(upd_undo->top_undo_no,
+ ins_undo->top_undo_no) > 0) {
+ undo = upd_undo;
+ } else {
+ undo = ins_undo;
+ }
+
+ if (!undo || undo->empty
+ || (ut_dulint_cmp(limit, undo->top_undo_no) > 0)) {
+
+ if ((trx->undo_no_arr)->n_used == 0) {
+ /* Rollback is ending */
+
+ mutex_enter(&(rseg->mutex));
+
+ trx_roll_try_truncate(trx);
+
+ mutex_exit(&(rseg->mutex));
+ }
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(NULL);
+ }
+
+ if (undo == ins_undo) {
+ is_insert = TRUE;
+ } else {
+ is_insert = FALSE;
+ }
+
+ *roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id,
+ undo->top_page_no,
+ undo->top_offset);
+ mtr_start(&mtr);
+
+ undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
+
+ undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+ ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0);
+
+ /* We print rollback progress info if we are in a crash recovery
+ and the transaction has at least 1000 row operations to undo. */
+
+ if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) {
+
+ progress_pct = 100 - (ulint)
+ ((ut_conv_dulint_to_longlong(undo_no) * 100)
+ / trx_roll_max_undo_no);
+ if (progress_pct != trx_roll_progress_printed_pct) {
+ if (trx_roll_progress_printed_pct == 0) {
+ fprintf(stderr,
+ "\nInnoDB: Progress in percents:"
+ " %lu", (ulong) progress_pct);
+ } else {
+ fprintf(stderr,
+ " %lu", (ulong) progress_pct);
+ }
+ fflush(stderr);
+ trx_roll_progress_printed_pct = progress_pct;
+ }
+ }
+
+ trx->undo_no = undo_no;
+
+ if (!trx_undo_arr_store_info(trx, undo_no)) {
+ /* A query thread is already processing this undo log record */
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ goto try_again;
+ }
+
+ undo_rec_copy = trx_undo_rec_copy(undo_rec, heap);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ mtr_commit(&mtr);
+
+ return(undo_rec_copy);
+}
+
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread obtains the undo log record by some means other
+than the pop function above.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no)/*!< in: undo number of the record */
+{
+ ibool ret;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ ret = trx_undo_arr_store_info(trx, undo_no);
+
+ mutex_exit(&(trx->undo_mutex));
+
+ return(ret);
+}
+
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+ trx_t* trx, /*!< in/out: transaction */
+ undo_no_t undo_no)/*!< in: undo number */
+{
+ trx_undo_arr_t* arr;
+
+ mutex_enter(&(trx->undo_mutex));
+
+ arr = trx->undo_no_arr;
+
+ trx_undo_arr_remove_info(arr, undo_no);
+
+ mutex_exit(&(trx->undo_mutex));
+}
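+
+/* Illustrative sketch only: the pop/reserve/release protocol described
+above, roughly as a rollback worker would use it. The handling of heap and
+the actual undoing of the record are omitted; trx and heap are assumed to
+be set up by the caller. */
+#if 0
+	trx_undo_rec_t*	undo_rec;
+	roll_ptr_t	roll_ptr;
+
+	undo_rec = trx_roll_pop_top_rec_of_trx(trx, trx->roll_limit,
+					       &roll_ptr, heap);
+	if (undo_rec != NULL) {
+		undo_no_t	undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+		/* ... undo the operation described by undo_rec ... */
+
+		/* The pop already stored undo_no in trx->undo_no_arr;
+		release it once the record has been processed. */
+		trx_undo_rec_release(trx, undo_no);
+	}
+#endif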
+
+/*********************************************************************//**
+Starts a rollback operation. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+ trx_t* trx, /*!< in: transaction */
+ trx_sig_t* sig, /*!< in: signal starting the rollback */
+ que_thr_t** next_thr)/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the passed value is
+ NULL, the parameter is ignored */
+{
+ que_t* roll_graph;
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0));
+
+ /* Initialize the rollback field in the transaction */
+
+ if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+ trx->roll_limit = ut_dulint_zero;
+
+ } else if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
+
+ trx->roll_limit = (sig->savept).least_undo_no;
+
+ } else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx->roll_limit = trx->last_sql_stat_start.least_undo_no;
+ } else {
+ ut_error;
+ }
+
+ ut_a(ut_dulint_cmp(trx->roll_limit, trx->undo_no) <= 0);
+
+ trx->pages_undone = 0;
+
+ if (trx->undo_no_arr == NULL) {
+ trx->undo_no_arr = trx_undo_arr_create();
+ }
+
+ /* Build a 'query' graph which will perform the undo operations */
+
+ roll_graph = trx_roll_graph_build(trx);
+
+ trx->graph = roll_graph;
+ trx->que_state = TRX_QUE_ROLLING_BACK;
+
+ thr = que_fork_start_command(roll_graph);
+
+ ut_ad(thr);
+
+ /* thr2 = que_fork_start_command(roll_graph);
+
+ ut_ad(thr2); */
+
+ if (next_thr && (*next_thr == NULL)) {
+ *next_thr = thr;
+ /* srv_que_task_enqueue_low(thr2); */
+ } else {
+ srv_que_task_enqueue_low(thr);
+ /* srv_que_task_enqueue_low(thr2); */
+ }
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return own: the query graph */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ mem_heap_t* heap;
+ que_fork_t* fork;
+ que_thr_t* thr;
+ /* que_thr_t* thr2; */
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ heap = mem_heap_create(512);
+ fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+ fork->trx = trx;
+
+ thr = que_thr_create(fork, heap);
+ /* thr2 = que_thr_create(fork, heap); */
+
+ thr->child = row_undo_node_create(trx, thr, heap);
+ /* thr2->child = row_undo_node_create(trx, thr2, heap); */
+
+ return(fork);
+}
+
+/*********************************************************************//**
+Finishes error processing after the necessary partial rollback has been
+done. */
+static
+void
+trx_finish_error_processing(
+/*========================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_sig_t* sig;
+ trx_sig_t* next_sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ while (sig != NULL) {
+ next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+ if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_sig_remove(trx, sig);
+ }
+
+ sig = next_sig;
+ }
+
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/*********************************************************************//**
+Finishes a partial rollback operation. */
+static
+void
+trx_finish_partial_rollback_off_kernel(
+/*===================================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr)/*!< in/out: next query thread to run;
+ if the value which is passed in is a pointer
+ to a NULL pointer, then the calling function
+ can start running a new query thread; if this
+ parameter is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ /* Remove the signal from the signal queue and send reply message
+ to it */
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/****************************************************************//**
+Finishes a transaction rollback. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+ que_t* graph, /*!< in: undo graph which can now be freed */
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr)/*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if this parameter is
+ NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ trx_sig_t* next_sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
+
+ /* Free the memory reserved by the undo graph */
+ que_graph_free(graph);
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
+
+ trx_finish_partial_rollback_off_kernel(trx, next_thr);
+
+ return;
+
+ } else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_finish_error_processing(trx);
+
+ return;
+ }
+
+#ifdef UNIV_DEBUG
+ if (lock_print_waits) {
+ fprintf(stderr, "Trx %lu rollback finished\n",
+ (ulong) ut_dulint_get_low(trx->id));
+ }
+#endif /* UNIV_DEBUG */
+
+ trx_commit_off_kernel(trx);
+
+ /* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and
+ send reply messages to them */
+
+ trx->que_state = TRX_QUE_RUNNING;
+
+ while (sig != NULL) {
+ next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+ if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+ trx_sig_reply(sig, next_thr);
+
+ trx_sig_remove(trx, sig);
+ }
+
+ sig = next_sig;
+ }
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ roll_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(roll_node_t));
+ node->common.type = QUE_NODE_ROLLBACK;
+ node->state = ROLL_NODE_SEND;
+
+ node->partial = FALSE;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ roll_node_t* node;
+ ulint sig_no;
+ trx_savept_t* savept;
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = ROLL_NODE_SEND;
+ }
+
+ if (node->state == ROLL_NODE_SEND) {
+ mutex_enter(&kernel_mutex);
+
+ node->state = ROLL_NODE_WAIT;
+
+ if (node->partial) {
+ sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT;
+ savept = &(node->savept);
+ } else {
+ sig_no = TRX_SIG_TOTAL_ROLLBACK;
+ savept = NULL;
+ }
+
+ /* Send a rollback signal to the transaction */
+
+ trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr,
+ savept, NULL);
+
+ thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+ mutex_exit(&kernel_mutex);
+
+ return(NULL);
+ }
+
+ ut_ad(node->state == ROLL_NODE_WAIT);
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.c
new file mode 100644
index 00000000000..57b5611d624
--- /dev/null
+++ b/storage/xtradb/trx/trx0rseg.c
@@ -0,0 +1,324 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.c
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+
+/******************************************************************//**
+Looks for a rollback segment based on the rollback segment id.
+@return rollback segment */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+ ulint id) /*!< in: rollback segment id */
+{
+ trx_rseg_t* rseg;
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ ut_ad(rseg);
+
+ while (rseg->id != id) {
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ ut_ad(rseg);
+ }
+
+ return(rseg);
+}
+
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+ ulint space, /*!< in: space id */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint max_size, /*!< in: max size in pages */
+ ulint* slot_no, /*!< out: rseg id == slot number in trx sys */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint page_no;
+ trx_rsegf_t* rsegf;
+ trx_sysf_t* sys_header;
+ ulint i;
+ buf_block_t* block;
+
+ ut_ad(mtr);
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+ MTR_MEMO_X_LOCK));
+ sys_header = trx_sysf_get(mtr);
+
+ *slot_no = trx_sysf_rseg_find_free(mtr);
+
+ if (*slot_no == ULINT_UNDEFINED) {
+
+ return(FIL_NULL);
+ }
+
+ /* Allocate a new file segment for the rollback segment */
+ block = fseg_create(space, 0,
+ TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+ if (block == NULL) {
+ /* No space left */
+
+ return(FIL_NULL);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+ page_no = buf_block_get_page_no(block);
+
+ /* Get the rollback segment file page */
+ rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+ /* Initialize max size field */
+ mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size,
+ MLOG_4BYTES, mtr);
+
+ /* Initialize the history list */
+
+ mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr);
+ flst_init(rsegf + TRX_RSEG_HISTORY, mtr);
+
+ /* Reset the undo log slots */
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+
+ trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr);
+ }
+
+ /* Add the rollback segment info to the free slot in the trx system
+ header */
+
+ trx_sysf_rseg_set_space(sys_header, *slot_no, space, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, *slot_no, page_no, mtr);
+
+ return(page_no);
+}
+
+/***********************************************************************//**
+Frees an instance of the rollback segment in memory. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+ trx_rseg_t* rseg) /* in, own: instance to free */
+{
+ trx_undo_t* undo;
+
+ mutex_free(&rseg->mutex);
+
+ /* There can't be any active transactions. */
+ ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0);
+ ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0);
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+
+ while (undo != NULL) {
+ trx_undo_t* prev_undo = undo;
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, prev_undo);
+
+ trx_undo_mem_free(prev_undo);
+ }
+
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+
+ while (undo != NULL) {
+ trx_undo_t* prev_undo = undo;
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, prev_undo);
+
+ trx_undo_mem_free(prev_undo);
+ }
+
+ trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL);
+
+ mem_free(rseg);
+}
+
+/***************************************************************************
+Creates and initializes a rollback segment object. The values for the
+fields are read from the header. The object is inserted to the rseg
+list of the trx system object and a pointer is inserted in the rseg
+array in the trx system object.
+@return own: rollback segment object */
+static
+trx_rseg_t*
+trx_rseg_mem_create(
+/*================*/
+ ulint id, /*!< in: rollback segment id */
+ ulint space, /*!< in: space where the segment placed */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no, /*!< in: page number of the segment header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rsegf_t* rseg_header;
+ trx_rseg_t* rseg;
+ trx_ulogf_t* undo_log_hdr;
+ fil_addr_t node_addr;
+ ulint sum_of_undo_sizes;
+ ulint len;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ rseg = mem_alloc(sizeof(trx_rseg_t));
+
+ rseg->id = id;
+ rseg->space = space;
+ rseg->zip_size = zip_size;
+ rseg->page_no = page_no;
+
+ mutex_create(&rseg->mutex, SYNC_RSEG);
+
+ UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
+
+ trx_sys_set_nth_rseg(trx_sys, id, rseg);
+
+ rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+ rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE,
+ MLOG_4BYTES, mtr);
+
+ /* Initialize the undo log lists according to the rseg header */
+
+ sum_of_undo_sizes = trx_undo_lists_init(rseg);
+
+ rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr)
+ + 1 + sum_of_undo_sizes;
+
+ len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
+ if (len > 0) {
+ trx_sys->rseg_history_len += len;
+
+ node_addr = trx_purge_get_log_from_hist(
+ flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+ rseg->last_page_no = node_addr.page;
+ rseg->last_offset = node_addr.boffset;
+
+ undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size,
+ node_addr.page,
+ mtr) + node_addr.boffset;
+
+ rseg->last_trx_no = mtr_read_dulint(
+ undo_log_hdr + TRX_UNDO_TRX_NO, mtr);
+ rseg->last_del_marks = mtr_read_ulint(
+ undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
+ } else {
+ rseg->last_page_no = FIL_NULL;
+ }
+
+ return(rseg);
+}
+
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg list and array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+ trx_sysf_t* sys_header, /*!< in: trx system header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint i;
+ ulint page_no;
+ ulint space;
+
+ UT_LIST_INIT(trx_sys->rseg_list);
+
+ trx_sys->rseg_history_len = 0;
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ trx_sys_set_nth_rseg(trx_sys, i, NULL);
+ } else {
+ ulint zip_size;
+
+ space = trx_sysf_rseg_get_space(sys_header, i, mtr);
+
+ zip_size = space ? fil_space_get_zip_size(space) : 0;
+
+ trx_rseg_mem_create(i, space, zip_size, page_no, mtr);
+ }
+ }
+}
+
+/****************************************************************//**
+Creates a new rollback segment in the database.
+@return the created segment object, NULL if fail */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+ ulint space, /*!< in: space id */
+ ulint max_size, /*!< in: max size in pages */
+ ulint* id, /*!< out: rseg id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint flags;
+ ulint zip_size;
+ ulint page_no;
+ trx_rseg_t* rseg;
+
+ mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+ zip_size = dict_table_flags_to_zip_size(flags);
+ mutex_enter(&kernel_mutex);
+
+ page_no = trx_rseg_header_create(space, zip_size, max_size, id, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ mutex_exit(&kernel_mutex);
+ return(NULL);
+ }
+
+ rseg = trx_rseg_mem_create(*id, space, zip_size, page_no, mtr);
+
+ mutex_exit(&kernel_mutex);
+
+ return(rseg);
+}
diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c
new file mode 100644
index 00000000000..11581a3f2ae
--- /dev/null
+++ b/storage/xtradb/trx/trx0sys.c
@@ -0,0 +1,1936 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.c
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+#include "read0read.h"
+
+/** The file format tag structure with id and name. */
+struct file_format_struct {
+ ulint id; /*!< id of the file format */
+ const char* name; /*!< text representation of the
+ file format */
+ mutex_t mutex; /*!< covers changes to the above
+ fields */
+};
+
+/** The file format tag */
+typedef struct file_format_struct file_format_t;
+
+/** The transaction system */
+UNIV_INTERN trx_sys_t* trx_sys = NULL;
+/** The doublewrite buffer */
+UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL;
+
+/** The following is set to TRUE when we are upgrading from pre-4.1
+format data files to the multiple tablespaces format data files */
+UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE;
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool trx_doublewrite_buf_is_being_created = FALSE;
+
+/** The following is TRUE when we are using the database in the
+post-4.1 format, i.e., we have successfully upgraded, or have created
+a new database installation */
+UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE;
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
+/** Master binlog file position. We have successfully got the updates
+up to this position. -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1;
+/* @} */
+
+UNIV_INTERN char trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
+UNIV_INTERN ib_int64_t trx_sys_mysql_relay_log_pos = -1;
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Binlog file position, or -1 if unknown */
+UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1;
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** List of animal names representing the file formats. */
+static const char* file_format_name_map[] = {
+ "Antelope",
+ "Barracuda",
+ "Cheetah",
+ "Dragon",
+ "Elk",
+ "Fox",
+ "Gazelle",
+ "Hornet",
+ "Impala",
+ "Jaguar",
+ "Kangaroo",
+ "Leopard",
+ "Moose",
+ "Nautilus",
+ "Ocelot",
+ "Porpoise",
+ "Quail",
+ "Rabbit",
+ "Shark",
+ "Tiger",
+ "Urchin",
+ "Viper",
+ "Whale",
+ "Xenops",
+ "Yak",
+ "Zebra"
+};
+
+/** The number of elements in the file format name array. */
+static const ulint FILE_FORMAT_NAME_N
+ = sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
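+
+/* Illustrative sketch only (not a function of this file): the array above
+is meant to be indexed by the zero-based file format id, for example: */
+#if 0
+static const char*
+file_format_id_to_name_example(
+	ulint	id)	/*!< in: file format id */
+{
+	if (id >= FILE_FORMAT_NAME_N) {
+
+		return("Unknown");
+	}
+
+	return(file_format_name_map[id]);
+}
+#endif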
+
+#ifndef UNIV_HOTBACKUP
+/** This is used to track the maximum file format id known to InnoDB. It's
+updated via SET GLOBAL innodb_file_format_check = 'x' or when we open
+or create a table. */
+static file_format_t file_format_max;
+
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+ ulint page_no) /*!< in: page number */
+{
+ if (trx_doublewrite == NULL) {
+
+ return(FALSE);
+ }
+
+ if (page_no >= trx_doublewrite->block1
+ && page_no < trx_doublewrite->block1
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ if (page_no >= trx_doublewrite->block2
+ && page_no < trx_doublewrite->block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ return(TRUE);
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at database start. */
+static
+void
+trx_doublewrite_init(
+/*=================*/
+ byte* doublewrite) /*!< in: pointer to the doublewrite buf
+ header on trx sys page */
+{
+ trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
+
+ /* Since we now start to use the doublewrite buffer, no need to call
+ fsync() after every write to a data file */
+#ifdef UNIV_DO_FLUSH
+ os_do_not_call_flush_at_each_write = TRUE;
+#endif /* UNIV_DO_FLUSH */
+
+ mutex_create(&trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
+
+ trx_doublewrite->first_free = 0;
+
+ trx_doublewrite->block1 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+ trx_doublewrite->block2 = mach_read_from_4(
+ doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+ trx_doublewrite->write_buf_unaligned = ut_malloc(
+ (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE);
+
+ trx_doublewrite->write_buf = ut_align(
+ trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
+ trx_doublewrite->buf_block_arr = mem_alloc(
+ 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
+}
+
+/****************************************************************//**
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void)
+/*===============================================*/
+{
+ buf_block_t* block;
+ byte* doublewrite;
+ mtr_t mtr;
+
+ /* We upgraded to 4.1.x and reset the space id fields in the
+ doublewrite buffer. Let us mark to the trx_sys header that the upgrade
+ has been done. */
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ trx_sys_multiple_tablespace_format = TRUE;
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer for a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void)
+/*================================*/
+{
+ buf_block_t* block;
+ buf_block_t* block2;
+ buf_block_t* new_block;
+ byte* doublewrite;
+ byte* fseg_header;
+ ulint page_no;
+ ulint prev_page_no;
+ ulint i;
+ mtr_t mtr;
+
+ if (trx_doublewrite) {
+ /* Already inited */
+
+ return;
+ }
+
+start_again:
+ mtr_start(&mtr);
+ trx_doublewrite_buf_is_being_created = TRUE;
+
+ block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ trx_doublewrite_init(doublewrite);
+
+ mtr_commit(&mtr);
+ trx_doublewrite_buf_is_being_created = FALSE;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Doublewrite buffer not found:"
+ " creating new\n");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your buffer pool size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ exit(1);
+ }
+
+ block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(1);
+ }
+
+ fseg_header = buf_block_get_frame(block)
+ + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ page_no = fseg_alloc_free_page(fseg_header,
+ prev_page_no + 1,
+ FSP_UP, &mtr);
+ if (page_no == FIL_NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite"
+ " buffer: you must\n"
+ "InnoDB: increase your"
+ " tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n"
+ );
+
+ exit(1);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we at database startup read pages
+ from the doublewrite buffer, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ new_block = buf_page_get(TRX_SYS_SPACE, 0, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(new_block,
+ SYNC_NO_ORDER_CHECK);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
+
+ trx_sys_multiple_tablespace_format = TRUE;
+
+ goto start_again;
+ }
+
+ if (srv_doublewrite_file) {
+ /* A doublewrite buffer header like the one in TRX_SYS_SPACE should
+ also exist in the dedicated doublewrite file: check for it and
+ create it if it does not exist. */
+
+ mtr_start(&mtr);
+ trx_doublewrite_buf_is_being_created = TRUE;
+
+ block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+ doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has already been created:
+ just read in some numbers */
+
+ mtr_commit(&mtr);
+ } else {
+ fprintf(stderr,
+ "InnoDB: Doublewrite buffer not found in the doublewrite file:"
+ " creating new\n");
+
+ if (buf_pool_get_curr_size()
+ < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2 + 100)
+ * UNIV_PAGE_SIZE)) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your buffer pool size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ exit(1);
+ }
+
+ block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO,
+ TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+ /* fseg_create acquires a second latch on the page,
+ therefore we must declare it: */
+
+ buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+ if (block2 == NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite buffer:"
+ " you must\n"
+ "InnoDB: increase your tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n");
+
+ /* We exit without committing the mtr to prevent
+ its modifications to the database getting to disk */
+
+ exit(1);
+ }
+
+ fseg_header = buf_block_get_frame(block)
+ + TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+ prev_page_no = 0;
+
+ for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+ + FSP_EXTENT_SIZE / 2; i++) {
+ page_no = fseg_alloc_free_page(fseg_header,
+ prev_page_no + 1,
+ FSP_UP, &mtr);
+ if (page_no == FIL_NULL) {
+ fprintf(stderr,
+ "InnoDB: Cannot create doublewrite"
+ " buffer: you must\n"
+ "InnoDB: increase your"
+ " tablespace size.\n"
+ "InnoDB: Cannot continue operation.\n"
+ );
+
+ exit(1);
+ }
+
+ /* We read the allocated pages to the buffer pool;
+ when they are written to disk in a flush, the space
+ id and page number fields are also written to the
+ pages. When we at database startup read pages
+ from the doublewrite buffer, we know that if the
+ space id and page number in them are the same as
+ the page position in the tablespace, then the page
+ has not been written to in doublewrite. */
+
+ new_block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, page_no,
+ RW_X_LATCH, &mtr);
+ buf_block_dbg_add_level(new_block,
+ SYNC_NO_ORDER_CHECK);
+
+ if (i == FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK1,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i == FSP_EXTENT_SIZE / 2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_REPEAT
+ + TRX_SYS_DOUBLEWRITE_BLOCK2,
+ page_no, MLOG_4BYTES, &mtr);
+ } else if (i > FSP_EXTENT_SIZE / 2) {
+ ut_a(page_no == prev_page_no + 1);
+ }
+
+ prev_page_no = page_no;
+ }
+
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+ mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+ + TRX_SYS_DOUBLEWRITE_REPEAT,
+ TRX_SYS_DOUBLEWRITE_MAGIC_N,
+ MLOG_4BYTES, &mtr);
+
+ mlog_write_ulint(doublewrite
+ + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+ MLOG_4BYTES, &mtr);
+ mtr_commit(&mtr);
+
+ /* Flush the modified pages to disk and make a checkpoint */
+ log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+ fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n");
+ trx_sys_multiple_tablespace_format = TRUE;
+ }
+ trx_doublewrite_buf_is_being_created = FALSE;
+ }
+}
+
+/****************************************************************//**
+At database startup, initializes the doublewrite buffer memory structure if
+a doublewrite buffer has already been created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+ ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */
+{
+ byte* buf;
+ byte* read_buf;
+ byte* unaligned_read_buf;
+ ulint block1;
+ ulint block2;
+ ulint source_page_no;
+ byte* page;
+ byte* doublewrite;
+ ulint doublewrite_space_id;
+ ulint space_id;
+ ulint page_no;
+ ulint i;
+
+ doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
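+ /* When a dedicated doublewrite file is configured
+ (srv_doublewrite_file), the doublewrite header and both blocks live
+ in the TRX_DOUBLEWRITE_SPACE tablespace rather than the system
+ tablespace, so the reads below are directed to that space. */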
+
+ if (srv_doublewrite_file) {
+ fprintf(stderr,
+ "InnoDB: doublewrite file '%s' is used.\n",
+ srv_doublewrite_file);
+ }
+
+ /* We do the file i/o past the buffer pool */
+
+ unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+ read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
+
+ /* Read the trx sys header to check if we are using the doublewrite
+ buffer */
+
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0,
+ UNIV_PAGE_SIZE, read_buf, NULL);
+ doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+ == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+ /* The doublewrite buffer has been created */
+
+ trx_doublewrite_init(doublewrite);
+
+ block1 = trx_doublewrite->block1;
+ block2 = trx_doublewrite->block2;
+
+ buf = trx_doublewrite->write_buf;
+ } else {
+ goto leave_func;
+ }
+
+ if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+ != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+ /* We are upgrading from a version < 4.1.x to a version where
+ multiple tablespaces are supported. We must reset the space id
+ field in the pages in the doublewrite buffer because starting
+ from this version the space id is stored to
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+ trx_doublewrite_must_reset_space_ids = TRUE;
+
+ fprintf(stderr,
+ "InnoDB: Resetting space id's in the"
+ " doublewrite buffer\n");
+ } else {
+ trx_sys_multiple_tablespace_format = TRUE;
+ }
+
+ /* Read the pages from the doublewrite buffer to memory */
+
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf, NULL);
+ fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0,
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+ NULL);
+ /* Check if any of these pages is half-written in data files, in the
+ intended position */
+
+ page = buf;
+
+ for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+ page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+ if (trx_doublewrite_must_reset_space_ids) {
+
+ space_id = 0;
+ mach_write_to_4(page
+ + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+ /* We do not need to calculate new checksums for the
+ pages because the field .._SPACE_ID does not affect
+ them. Write the page back to where we read it from. */
+
+ if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+ source_page_no = block1 + i;
+ } else {
+ source_page_no = block2
+ + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+ }
+
+ fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
+ UNIV_PAGE_SIZE, page, NULL);
+ /* printf("Resetting space id in page %lu\n",
+ source_page_no); */
+ } else {
+ space_id = mach_read_from_4(
+ page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+ }
+
+ if (!restore_corrupt_pages) {
+ /* The database was shut down gracefully: no need to
+ restore pages */
+
+ } else if (!fil_tablespace_exists_in_mem(space_id)) {
+ /* Maybe we have dropped the single-table tablespace
+ and this page once belonged to it: do nothing */
+
+ } else if (!fil_check_adress_in_tablespace(space_id,
+ page_no)) {
+ fprintf(stderr,
+ "InnoDB: Warning: a page in the"
+ " doublewrite buffer is not within space\n"
+ "InnoDB: bounds; space id %lu"
+ " page number %lu, page %lu in"
+ " doublewrite buf.\n",
+ (ulong) space_id, (ulong) page_no, (ulong) i);
+
+ } else if ((space_id == TRX_SYS_SPACE
+ || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE))
+ && ((page_no >= block1
+ && page_no
+ < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+ || (page_no >= block2
+ && page_no
+ < (block2
+ + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
+
+ /* It is an unwritten doublewrite buffer page:
+ do nothing */
+ } else {
+ ulint zip_size = fil_space_get_zip_size(space_id);
+
+ /* Read in the actual page from the file */
+ fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
+ page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ read_buf, NULL);
+
+ if (srv_recovery_stats && recv_recovery_is_on()) {
+ mutex_enter(&(recv_sys->mutex));
+ recv_sys->stats_doublewrite_check_pages++;
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ /* Check if the page is corrupt */
+
+ if (UNIV_UNLIKELY
+ (buf_page_is_corrupted(read_buf, zip_size))) {
+
+ fprintf(stderr,
+ "InnoDB: Warning: database page"
+ " corruption or a failed\n"
+ "InnoDB: file read of"
+ " space %lu page %lu.\n"
+ "InnoDB: Trying to recover it from"
+ " the doublewrite buffer.\n",
+ (ulong) space_id, (ulong) page_no);
+
+ if (buf_page_is_corrupted(page, zip_size)) {
+ fprintf(stderr,
+ "InnoDB: Dump of the page:\n");
+ buf_page_print(read_buf, zip_size);
+ fprintf(stderr,
+ "InnoDB: Dump of"
+ " corresponding page"
+ " in doublewrite buffer:\n");
+ buf_page_print(page, zip_size);
+
+ fprintf(stderr,
+ "InnoDB: Also the page in the"
+ " doublewrite buffer"
+ " is corrupt.\n"
+ "InnoDB: Cannot continue"
+ " operation.\n"
+ "InnoDB: You can try to"
+ " recover the database"
+ " with the my.cnf\n"
+ "InnoDB: option:\n"
+ "InnoDB:"
+ " innodb_force_recovery=6\n");
+ exit(1);
+ }
+
+ /* Write the good page from the
+ doublewrite buffer to the intended
+ position */
+
+ fil_io(OS_FILE_WRITE, TRUE, space_id,
+ zip_size, page_no, 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ page, NULL);
+
+ if (srv_recovery_stats && recv_recovery_is_on()) {
+ mutex_enter(&(recv_sys->mutex));
+ recv_sys->stats_doublewrite_overwrite_pages++;
+ mutex_exit(&(recv_sys->mutex));
+ }
+
+ fprintf(stderr,
+ "InnoDB: Recovered the page from"
+ " the doublewrite buffer.\n");
+ }
+ }
+
+ page += UNIV_PAGE_SIZE;
+ }
+
+ fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+ ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Checks that trx is in the trx list.
+@return TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+ trx_t* in_trx) /*!< in: trx */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx != NULL) {
+
+ if (trx == in_trx) {
+
+ return(TRUE);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ return(FALSE);
+}
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void)
+/*==========================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+ trx_sys->max_trx_id, &mtr);
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. On a MySQL
+replication slave, it also updates the latest master binlog position up
+to which replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+ trx_sysf_t* sys_header,
+ const char* file_name_in,/*!< in: MySQL log file name */
+ ib_int64_t offset, /*!< in: position in that log file */
+ ulint field, /*!< in: offset of the MySQL log info field in
+ the trx sys header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ const char* file_name;
+
+ if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) {
+
+ /* We cannot fit the name into the 512 bytes reserved for it. Because
+ the relay log information is also stored in this area, the file name
+ must in fact fit into 480 bytes. */
+
+ file_name = "";
+ }
+ else {
+ file_name = file_name_in;
+ }
+
+ if (mach_read_from_4(sys_header + field
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
+ TRX_SYS_MYSQL_LOG_MAGIC_N,
+ MLOG_4BYTES, mtr);
+ }
+
+ if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
+ file_name)) {
+
+ mlog_write_string(sys_header + field
+ + TRX_SYS_MYSQL_LOG_NAME,
+ (byte*) file_name, 1 + ut_strlen(file_name),
+ mtr);
+ }
+
+ if (mach_read_from_4(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
+ || (offset >> 32) > 0) {
+
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
+ (ulint)(offset >> 32),
+ MLOG_4BYTES, mtr);
+ }
+
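+ /* The 64-bit offset is stored as two 4-byte words: the high word is
+ written above only when it is, or is about to become, non-zero, while
+ the low word is always written. For example, offset 0x123456789 is
+ stored as OFFSET_HIGH = 0x1 and OFFSET_LOW = 0x23456789. */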
+ mlog_write_ulint(sys_header + field
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
+ (ulint)(offset & 0xFFFFFFFFUL),
+ MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Reads the MySQL binlog offset info from the trx system header if the
+magic number shows it valid, stores it in global variables and prints
+it to stderr. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void)
+/*===================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+ ulint trx_sys_mysql_bin_log_pos_high;
+ ulint trx_sys_mysql_bin_log_pos_low;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+ trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
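+ /* Reassemble the 64-bit binlog position from the two 4-byte words
+ stored in the header. */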
+ trx_sys_mysql_bin_log_pos
+ = (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32)
+ + (ib_int64_t)trx_sys_mysql_bin_log_pos_low;
+
+ ut_memcpy(trx_sys_mysql_bin_log_name,
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+
+ fprintf(stderr,
+ "InnoDB: Last MySQL binlog file position %lu %lu,"
+ " file name %s\n",
+ trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
+ trx_sys_mysql_bin_log_name);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+ trx_sysf_t* sys_header;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ sys_header = trx_sysf_get(&mtr);
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ fprintf(stderr,
+ "InnoDB: In a MySQL replication slave the last"
+ " master binlog file\n"
+ "InnoDB: position %lu %lu, file name %s\n",
+ (ulong) mach_read_from_4(sys_header
+ + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+ (ulong) mach_read_from_4(sys_header
+ + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME);
+
+ fprintf(stderr,
+ "InnoDB: and relay log file\n"
+ "InnoDB: position %lu %lu, file name %s\n",
+ (ulong) mach_read_from_4(sys_header
+ + TRX_SYS_MYSQL_RELAY_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+ (ulong) mach_read_from_4(sys_header
+ + TRX_SYS_MYSQL_RELAY_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+ sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME);
+
+ /* Copy the master log position info to global variables we can
+ use in ha_innobase.cc to initialize glob_mi to the right values */
+
+ ut_memcpy(trx_sys_mysql_master_log_name,
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME,
+ TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+
+ trx_sys_mysql_master_log_pos
+ = (((ib_int64_t) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+ + ((ib_int64_t) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+
+ ut_memcpy(trx_sys_mysql_relay_log_name,
+ sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME,
+ TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+
+ trx_sys_mysql_relay_log_pos
+ = (((ib_int64_t) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+ + ((ib_int64_t) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+ mtr_commit(&mtr);
+}
+
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sysf_t* sys_header;
+ ulint page_no;
+ ulint i;
+
+ ut_ad(mutex_own(&(kernel_mutex)));
+
+ sys_header = trx_sysf_get(mtr);
+
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
+
+ if (page_no == FIL_NULL) {
+
+ return(i);
+ }
+ }
+
+ return(ULINT_UNDEFINED);
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_sysf_t* sys_header;
+ ulint slot_no;
+ buf_block_t* block;
+ page_t* page;
+ ulint page_no;
+ ulint i;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
+ mutex_enter(&kernel_mutex);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+ MLOG_2BYTES, mtr);
+
+ /* Reset the doublewrite buffer magic number to zero so that we
+ know that the doublewrite buffer has not yet been created (this
+ suppresses a Valgrind warning) */
+
+ mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+ sys_header = trx_sysf_get(mtr);
+
+ /* Start counting transaction ids from number 1 up */
+ mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+ ut_dulint_create(0, 1), mtr);
+
+ /* Reset the rollback segment slots */
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
+ }
+
+ /* The remaining area (up to the page trailer) is uninitialized.
+ Silence Valgrind warnings about it. */
+ UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE),
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ - (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE))
+ + page - sys_header);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, &slot_no,
+ mtr);
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(page_no != FIL_NULL);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
+Creates a dummy transaction system file page for a tablespace. */
+static
+void
+trx_sysf_dummy_create(
+/*==================*/
+ ulint space,
+ mtr_t* mtr)
+{
+ buf_block_t* block;
+ page_t* page;
+
+ ut_ad(mtr);
+
+ /* Note that below we first reserve the file space x-latch, and
+ then enter the kernel: we must do it in this order to conform
+ to the latching order rules. */
+
+ mtr_x_lock(fil_space_get_latch(space, NULL), mtr);
+ mutex_enter(&kernel_mutex);
+
+ /* Create the trx sys file block in a new allocated file segment */
+ block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+ mtr);
+ buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+
+ fprintf(stderr, "%lu\n", buf_block_get_page_no(block));
+ ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+ page = buf_block_get_frame(block);
+
+ mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+ MLOG_2BYTES, mtr);
+
+ /* Reset the doublewrite buffer magic number to zero so that we
+ know that the doublewrite buffer has not yet been created (this
+ suppresses a Valgrind warning) */
+
+ mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+ + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+#ifdef UNDEFINED
+ /* TODO: REMOVE IT: The below is not needed, I think */
+ sys_header = trx_sysf_get(mtr);
+
+ /* Start counting transaction ids from number 1 up */
+ mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+ ut_dulint_create(0, 1), mtr);
+
+ /* Reset the rollback segment slots */
+ for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+ trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
+ trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
+ }
+
+ /* The remaining area (up to the page trailer) is uninitialized.
+ Silence Valgrind warnings about it. */
+ UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE),
+ (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+ - (TRX_SYS_RSEGS
+ + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+ + TRX_SYS_RSEG_SPACE))
+ + page - sys_header);
+
+ /* Create the first rollback segment in the SYSTEM tablespace */
+ page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no,
+ mtr);
+ ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+ ut_a(page_no != FIL_NULL);
+#endif
+
+ mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void)
+/*==========================*/
+{
+ trx_sysf_t* sys_header;
+ ib_int64_t rows_to_undo = 0;
+ const char* unit = "";
+ trx_t* trx;
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ ut_ad(trx_sys == NULL);
+
+ mutex_enter(&kernel_mutex);
+
+ trx_sys = mem_alloc(sizeof(trx_sys_t));
+
+ sys_header = trx_sysf_get(&mtr);
+
+ trx_rseg_list_and_array_init(sys_header, &mtr);
+
+ trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ /* VERY important: after the database is started, max_trx_id value is
+ divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
+ trx_sys_get_new_trx_id will evaluate to TRUE when the function
+ is first time called, and the value for trx id will be written
+ to the disk-based header! Thus trx id values will not overlap when
+ the database is repeatedly started! */
+
+ trx_sys->max_trx_id = ut_dulint_add(
+ ut_dulint_align_up(mtr_read_dulint(
+ sys_header
+ + TRX_SYS_TRX_ID_STORE, &mtr),
+ TRX_SYS_TRX_ID_WRITE_MARGIN),
+ 2 * TRX_SYS_TRX_ID_WRITE_MARGIN);
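+ /* For example, if the stored counter is 1000 and
+ TRX_SYS_TRX_ID_WRITE_MARGIN is 256 (its usual definition), max_trx_id
+ becomes ut_dulint_align_up(1000, 256) + 512 = 1536, which is safely
+ past any id that was assigned but never flushed to the header. */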
+
+ UT_LIST_INIT(trx_sys->mysql_trx_list);
+ trx_dummy_sess = sess_open();
+ trx_lists_init_at_db_start();
+
+ if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ for (;;) {
+
+ if (trx->conc_state != TRX_PREPARED) {
+ rows_to_undo += ut_conv_dulint_to_longlong(
+ trx->undo_no);
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+
+ if (!trx) {
+ break;
+ }
+ }
+
+ if (rows_to_undo > 1000000000) {
+ unit = "M";
+ rows_to_undo = rows_to_undo / 1000000;
+ }
+
+ fprintf(stderr,
+ "InnoDB: %lu transaction(s) which must be"
+ " rolled back or cleaned up\n"
+ "InnoDB: in total %lu%s row operations to undo\n",
+ (ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
+ (ulong) rows_to_undo, unit);
+
+ fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
+ TRX_ID_PREP_PRINTF(trx_sys->max_trx_id));
+ }
+
+ UT_LIST_INIT(trx_sys->view_list);
+
+ trx_purge_sys_create();
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void)
+/*================*/
+{
+ mtr_t mtr;
+
+ mtr_start(&mtr);
+
+ trx_sysf_create(&mtr);
+
+ mtr_commit(&mtr);
+
+ trx_sys_init_at_db_start();
+}
+
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for a tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+ ulint space)
+{
+ mtr_t mtr;
+
+ /* This function is only for doublewrite file for now */
+ ut_a(space == TRX_DOUBLEWRITE_SPACE);
+
+ mtr_start(&mtr);
+
+ trx_sysf_dummy_create(space, &mtr);
+
+ mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Creates extra rollback segments when a new database is created (create_new_db). */
+UNIV_INTERN
+void
+trx_sys_create_extra_rseg(
+/*======================*/
+ ulint num) /* in: number of extra user rollback segments */
+{
+ mtr_t mtr;
+ ulint slot_no;
+ ulint i;
+
+ /* Create extra rollback segments */
+ mtr_start(&mtr);
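+ /* Slot 0 is the SYSTEM rollback segment created by trx_sysf_create(),
+ so the extra segments below occupy slots 1 .. num. */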
+ for (i = 1; i < num + 1; i++) {
+ if (!trx_rseg_create(TRX_SYS_SPACE, ULINT_MAX, &slot_no, &mtr)) {
+ fprintf(stderr,
+ "InnoDB: Warning: Failed to create extra rollback segments.\n");
+ break;
+ }
+ ut_a(slot_no == i);
+ }
+ mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Update the file format tag.
+@return always TRUE */
+static
+ibool
+trx_sys_file_format_max_write(
+/*==========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name) /*!< out: max file format name, can
+ be NULL */
+{
+ mtr_t mtr;
+ byte* ptr;
+ buf_block_t* block;
+ ulint tag_value_low;
+
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+ file_format_max.id = format_id;
+ file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+ ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+ tag_value_low = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW;
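+ /* The on-disk tag is an 8-byte value: the high word is a fixed magic
+ constant and the low word is TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW plus
+ the format id. trx_sys_file_format_max_read() recovers the id by
+ subtracting that constant again. */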
+
+ if (name) {
+ *name = file_format_max.name;
+ }
+
+ mlog_write_dulint(
+ ptr,
+ ut_dulint_create(TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH,
+ tag_value_low),
+ &mtr);
+
+ mtr_commit(&mtr);
+
+ return(TRUE);
+}
+
+/*****************************************************************//**
+Read the file format tag.
+@return the file format or ULINT_UNDEFINED if not set. */
+static
+ulint
+trx_sys_file_format_max_read(void)
+/*==============================*/
+{
+ mtr_t mtr;
+ const byte* ptr;
+ const buf_block_t* block;
+ ulint format_id;
+ dulint file_format_id;
+
+ /* Since this is called during the startup phase it's safe to
+ read the value without a covering mutex. */
+ mtr_start(&mtr);
+
+ block = buf_page_get(
+ TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+ ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+ file_format_id = mach_read_from_8(ptr);
+
+ mtr_commit(&mtr);
+
+ format_id = file_format_id.low - TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW;
+
+ if (file_format_id.high != TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH
+ || format_id >= FILE_FORMAT_NAME_N) {
+
+ /* Either it has never been tagged, or garbage in it. */
+ return(ULINT_UNDEFINED);
+ }
+
+ return(format_id);
+}
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id) /*!< in: id of the file format */
+{
+ ut_a(id < FILE_FORMAT_NAME_N);
+
+ return(file_format_name_map[id]);
+}
+
+/*****************************************************************//**
+Check for the max file format tag stored on disk. Note: if max_format_id
+is DICT_TF_FORMAT_MAX + 1 then we only print a warning.
+@return DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+ ulint max_format_id) /*!< in: max format id to check */
+{
+ ulint format_id;
+
+ /* Check the file format in the tablespace. Do not try to
+ recover if the file format is not supported by the engine
+ unless forced by the user. */
+ format_id = trx_sys_file_format_max_read();
+ if (format_id == ULINT_UNDEFINED) {
+ /* Format ID was not set. Set it to minimum possible
+ value. */
+ format_id = DICT_TF_FORMAT_51;
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: highest supported file format is %s.\n",
+ trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
+
+ if (format_id > DICT_TF_FORMAT_MAX) {
+
+ ut_a(format_id < FILE_FORMAT_NAME_N);
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: %s: the system tablespace is in a file "
+ "format that this version doesn't support - %s\n",
+ ((max_format_id <= DICT_TF_FORMAT_MAX)
+ ? "Error" : "Warning"),
+ trx_sys_file_format_id_to_name(format_id));
+
+ if (max_format_id <= DICT_TF_FORMAT_MAX) {
+ return(DB_ERROR);
+ }
+ }
+
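+ /* Remember the larger of the on-disk tag and the id requested by the
+ caller as the in-memory maximum. */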
+ format_id = (format_id > max_format_id) ? format_id : max_format_id;
+
+ /* We don't need a mutex here, as this function should only
+ be called once at start up. */
+ file_format_max.id = format_id;
+ file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+ return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Set the file format id unconditionally except if it's already the
+same value.
+@return TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+ ulint format_id, /*!< in: file format id */
+ const char** name) /*!< out: max file format name or
+ NULL if not needed. */
+{
+ ibool ret = FALSE;
+
+ ut_a(format_id <= DICT_TF_FORMAT_MAX);
+
+ mutex_enter(&file_format_max.mutex);
+
+ /* Only update if not already same value. */
+ if (format_id != file_format_max.id) {
+
+ ret = trx_sys_file_format_max_write(format_id, name);
+ }
+
+ mutex_exit(&file_format_max.mutex);
+
+ return(ret);
+}
+
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void)
+/*==============================*/
+{
+ ulint format_id;
+
+ format_id = trx_sys_file_format_max_read();
+
+ /* If format_id is not set then set it to the minimum. */
+ if (format_id == ULINT_UNDEFINED) {
+ trx_sys_file_format_max_set(DICT_TF_FORMAT_51, NULL);
+ }
+}
+
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+ const char** name, /*!< out: max file format name */
+ ulint format_id) /*!< in: file format identifier */
+{
+ ibool ret = FALSE;
+
+ ut_a(name);
+ ut_a(file_format_max.name != NULL);
+ ut_a(format_id <= DICT_TF_FORMAT_MAX);
+
+ mutex_enter(&file_format_max.mutex);
+
+ if (format_id > file_format_max.id) {
+
+ ret = trx_sys_file_format_max_write(format_id, name);
+ }
+
+ mutex_exit(&file_format_max.mutex);
+
+ return(ret);
+}
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void)
+/*=============================*/
+{
+ return(file_format_max.name);
+}
+
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void)
+/*==========================*/
+{
+ mutex_create(&file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
+
+ /* We don't need a mutex here, as this function should only
+ be called once at start up. */
+ file_format_max.id = DICT_TF_FORMAT_51;
+
+ file_format_max.name = trx_sys_file_format_id_to_name(
+ file_format_max.id);
+}
+
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void)
+/*===========================*/
+{
+ /* Does nothing at the moment */
+}
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+ const byte* page) /*!< in: buffer containing the trx
+ system header page, i.e., page number
+ TRX_SYS_PAGE_NO in the tablespace */
+{
+ const trx_sysf_t* sys_header;
+
+ sys_header = page + TRX_SYS;
+
+ if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+ == TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+ fprintf(stderr,
+ "ibbackup: Last MySQL binlog file position %lu %lu,"
+ " file name %s\n",
+ (ulong) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+ (ulong) mach_read_from_4(
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+ sys_header + TRX_SYS_MYSQL_LOG_INFO
+ + TRX_SYS_MYSQL_LOG_NAME);
+ }
+}
+
+
+/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
+ (This code duplication should be fixed at some point!)
+*/
+
+#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
+/* The offset of the file format tag on the trx system header page */
+#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
+/* We use these random constants to reduce the probability of reading
+garbage (from previous versions) that maps to an actual format id. We
+use these as bit masks at the time of reading and writing from/to disk. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
+
+/* END OF COPIED DEFINITIONS */
+
+
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+ const char *pathname, /*!< in: pathname of the first system
+ table space file */
+ ulint *format_id) /*!< out: file format of the system table
+ space */
+{
+ os_file_t file;
+ ibool success;
+ byte buf[UNIV_PAGE_SIZE * 2];
+ page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
+ const byte* ptr;
+ dulint file_format_id;
+
+ *format_id = ULINT_UNDEFINED;
+
+ file = os_file_create_simple_no_error_handling(
+ pathname,
+ OS_FILE_OPEN,
+ OS_FILE_READ_ONLY,
+ &success
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read system tablespace file format,\n"
+" ibbackup: but could not open the tablespace file %s!\n",
+ pathname
+ );
+ return(FALSE);
+ }
+
+ /* Read the page on which file format is stored */
+
+ success = os_file_read_no_error_handling(
+ file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read system table space file format,\n"
+" ibbackup: but failed to read the tablespace file %s!\n",
+ pathname
+ );
+ os_file_close(file);
+ return(FALSE);
+ }
+ os_file_close(file);
+
+ /* get the file format from the page */
+ ptr = page + TRX_SYS_FILE_FORMAT_TAG;
+ file_format_id = mach_read_from_8(ptr);
+
+ *format_id = file_format_id.low - TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW;
+
+ if (file_format_id.high != TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH
+ || *format_id >= FILE_FORMAT_NAME_N) {
+
+ /* Either it has never been tagged, or garbage in it. */
+ *format_id = ULINT_UNDEFINED;
+ return(TRUE);
+ }
+
+ return(TRUE);
+}
+
+
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+ const char *pathname, /*!< in: pathname of a per-table
+ datafile */
+ ulint *format_id) /*!< out: file format of the per-table
+ data file */
+{
+ os_file_t file;
+ ibool success;
+ byte buf[UNIV_PAGE_SIZE * 2];
+ page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
+ const byte* ptr;
+ ib_uint32_t flags;
+
+ *format_id = ULINT_UNDEFINED;
+
+ file = os_file_create_simple_no_error_handling(
+ pathname,
+ OS_FILE_OPEN,
+ OS_FILE_READ_ONLY,
+ &success
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read per-table tablespace format,\n"
+" ibbackup: but could not open the tablespace file %s!\n",
+ pathname
+ );
+ return(FALSE);
+ }
+
+ /* Read the first page of the per-table datafile */
+
+ success = os_file_read_no_error_handling(
+ file, page, 0, 0, UNIV_PAGE_SIZE
+ );
+ if (!success) {
+ /* The following call prints an error message */
+ os_file_get_last_error(TRUE);
+
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+" ibbackup: Error: trying to read the per-table data file format,\n"
+" ibbackup: but failed to read the tablespace file %s!\n",
+ pathname
+ );
+ os_file_close(file);
+ return(FALSE);
+ }
+ os_file_close(file);
+
+ /* get the file format from the page */
+ ptr = page + 54;
+ flags = mach_read_from_4(ptr);
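+ /* Offset 54 is the tablespace flags field in the FSP header, i.e.
+ FIL_PAGE_DATA (38) + FSP_SPACE_FLAGS (16). A zero flags value means
+ the Antelope format; otherwise (flags / 32) % 128, equivalent to
+ (flags >> 5) & 0x7F, extracts the file format id from the flags. */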
+ if (flags == 0) {
+ /* file format is Antelope */
+ *format_id = 0;
+ return (TRUE);
+ } else if (flags & 1) {
+ /* tablespace flags are ok */
+ *format_id = (flags / 32) % 128;
+ return (TRUE);
+ } else {
+ /* bad tablespace flags */
+ return(FALSE);
+ }
+}
+
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+ const ulint id) /*!< in: id of the file format */
+{
+ if (!(id < FILE_FORMAT_NAME_N)) {
+ /* unknown id */
+ return ("Unknown");
+ }
+
+ return(file_format_name_map[id]);
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void)
+/*===============*/
+{
+ trx_rseg_t* rseg;
+ read_view_t* view;
+
+ ut_ad(trx_sys != NULL);
+
+ /* Check that all read views are closed except read view owned
+ by a purge. */
+
+ if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
+ fprintf(stderr,
+ "InnoDB: Error: all read views were not closed"
+ " before shutdown:\n"
+ "InnoDB: %lu read views open \n",
+ UT_LIST_GET_LEN(trx_sys->view_list) - 1);
+ }
+
+ sess_close(trx_dummy_sess);
+ trx_dummy_sess = NULL;
+
+ trx_purge_sys_close();
+
+ mutex_enter(&kernel_mutex);
+
+ /* Free the double write data structures. */
+ ut_a(trx_doublewrite != NULL);
+ ut_free(trx_doublewrite->write_buf_unaligned);
+ trx_doublewrite->write_buf_unaligned = NULL;
+
+ mem_free(trx_doublewrite->buf_block_arr);
+ trx_doublewrite->buf_block_arr = NULL;
+
+ mutex_free(&trx_doublewrite->mutex);
+ mem_free(trx_doublewrite);
+ trx_doublewrite = NULL;
+
+ /* There can't be any active transactions. */
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ while (rseg != NULL) {
+ trx_rseg_t* prev_rseg = rseg;
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
+ UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
+
+ trx_rseg_mem_free(prev_rseg);
+ }
+
+ view = UT_LIST_GET_FIRST(trx_sys->view_list);
+
+ while (view != NULL) {
+ read_view_t* prev_view = view;
+
+ view = UT_LIST_GET_NEXT(view_list, prev_view);
+
+ /* Views are allocated from the trx_sys->global_read_view_heap.
+ So, we simply remove the element here. */
+ UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
+ ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
+
+ mem_free(trx_sys);
+
+ trx_sys = NULL;
+ mutex_exit(&kernel_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c
new file mode 100644
index 00000000000..9584f0c4c46
--- /dev/null
+++ b/storage/xtradb/trx/trx0trx.c
@@ -0,0 +1,2156 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.c
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "thr0loc.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "ha_prototypes.h"
+
+/** Dummy session used currently in MySQL interface */
+UNIV_INTERN sess_t* trx_dummy_sess = NULL;
+
+/** Number of transactions currently allocated for MySQL: protected by
+the kernel mutex */
+UNIV_INTERN ulint trx_n_mysql_transactions = 0;
+
+/*************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+ trx_t* trx, /*!< in: transaction struct */
+ const char* msg) /*!< in: detailed error message */
+{
+ ut_strlcpy(trx->detailed_error, msg, sizeof(trx->detailed_error));
+}
+
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction struct */
+ FILE* file) /*!< in: file to read message from */
+{
+ os_file_read_string(file, trx->detailed_error,
+ sizeof(trx->detailed_error));
+}
+
+/****************************************************************//**
+Creates and initializes a transaction object.
+@return own: the transaction */
+UNIV_INTERN
+trx_t*
+trx_create(
+/*=======*/
+ sess_t* sess) /*!< in: session */
+{
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(sess);
+
+ trx = mem_alloc(sizeof(trx_t));
+
+ trx->magic_n = TRX_MAGIC_N;
+
+ trx->op_info = "";
+
+ trx->is_purge = 0;
+ trx->is_recovered = 0;
+ trx->conc_state = TRX_NOT_STARTED;
+ trx->start_time = time(NULL);
+
+ trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+ trx->id = ut_dulint_zero;
+ trx->no = ut_dulint_max;
+
+ trx->support_xa = TRUE;
+
+ trx->flush_log_at_trx_commit_session = 3; /* means to use innodb_flush_log_at_trx_commit value */
+
+ trx->check_foreigns = TRUE;
+ trx->check_unique_secondary = TRUE;
+
+ trx->flush_log_later = FALSE;
+ trx->must_flush_log_later = FALSE;
+
+ trx->dict_operation = TRX_DICT_OP_NONE;
+ trx->table_id = ut_dulint_zero;
+
+ trx->mysql_thd = NULL;
+ trx->active_trans = 0;
+ trx->duplicates = 0;
+
+ trx->n_mysql_tables_in_use = 0;
+ trx->mysql_n_tables_locked = 0;
+
+ trx->mysql_log_file_name = NULL;
+ trx->mysql_log_offset = 0;
+ trx->mysql_master_log_file_name = "";
+ trx->mysql_master_log_pos = 0;
+ trx->mysql_relay_log_file_name = "";
+ trx->mysql_relay_log_pos = 0;
+
+ mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO);
+
+ trx->rseg = NULL;
+
+ trx->undo_no = ut_dulint_zero;
+ trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+ trx->insert_undo = NULL;
+ trx->update_undo = NULL;
+ trx->undo_no_arr = NULL;
+
+ trx->error_state = DB_SUCCESS;
+ trx->error_key_num = 0;
+ trx->detailed_error[0] = '\0';
+
+ trx->sess = sess;
+ trx->que_state = TRX_QUE_RUNNING;
+ trx->n_active_thrs = 0;
+
+ trx->handling_signals = FALSE;
+
+ UT_LIST_INIT(trx->signals);
+ UT_LIST_INIT(trx->reply_signals);
+
+ trx->graph = NULL;
+
+ trx->wait_lock = NULL;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ UT_LIST_INIT(trx->wait_thrs);
+
+ trx->lock_heap = mem_heap_create_in_buffer(256);
+ UT_LIST_INIT(trx->trx_locks);
+
+ UT_LIST_INIT(trx->trx_savepoints);
+
+ trx->dict_operation_lock_mode = 0;
+ trx->has_search_latch = FALSE;
+ trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+
+ trx->global_read_view_heap = mem_heap_create(256);
+ trx->global_read_view = NULL;
+ trx->read_view = NULL;
+
+ trx->io_reads = 0;
+ trx->io_read = 0;
+ trx->io_reads_wait_timer = 0;
+ trx->lock_que_wait_timer = 0;
+ trx->innodb_que_wait_timer = 0;
+ trx->distinct_page_access = 0;
+ trx->distinct_page_access_hash = NULL;
+ trx->take_stats = FALSE;
+
+ /* Set X/Open XA transaction identification to NULL */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+ trx->xid.formatID = -1;
+
+ trx->n_autoinc_rows = 0;
+
+ /* Remember to free the vector explicitly. */
+ trx->autoinc_locks = ib_vector_create(
+ mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
+
+ return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+
+ trx = trx_create(trx_dummy_sess);
+
+ trx_n_mysql_transactions++;
+
+ UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->mysql_thread_id = os_thread_get_curr_id();
+
+ trx->mysql_process_no = os_proc_get_number();
+
+ if (innobase_get_slow_log() && trx->take_stats) {
+ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+ }
+
+ return(trx);
+}
+
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+{
+ trx_t* trx;
+
+ mutex_enter(&kernel_mutex);
+
+ trx = trx_create(trx_dummy_sess);
+
+ mutex_exit(&kernel_mutex);
+
+ return(trx);
+}
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INTERN
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ if (trx->has_search_latch) {
+ rw_lock_s_unlock(&btr_search_latch);
+
+ trx->has_search_latch = FALSE;
+ }
+}
+
+/********************************************************************//**
+Frees a transaction object. */
+UNIV_INTERN
+void
+trx_free(
+/*=====*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (trx->declared_to_be_inside_innodb) {
+ ut_print_timestamp(stderr);
+ fputs(" InnoDB: Error: Freeing a trx which is declared"
+ " to be processing\n"
+ "InnoDB: inside InnoDB.\n", stderr);
+ trx_print(stderr, trx, 600);
+ putc('\n', stderr);
+
+ /* This is an error but not a fatal error. We must keep
+ the counters like srv_conc_n_threads accurate. */
+ srv_conc_force_exit_innodb(trx);
+ }
+
+ if (trx->n_mysql_tables_in_use != 0
+ || trx->mysql_n_tables_locked != 0) {
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: MySQL is freeing a thd\n"
+ "InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
+ "InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
+ (ulong)trx->n_mysql_tables_in_use,
+ (ulong)trx->mysql_n_tables_locked);
+
+ trx_print(stderr, trx, 600);
+
+ ut_print_buf(stderr, trx, sizeof(trx_t));
+ putc('\n', stderr);
+ }
+
+ ut_a(trx->magic_n == TRX_MAGIC_N);
+
+ trx->magic_n = 11112222;
+
+ ut_a(trx->conc_state == TRX_NOT_STARTED);
+
+ mutex_free(&(trx->undo_mutex));
+
+ ut_a(trx->insert_undo == NULL);
+ ut_a(trx->update_undo == NULL);
+
+ if (trx->undo_no_arr) {
+ trx_undo_arr_free(trx->undo_no_arr);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+ ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+
+ ut_a(trx->wait_lock == NULL);
+ ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ ut_a(!trx->has_search_latch);
+
+ ut_a(trx->dict_operation_lock_mode == 0);
+
+ if (trx->lock_heap) {
+ mem_heap_free(trx->lock_heap);
+ }
+
+ ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+ if (trx->global_read_view_heap) {
+ mem_heap_free(trx->global_read_view_heap);
+ }
+
+ trx->global_read_view = NULL;
+
+ ut_a(trx->read_view == NULL);
+
+ ut_a(ib_vector_is_empty(trx->autoinc_locks));
+ /* We allocated a dedicated heap for the vector. */
+ ib_vector_free(trx->autoinc_locks);
+
+ mem_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ if (trx->distinct_page_access_hash)
+ {
+ mem_free(trx->distinct_page_access_hash);
+ trx->distinct_page_access_hash= NULL;
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+
+ trx_free(trx);
+
+ ut_a(trx_n_mysql_transactions > 0);
+
+ trx_n_mysql_transactions--;
+
+ mutex_exit(&kernel_mutex);
+}
+
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+ trx_t* trx) /*!< in, own: trx object */
+{
+ if (trx->distinct_page_access_hash)
+ {
+ mem_free(trx->distinct_page_access_hash);
+ trx->distinct_page_access_hash= NULL;
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ trx_free(trx);
+
+ mutex_exit(&kernel_mutex);
+}
+
+/****************************************************************//**
+Inserts the trx handle in the trx system trx list in the right position.
+The list is sorted on the trx id so that the biggest id is at the list
+start. This function is used at the database startup to insert incomplete
+transactions to the list. */
+static
+void
+trx_list_insert_ordered(
+/*====================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ trx_t* trx2;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx2 = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
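+ /* Scan from the head (largest id) for the first transaction whose id
+ is smaller than trx->id; trx is inserted just before it so that the
+ list stays sorted in descending id order. If no such transaction is
+ found, trx is appended at the end of the list. */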
+ while (trx2 != NULL) {
+ if (ut_dulint_cmp(trx->id, trx2->id) >= 0) {
+
+ ut_ad(ut_dulint_cmp(trx->id, trx2->id) == 1);
+ break;
+ }
+ trx2 = UT_LIST_GET_NEXT(trx_list, trx2);
+ }
+
+ if (trx2 != NULL) {
+ trx2 = UT_LIST_GET_PREV(trx_list, trx2);
+
+ if (trx2 == NULL) {
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+ } else {
+ UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
+ trx2, trx);
+ }
+ } else {
+ UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
+ }
+}
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ trx_t* trx;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ UT_LIST_INIT(trx_sys->trx_list);
+
+ /* Look through the rollback segments to see if there exist undo logs
+ for transactions */
+
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+ while (rseg != NULL) {
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+
+ while (undo != NULL) {
+
+ trx = trx_create(trx_dummy_sess);
+
+ trx->is_recovered = TRUE;
+ trx->id = undo->trx_id;
+ trx->xid = undo->xid;
+ trx->insert_undo = undo;
+ trx->rseg = rseg;
+
+ if (undo->state != TRX_UNDO_ACTIVE) {
+
+ /* Prepared transactions are left in
+ the prepared state waiting for a
+ commit or abort decision from MySQL */
+
+ if (undo->state == TRX_UNDO_PREPARED) {
+
+ fprintf(stderr,
+ "InnoDB: Transaction "
+ TRX_ID_FMT
+ " was in the"
+ " XA prepared state.\n",
+ TRX_ID_PREP_PRINTF(trx->id));
+
+ if (srv_force_recovery == 0) {
+
+ trx->conc_state = TRX_PREPARED;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Since"
+ " innodb_force_recovery"
+ " > 0, we will"
+ " rollback it"
+ " anyway.\n");
+
+ trx->conc_state = TRX_ACTIVE;
+ }
+ } else {
+ trx->conc_state
+ = TRX_COMMITTED_IN_MEMORY;
+ }
+
+ /* We give a dummy value for the trx no;
+ this should have no relevance since purge
+ is not interested in committed transaction
+ numbers, unless they are in the history
+ list, in which case it looks up the number
+ from the disk based undo log structure */
+
+ trx->no = trx->id;
+ } else {
+ trx->conc_state = TRX_ACTIVE;
+
+ /* A running transaction always has the number
+ field inited to ut_dulint_max */
+
+ trx->no = ut_dulint_max;
+ }
+
+ if (undo->dict_operation) {
+ trx_set_dict_operation(
+ trx, TRX_DICT_OP_TABLE);
+ trx->table_id = undo->table_id;
+ }
+
+ if (!undo->empty) {
+ trx->undo_no = ut_dulint_add(undo->top_undo_no,
+ 1);
+ }
+
+ trx_list_insert_ordered(trx);
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ }
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+
+ while (undo != NULL) {
+ trx = trx_get_on_id(undo->trx_id);
+
+ if (NULL == trx) {
+ trx = trx_create(trx_dummy_sess);
+
+ trx->is_recovered = TRUE;
+ trx->id = undo->trx_id;
+ trx->xid = undo->xid;
+
+ if (undo->state != TRX_UNDO_ACTIVE) {
+
+ /* Prepared transactions are left in
+ the prepared state waiting for a
+ commit or abort decision from MySQL */
+
+ if (undo->state == TRX_UNDO_PREPARED) {
+ fprintf(stderr,
+ "InnoDB: Transaction "
+ TRX_ID_FMT " was in the"
+ " XA prepared state.\n",
+ TRX_ID_PREP_PRINTF(
+ trx->id));
+
+ if (srv_force_recovery == 0) {
+
+ trx->conc_state
+ = TRX_PREPARED;
+ } else {
+ fprintf(stderr,
+ "InnoDB: Since"
+ " innodb_force_recovery"
+ " > 0, we will"
+							" roll it back"
+ " anyway.\n");
+
+ trx->conc_state
+ = TRX_ACTIVE;
+ }
+ } else {
+ trx->conc_state
+ = TRX_COMMITTED_IN_MEMORY;
+ }
+
+ /* We give a dummy value for the trx
+ number */
+
+ trx->no = trx->id;
+ } else {
+ trx->conc_state = TRX_ACTIVE;
+
+ /* A running transaction always has
+					the number field initialized to
+ ut_dulint_max */
+
+ trx->no = ut_dulint_max;
+ }
+
+ trx->rseg = rseg;
+ trx_list_insert_ordered(trx);
+
+ if (undo->dict_operation) {
+ trx_set_dict_operation(
+ trx, TRX_DICT_OP_TABLE);
+ trx->table_id = undo->table_id;
+ }
+ }
+
+ trx->update_undo = undo;
+
+ if ((!undo->empty)
+ && (ut_dulint_cmp(undo->top_undo_no,
+ trx->undo_no) >= 0)) {
+
+ trx->undo_no = ut_dulint_add(undo->top_undo_no,
+ 1);
+ }
+
+ undo = UT_LIST_GET_NEXT(undo_list, undo);
+ }
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+ }
+}
+
+/******************************************************************//**
+Assigns a rollback segment to a transaction in a round-robin fashion.
+Skips the SYSTEM rollback segment if another is available.
+@return assigned rollback segment id */
+UNIV_INLINE
+ulint
+trx_assign_rseg(void)
+/*=================*/
+{
+ trx_rseg_t* rseg = trx_sys->latest_rseg;
+
+ ut_ad(mutex_own(&kernel_mutex));
+loop:
+ /* Get next rseg in a round-robin fashion */
+
+ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+
+ if (rseg == NULL) {
+ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+ }
+
+ /* If it is the SYSTEM rollback segment, and there exist others, skip
+ it */
+
+ if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
+ && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
+ goto loop;
+ }
+
+ trx_sys->latest_rseg = rseg;
+
+ return(rseg->id);
+}
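+
+/* A minimal sketch of the same round-robin policy on a plain array, kept
+here only as an illustration; it is not part of the original patch and the
+names round_robin_pick, last and n_slots are hypothetical. Slot 0 plays
+the role of the SYSTEM rollback segment and is skipped whenever another
+slot exists. */
+#if 0
+static
+ulint
+round_robin_pick(
+/*=============*/
+	ulint*	last,	/* in/out: index returned by the previous call */
+	ulint	n_slots)/* in: number of slots, at least 1 */
+{
+	ulint	next = (*last + 1) % n_slots;
+
+	if (next == 0 && n_slots > 1) {
+		next = 1;	/* skip the reserved slot */
+	}
+
+	*last = next;
+
+	return(next);
+}
+#endif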
+
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE */
+UNIV_INTERN
+ibool
+trx_start_low(
+/*==========*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+{
+ trx_rseg_t* rseg;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->rseg == NULL);
+
+ if (trx->is_purge) {
+ trx->id = ut_dulint_zero;
+ trx->conc_state = TRX_ACTIVE;
+ trx->start_time = time(NULL);
+
+ return(TRUE);
+ }
+
+ ut_ad(trx->conc_state != TRX_ACTIVE);
+
+ if (rseg_id == ULINT_UNDEFINED) {
+
+ rseg_id = trx_assign_rseg();
+ }
+
+ rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+
+ trx->id = trx_sys_get_new_trx_id();
+
+	/* The initial value ut_dulint_max for trx->no is used in
+	read_view_open_now. */
+
+ trx->no = ut_dulint_max;
+
+ trx->rseg = rseg;
+
+ trx->conc_state = TRX_ACTIVE;
+ trx->start_time = time(NULL);
+
+ UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE */
+UNIV_INTERN
+ibool
+trx_start(
+/*======*/
+ trx_t* trx, /*!< in: transaction */
+ ulint rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
+ is passed, the system chooses the rollback segment
+ automatically in a round-robin fashion */
+{
+ ibool ret;
+
+	/* Update the info on whether we should skip XA steps that eat
+	CPU time. For the duration of the transaction trx->support_xa is
+	not reread from thd, so any change in the value takes effect only
+	in the next transaction. This avoids a scenario where some undo
+	generated by a transaction has XA information while other undo,
+	generated by the same transaction, does not. */
+ trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+ trx->flush_log_at_trx_commit_session =
+ thd_flush_log_at_trx_commit_session(trx->mysql_thd);
+
+ mutex_enter(&kernel_mutex);
+
+ ret = trx_start_low(trx, rseg_id);
+
+ mutex_exit(&kernel_mutex);
+
+ return(ret);
+}
+
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ page_t* update_hdr_page;
+ ib_uint64_t lsn = 0;
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ mtr_t mtr;
+ trx_sysf_t* sys_header = NULL;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx->must_flush_log_later = FALSE;
+
+ rseg = trx->rseg;
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to some other state: these modifications to the file data
+ structure define the transaction as committed in the file
+ based world, at the serialization point of the log sequence
+ number lsn obtained below. */
+
+ mutex_enter(&(rseg->mutex));
+
+ if (trx->insert_undo != NULL) {
+ trx_undo_set_state_at_finish(
+ rseg, trx, trx->insert_undo, &mtr);
+ }
+
+ undo = trx->update_undo;
+
+ if (undo) {
+ mutex_enter(&kernel_mutex);
+ trx->no = trx_sys_get_new_trx_no();
+
+ mutex_exit(&kernel_mutex);
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction commit for this transaction. */
+
+ update_hdr_page = trx_undo_set_state_at_finish(
+ rseg, trx, undo, &mtr);
+
+ /* We have to do the cleanup for the update log while
+ holding the rseg mutex because update log headers
+ have to be put to the history list in the order of
+ the trx number. */
+
+ trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ /* Update the latest MySQL binlog name and offset info
+ in trx sys header if MySQL binlogging is on or the database
+ server is a MySQL replication slave */
+
+ if (trx->mysql_log_file_name
+ && trx->mysql_log_file_name[0] != '\0') {
+ if (!sys_header) {
+ sys_header = trx_sysf_get(&mtr);
+ }
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_log_file_name,
+ trx->mysql_log_offset,
+ TRX_SYS_MYSQL_LOG_INFO, &mtr);
+ trx->mysql_log_file_name = NULL;
+ }
+
+ if (trx->mysql_master_log_file_name[0] != '\0') {
+ /* This database server is a MySQL replication slave */
+ if (!sys_header) {
+ sys_header = trx_sysf_get(&mtr);
+ }
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_relay_log_file_name,
+ trx->mysql_relay_log_pos,
+ TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr);
+ trx_sys_update_mysql_binlog_offset(
+ sys_header,
+ trx->mysql_master_log_file_name,
+ trx->mysql_master_log_pos,
+ TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
+ trx->mysql_master_log_file_name = "";
+ }
+
+ /* The following call commits the mini-transaction, making the
+ whole transaction committed in the file-based world, at this
+ log sequence number. The transaction becomes 'durable' when
+ we write the log to disk, but in the logical sense the commit
+ in the file-based data structures (undo logs etc.) happens
+ here.
+
+ NOTE that transaction numbers, which are assigned only to
+ transactions with an update undo log, do not necessarily come
+ in exactly the same order as commit lsn's, if the transactions
+ have different rollback segments. To get exactly the same
+ order we should hold the kernel mutex up to this point,
+ adding to the contention of the kernel mutex. However, if
+ a transaction T2 is able to see modifications made by
+ a transaction T1, T2 will always get a bigger transaction
+ number and a bigger commit lsn than T1. */
+
+ /*--------------*/
+ mtr_commit(&mtr);
+ /*--------------*/
+ lsn = mtr.end_lsn;
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ ut_ad(trx->conc_state == TRX_ACTIVE
+ || trx->conc_state == TRX_PREPARED);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ /* The following assignment makes the transaction committed in memory
+ and makes its changes to data visible to other transactions.
+ NOTE that there is a small discrepancy from the strict formal
+ visibility rules here: a human user of the database can see
+ modifications made by another transaction T even before the necessary
+ log segment has been flushed to the disk. If the database happens to
+ crash before the flush, the user has seen modifications from T which
+ will never be a committed transaction. However, any transaction T2
+ which sees the modifications of the committing transaction T, and
+ which also itself makes modifications to the database, will get an lsn
+ larger than the committing transaction T. In the case where the log
+ flush fails, and T never gets committed, also T2 will never get
+ committed. */
+
+ /*--------------------------------------*/
+ trx->conc_state = TRX_COMMITTED_IN_MEMORY;
+ /*--------------------------------------*/
+
+	/* If we release kernel_mutex below and we are still doing
+	recovery, i.e. the background rollback thread is still active,
+	then there is a chance that the rollback thread may see this
+	trx as COMMITTED_IN_MEMORY and go ahead to clean it up by
+	calling trx_cleanup_at_db_startup(). This can happen if we are
+	committing a trx here that was left in the PREPARED state
+	during a crash. Note that the commit or the rollback of a
+	PREPARED trx happens in the recovery thread while the rollback
+	of other transactions happens in the background thread. To
+	avoid this race we unconditionally clear the is_recovered flag
+	of the trx. */
+
+ trx->is_recovered = FALSE;
+
+ lock_release_off_kernel(trx);
+
+ if (trx->global_read_view) {
+ read_view_close(trx->global_read_view);
+ mem_heap_empty(trx->global_read_view_heap);
+ trx->global_read_view = NULL;
+ }
+
+ trx->read_view = NULL;
+
+ if (lsn) {
+ ulint flush_log_at_trx_commit;
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ if (trx->flush_log_at_trx_commit_session == 3) {
+ flush_log_at_trx_commit = srv_flush_log_at_trx_commit;
+ } else {
+ flush_log_at_trx_commit = trx->flush_log_at_trx_commit_session;
+ }
+
+		/* NOTE that we could possibly make a group commit more
+		efficient here: call os_thread_yield here to allow other
+		trxs to reach their commit as well! */
+
+ /*-------------------------------------*/
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the transaction durable if
+ the OS does not crash. We may also flush the log files to
+ disk, making the transaction durable also at an OS crash or a
+ power outage.
+
+ The idea in InnoDB's group commit is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which commits the whole
+ group. Note that this group commit will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+		If we are calling trx_commit() under prepare_commit_mutex, we
+		will delay the possible log write and flush to a separate
+		function trx_commit_complete_for_mysql(), which is only called
+		when the thread has released the mutex. This is to make the
+		group commit algorithm work. Otherwise, the prepare_commit
+		mutex would serialize all commits and prevent a group of
+		transactions from gathering. */
+
+ if (trx->flush_log_later) {
+ /* Do nothing yet */
+ trx->must_flush_log_later = TRUE;
+ } else if (flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ FALSE);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ trx->commit_lsn = lsn;
+
+ /*-------------------------------------*/
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ /* Free all savepoints */
+ trx_roll_free_all_savepoints(trx);
+
+ trx->conc_state = TRX_NOT_STARTED;
+ trx->rseg = NULL;
+ trx->undo_no = ut_dulint_zero;
+ trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+
+ ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+ ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
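+
+/* The flush decision taken at the end of trx_commit_off_kernel() can be
+summarized by the sketch below. It is not part of the original patch; the
+helper name flush_redo_up_to is hypothetical, but the globals and
+log_write_up_to() are the ones used above. A session value of 3 means
+"fall back to the global srv_flush_log_at_trx_commit". */
+#if 0
+static
+void
+flush_redo_up_to(
+/*=============*/
+	ib_uint64_t	lsn,		/* in: commit lsn */
+	ulint		session_val)	/* in: session flush setting */
+{
+	ulint	setting = (session_val == 3)
+		? srv_flush_log_at_trx_commit : session_val;
+
+	if (setting == 0) {
+		/* rely on the background log flush only */
+	} else if (setting == 1) {
+		/* write the log; fsync unless fsync is disabled */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+				srv_unix_file_flush_method
+				!= SRV_UNIX_NOSYNC);
+	} else if (setting == 2) {
+		/* write to the OS cache but do not fsync */
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+	}
+}
+#endif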
+
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ if (trx->insert_undo != NULL) {
+
+ trx_undo_insert_cleanup(trx);
+ }
+
+ trx->conc_state = TRX_NOT_STARTED;
+ trx->rseg = NULL;
+ trx->undo_no = ut_dulint_zero;
+ trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+
+ UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
+
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a newly started transaction.
+@return consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+ trx_t* trx) /*!< in: active transaction */
+{
+ ut_ad(trx->conc_state == TRX_ACTIVE);
+
+ if (trx->read_view) {
+ return(trx->read_view);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ if (!trx->read_view) {
+ trx->read_view = read_view_open_now(
+ trx->id, trx->global_read_view_heap);
+ trx->global_read_view = trx->read_view;
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ return(trx->read_view);
+}
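+
+/* Not part of the original patch: a two-line usage sketch showing that
+repeated calls within one transaction hand back the same read view object;
+only the first call creates it. The locals v1 and v2 are hypothetical. */
+#if 0
+	read_view_t*	v1 = trx_assign_read_view(trx);
+	read_view_t*	v2 = trx_assign_read_view(trx);
+
+	ut_a(v1 == v2);
+#endif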
+
+/****************************************************************//**
+Commits a transaction. NOTE that the kernel mutex is temporarily released. */
+static
+void
+trx_handle_commit_sig_off_kernel(
+/*=============================*/
+ trx_t* trx, /*!< in: transaction */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ trx_sig_t* sig;
+ trx_sig_t* next_sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ trx->que_state = TRX_QUE_COMMITTING;
+
+ trx_commit_off_kernel(trx);
+
+ ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+ /* Remove all TRX_SIG_COMMIT signals from the signal queue and send
+ reply messages to them */
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ while (sig != NULL) {
+ next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+ if (sig->type == TRX_SIG_COMMIT) {
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+ }
+
+ sig = next_sig;
+ }
+
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. */
+UNIV_INTERN
+void
+trx_end_lock_wait(
+/*==============*/
+ trx_t* trx) /*!< in: transaction */
+{
+ que_thr_t* thr;
+ ulint sec;
+ ulint ms;
+ ib_uint64_t now;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+
+ while (thr != NULL) {
+ que_thr_end_wait_no_next_thr(thr);
+
+ UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+ }
+
+ if (innobase_get_slow_log() && trx->take_stats) {
+ ut_usectime(&sec, &ms);
+ now = (ib_uint64_t)sec * 1000000 + ms;
+ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
+ }
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+Moves the query threads in the lock wait list to the SUSPENDED state and puts
+the transaction to the TRX_QUE_RUNNING state. */
+static
+void
+trx_lock_wait_to_suspended(
+/*=======================*/
+ trx_t* trx) /*!< in: transaction in the TRX_QUE_LOCK_WAIT state */
+{
+ que_thr_t* thr;
+ ulint sec;
+ ulint ms;
+ ib_uint64_t now;
+
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+
+ while (thr != NULL) {
+ thr->state = QUE_THR_SUSPENDED;
+
+ UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
+
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+ }
+
+ if (innobase_get_slow_log() && trx->take_stats) {
+ ut_usectime(&sec, &ms);
+ now = (ib_uint64_t)sec * 1000000 + ms;
+ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
+ }
+ trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+Moves the query threads in the sig reply wait list of trx to the SUSPENDED
+state. */
+static
+void
+trx_sig_reply_wait_to_suspended(
+/*============================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ trx_sig_t* sig;
+ que_thr_t* thr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sig = UT_LIST_GET_FIRST(trx->reply_signals);
+
+ while (sig != NULL) {
+ thr = sig->receiver;
+
+ ut_ad(thr->state == QUE_THR_SIG_REPLY_WAIT);
+
+ thr->state = QUE_THR_SUSPENDED;
+
+ sig->receiver = NULL;
+
+ UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
+
+ sig = UT_LIST_GET_FIRST(trx->reply_signals);
+ }
+}
+
+/*****************************************************************//**
+Checks the compatibility of a new signal with the other signals in the
+queue.
+@return TRUE if the signal can be queued */
+static
+ibool
+trx_sig_is_compatible(
+/*==================*/
+ trx_t* trx, /*!< in: trx handle */
+ ulint type, /*!< in: signal type */
+ ulint sender) /*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
+{
+ trx_sig_t* sig;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+ return(TRUE);
+ }
+
+ if (sender == TRX_SIG_SELF) {
+ if (type == TRX_SIG_ERROR_OCCURRED) {
+
+ return(TRUE);
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ return(TRUE);
+ } else {
+ return(FALSE);
+ }
+ }
+
+ ut_ad(sender == TRX_SIG_OTHER_SESS);
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+
+ if (type == TRX_SIG_COMMIT) {
+ while (sig != NULL) {
+
+ if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+ return(FALSE);
+ }
+
+ sig = UT_LIST_GET_NEXT(signals, sig);
+ }
+
+ return(TRUE);
+
+ } else if (type == TRX_SIG_TOTAL_ROLLBACK) {
+ while (sig != NULL) {
+
+ if (sig->type == TRX_SIG_COMMIT) {
+
+ return(FALSE);
+ }
+
+ sig = UT_LIST_GET_NEXT(signals, sig);
+ }
+
+ return(TRUE);
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ return(TRUE);
+ } else {
+ ut_error;
+
+ return(FALSE);
+ }
+}
+
+/****************************************************************//**
+Sends a signal to a trx object. */
+UNIV_INTERN
+void
+trx_sig_send(
+/*=========*/
+ trx_t* trx, /*!< in: trx handle */
+ ulint type, /*!< in: signal type */
+ ulint sender, /*!< in: TRX_SIG_SELF or
+ TRX_SIG_OTHER_SESS */
+ que_thr_t* receiver_thr, /*!< in: query thread which wants the
+ reply, or NULL; if type is
+ TRX_SIG_END_WAIT, this must be NULL */
+ trx_savept_t* savept, /*!< in: possible rollback savepoint, or
+ NULL */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ trx_t* receiver_trx;
+
+ ut_ad(trx);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (!trx_sig_is_compatible(trx, type, sender)) {
+ /* The signal is not compatible with the other signals in
+ the queue: die */
+
+ ut_error;
+ }
+
+ /* Queue the signal object */
+
+ if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+ /* The signal list is empty: the 'sig' slot must be unused
+ (we improve performance a bit by avoiding mem_alloc) */
+ sig = &(trx->sig);
+ } else {
+ /* It might be that the 'sig' slot is unused also in this
+ case, but we choose the easy way of using mem_alloc */
+
+ sig = mem_alloc(sizeof(trx_sig_t));
+ }
+
+ UT_LIST_ADD_LAST(signals, trx->signals, sig);
+
+ sig->type = type;
+ sig->sender = sender;
+ sig->receiver = receiver_thr;
+
+ if (savept) {
+ sig->savept = *savept;
+ }
+
+ if (receiver_thr) {
+ receiver_trx = thr_get_trx(receiver_thr);
+
+ UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
+ sig);
+ }
+
+ if (trx->sess->state == SESS_ERROR) {
+
+ trx_sig_reply_wait_to_suspended(trx);
+ }
+
+ if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
+ ut_error;
+ }
+
+ /* If there were no other signals ahead in the queue, try to start
+ handling of the signal */
+
+ if (UT_LIST_GET_FIRST(trx->signals) == sig) {
+
+ trx_sig_start_handle(trx, next_thr);
+ }
+}
+
+/****************************************************************//**
+Ends signal handling. If the session is in the error state, and
+trx->graph_before_signal_handling != NULL, then returns control to the error
+handling routine of the graph (currently just returns control to the
+graph root, which will then send an error message to the client).
+UNIV_INTERN
+void
+trx_end_signal_handling(
+/*====================*/
+ trx_t* trx) /*!< in: trx */
+{
+ ut_ad(mutex_own(&kernel_mutex));
+ ut_ad(trx->handling_signals == TRUE);
+
+ trx->handling_signals = FALSE;
+
+ trx->graph = trx->graph_before_signal_handling;
+
+ if (trx->graph && (trx->sess->state == SESS_ERROR)) {
+
+ que_fork_error_handle(trx, trx->graph);
+ }
+}
+
+/****************************************************************//**
+Starts handling of a trx signal. */
+UNIV_INTERN
+void
+trx_sig_start_handle(
+/*=================*/
+ trx_t* trx, /*!< in: trx handle */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread; if the parameter
+ is NULL, it is ignored */
+{
+ trx_sig_t* sig;
+ ulint type;
+loop:
+ /* We loop in this function body as long as there are queued signals
+ we can process immediately */
+
+ ut_ad(trx);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
+
+ trx_end_signal_handling(trx);
+
+ return;
+ }
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+
+ trx_start_low(trx, ULINT_UNDEFINED);
+ }
+
+ /* If the trx is in a lock wait state, moves the waiting query threads
+ to the suspended state */
+
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+ trx_lock_wait_to_suspended(trx);
+ }
+
+ /* If the session is in the error state and this trx has threads
+ waiting for reply from signals, moves these threads to the suspended
+ state, canceling wait reservations; note that if the transaction has
+ sent a commit or rollback signal to itself, and its session is not in
+ the error state, then nothing is done here. */
+
+ if (trx->sess->state == SESS_ERROR) {
+ trx_sig_reply_wait_to_suspended(trx);
+ }
+
+ /* If there are no running query threads, we can start processing of a
+ signal, otherwise we have to wait until all query threads of this
+ transaction are aware of the arrival of the signal. */
+
+ if (trx->n_active_thrs > 0) {
+
+ return;
+ }
+
+ if (trx->handling_signals == FALSE) {
+ trx->graph_before_signal_handling = trx->graph;
+
+ trx->handling_signals = TRUE;
+ }
+
+ sig = UT_LIST_GET_FIRST(trx->signals);
+ type = sig->type;
+
+ if (type == TRX_SIG_COMMIT) {
+
+ trx_handle_commit_sig_off_kernel(trx, next_thr);
+
+ } else if ((type == TRX_SIG_TOTAL_ROLLBACK)
+ || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
+
+ trx_rollback(trx, sig, next_thr);
+
+ /* No further signals can be handled until the rollback
+ completes, therefore we return */
+
+ return;
+
+ } else if (type == TRX_SIG_ERROR_OCCURRED) {
+
+ trx_rollback(trx, sig, next_thr);
+
+ /* No further signals can be handled until the rollback
+ completes, therefore we return */
+
+ return;
+
+ } else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+ trx_sig_reply(sig, next_thr);
+ trx_sig_remove(trx, sig);
+ } else {
+ ut_error;
+ }
+
+ goto loop;
+}
+
+/****************************************************************//**
+Send the reply message when a signal in the queue of the trx has been
+handled. */
+UNIV_INTERN
+void
+trx_sig_reply(
+/*==========*/
+ trx_sig_t* sig, /*!< in: signal */
+ que_thr_t** next_thr) /*!< in/out: next query thread to run;
+ if the value which is passed in is
+ a pointer to a NULL pointer, then the
+ calling function can start running
+ a new query thread */
+{
+ trx_t* receiver_trx;
+
+ ut_ad(sig);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ if (sig->receiver != NULL) {
+ ut_ad((sig->receiver)->state == QUE_THR_SIG_REPLY_WAIT);
+
+ receiver_trx = thr_get_trx(sig->receiver);
+
+ UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals,
+ sig);
+ ut_ad(receiver_trx->sess->state != SESS_ERROR);
+
+ que_thr_end_wait(sig->receiver, next_thr);
+
+ sig->receiver = NULL;
+
+ }
+}
+
+/****************************************************************//**
+Removes a signal object from the trx signal queue. */
+UNIV_INTERN
+void
+trx_sig_remove(
+/*===========*/
+ trx_t* trx, /*!< in: trx handle */
+ trx_sig_t* sig) /*!< in, own: signal */
+{
+ ut_ad(trx && sig);
+ ut_ad(mutex_own(&kernel_mutex));
+
+ ut_ad(sig->receiver == NULL);
+
+ UT_LIST_REMOVE(signals, trx->signals, sig);
+ sig->type = 0; /* reset the field to catch possible bugs */
+
+ if (sig != &(trx->sig)) {
+ mem_free(sig);
+ }
+}
+
+/*********************************************************************//**
+Creates a commit command node struct.
+@return own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+commit_node_create(
+/*===============*/
+ mem_heap_t* heap) /*!< in: mem heap where created */
+{
+ commit_node_t* node;
+
+ node = mem_heap_alloc(heap, sizeof(commit_node_t));
+ node->common.type = QUE_NODE_COMMIT;
+ node->state = COMMIT_NODE_SEND;
+
+ return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+ que_thr_t* thr) /*!< in: query thread */
+{
+ commit_node_t* node;
+ que_thr_t* next_thr;
+
+ node = thr->run_node;
+
+ ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+ if (thr->prev_node == que_node_get_parent(node)) {
+ node->state = COMMIT_NODE_SEND;
+ }
+
+ if (node->state == COMMIT_NODE_SEND) {
+ mutex_enter(&kernel_mutex);
+
+ node->state = COMMIT_NODE_WAIT;
+
+ next_thr = NULL;
+
+ thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+ /* Send the commit signal to the transaction */
+
+ trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
+ thr, NULL, &next_thr);
+
+ mutex_exit(&kernel_mutex);
+
+ return(next_thr);
+ }
+
+ ut_ad(node->state == COMMIT_NODE_WAIT);
+
+ node->state = COMMIT_NODE_SEND;
+
+ thr->run_node = que_node_get_parent(node);
+
+ return(thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ /* Because we do not do the commit by sending an Innobase
+	sig to the transaction, we must make sure here that the trx has
+	been started. */
+
+ ut_a(trx);
+
+ trx_start_if_not_started(trx);
+
+ trx->op_info = "committing";
+
+ mutex_enter(&kernel_mutex);
+
+ trx_commit_off_kernel(trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->op_info = "";
+
+ return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ib_uint64_t lsn = trx->commit_lsn;
+ ulint flush_log_at_trx_commit;
+
+ ut_a(trx);
+
+ trx->op_info = "flushing log";
+
+ if (trx->flush_log_at_trx_commit_session == 3) {
+ flush_log_at_trx_commit = srv_flush_log_at_trx_commit;
+ } else {
+ flush_log_at_trx_commit = trx->flush_log_at_trx_commit_session;
+ }
+
+ if (!trx->must_flush_log_later) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ /* Write the log to the log files AND flush them to
+ disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ trx->must_flush_log_later = FALSE;
+
+ trx->op_info = "";
+
+ return(0);
+}
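+
+/* Illustrative caller sketch, not part of the original patch: when
+committing under an external prepare/commit mutex, the log write is
+deferred so that the mutex is not held across the disk wait, which is the
+precondition for group commit described in trx_commit_off_kernel(). The
+caller-side mutex handling shown here is an assumption, not taken from
+this file. */
+#if 0
+	trx->flush_log_later = TRUE;
+	trx_commit_for_mysql(trx);	/* commit, but defer the log flush */
+	trx->flush_log_later = FALSE;
+
+	/* ... release the external prepare/commit mutex here ... */
+
+	trx_commit_complete_for_mysql(trx);	/* now write/flush the log */
+#endif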
+
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ ut_a(trx);
+
+ if (trx->conc_state == TRX_NOT_STARTED) {
+ trx->undo_no = ut_dulint_zero;
+ }
+
+ trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+}
+
+/**********************************************************************//**
+Prints info about a transaction to the given file. The caller must own the
+kernel mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+ FILE* f, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ ulint max_query_len) /*!< in: max query length to print, or 0 to
+ use the default max length */
+{
+ ibool newline;
+
+ fprintf(f, "TRANSACTION " TRX_ID_FMT, TRX_ID_PREP_PRINTF(trx->id));
+
+ switch (trx->conc_state) {
+ case TRX_NOT_STARTED:
+ fputs(", not started", f);
+ break;
+ case TRX_ACTIVE:
+ fprintf(f, ", ACTIVE %lu sec",
+ (ulong)difftime(time(NULL), trx->start_time));
+ break;
+ case TRX_PREPARED:
+ fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+ (ulong)difftime(time(NULL), trx->start_time));
+ break;
+ case TRX_COMMITTED_IN_MEMORY:
+ fputs(", COMMITTED IN MEMORY", f);
+ break;
+ default:
+ fprintf(f, " state %lu", (ulong) trx->conc_state);
+ }
+
+#ifdef UNIV_LINUX
+ fprintf(f, ", process no %lu", trx->mysql_process_no);
+#endif
+ fprintf(f, ", OS thread id %lu",
+ (ulong) os_thread_pf(trx->mysql_thread_id));
+
+ if (*trx->op_info) {
+ putc(' ', f);
+ fputs(trx->op_info, f);
+ }
+
+ if (trx->is_recovered) {
+ fputs(" recovered trx", f);
+ }
+
+ if (trx->is_purge) {
+ fputs(" purge trx", f);
+ }
+
+ if (trx->declared_to_be_inside_innodb) {
+ fprintf(f, ", thread declared inside InnoDB %lu",
+ (ulong) trx->n_tickets_to_enter_innodb);
+ }
+
+ putc('\n', f);
+
+ if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+ fprintf(f, "mysql tables in use %lu, locked %lu\n",
+ (ulong) trx->n_mysql_tables_in_use,
+ (ulong) trx->mysql_n_tables_locked);
+ }
+
+ newline = TRUE;
+
+ switch (trx->que_state) {
+ case TRX_QUE_RUNNING:
+ newline = FALSE; break;
+ case TRX_QUE_LOCK_WAIT:
+ fputs("LOCK WAIT ", f); break;
+ case TRX_QUE_ROLLING_BACK:
+ fputs("ROLLING BACK ", f); break;
+ case TRX_QUE_COMMITTING:
+ fputs("COMMITTING ", f); break;
+ default:
+ fprintf(f, "que state %lu ", (ulong) trx->que_state);
+ }
+
+ if (0 < UT_LIST_GET_LEN(trx->trx_locks)
+ || mem_heap_get_size(trx->lock_heap) > 400) {
+ newline = TRUE;
+
+ fprintf(f, "%lu lock struct(s), heap size %lu,"
+ " %lu row lock(s)",
+ (ulong) UT_LIST_GET_LEN(trx->trx_locks),
+ (ulong) mem_heap_get_size(trx->lock_heap),
+ (ulong) lock_number_of_rows_locked(trx));
+ }
+
+ if (trx->has_search_latch) {
+ newline = TRUE;
+ fputs(", holds adaptive hash latch", f);
+ }
+
+ if (!ut_dulint_is_zero(trx->undo_no)) {
+ newline = TRUE;
+ fprintf(f, ", undo log entries %lu",
+ (ulong) ut_dulint_get_low(trx->undo_no));
+ }
+
+ if (newline) {
+ putc('\n', f);
+ }
+
+ if (trx->mysql_thd != NULL) {
+ innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
+ }
+}
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return <0, 0 or >0; similar to strcmp(3) */
+UNIV_INTERN
+int
+trx_weight_cmp(
+/*===========*/
+ const trx_t* a, /*!< in: the first transaction to be compared */
+ const trx_t* b) /*!< in: the second transaction to be compared */
+{
+ ibool a_notrans_edit;
+ ibool b_notrans_edit;
+
+ /* If mysql_thd is NULL for a transaction we assume that it has
+ not edited non-transactional tables. */
+
+ a_notrans_edit = a->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(a->mysql_thd);
+
+ b_notrans_edit = b->mysql_thd != NULL
+ && thd_has_edited_nontrans_tables(b->mysql_thd);
+
+ if (a_notrans_edit && !b_notrans_edit) {
+
+ return(1);
+ }
+
+ if (!a_notrans_edit && b_notrans_edit) {
+
+ return(-1);
+ }
+
+	/* Either both have edited non-transactional tables or neither
+	has; we fall back to comparing the number of altered/locked
+	rows. */
+
+#if 0
+ fprintf(stderr,
+ "%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
+ __func__,
+ ut_conv_dulint_to_longlong(a->undo_no),
+ UT_LIST_GET_LEN(a->trx_locks),
+ ut_conv_dulint_to_longlong(b->undo_no),
+ UT_LIST_GET_LEN(b->trx_locks));
+#endif
+
+ return(ut_dulint_cmp(TRX_WEIGHT(a), TRX_WEIGHT(b)));
+}
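+
+/* Hypothetical usage sketch, not part of the original patch: a caller
+resolving a conflict between two transactions trx_a and trx_b could keep
+the "heavier" one and treat the lighter one as the victim. */
+#if 0
+	const trx_t*	victim = (trx_weight_cmp(trx_a, trx_b) <= 0)
+		? trx_a : trx_b;
+#endif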
+
+/****************************************************************//**
+Prepares a transaction. */
+UNIV_INTERN
+void
+trx_prepare_off_kernel(
+/*===================*/
+ trx_t* trx) /*!< in: transaction */
+{
+ page_t* update_hdr_page;
+ trx_rseg_t* rseg;
+ ib_uint64_t lsn = 0;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ rseg = trx->rseg;
+
+ if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+ mutex_exit(&kernel_mutex);
+
+ mtr_start(&mtr);
+
+ /* Change the undo log segment states from TRX_UNDO_ACTIVE
+ to TRX_UNDO_PREPARED: these modifications to the file data
+ structure define the transaction as prepared in the
+ file-based world, at the serialization point of lsn. */
+
+ mutex_enter(&(rseg->mutex));
+
+ if (trx->insert_undo != NULL) {
+
+ /* It is not necessary to obtain trx->undo_mutex here
+ because only a single OS thread is allowed to do the
+ transaction prepare for this transaction. */
+
+ trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+ &mtr);
+ }
+
+ if (trx->update_undo) {
+ update_hdr_page = trx_undo_set_state_at_prepare(
+ trx, trx->update_undo, &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+
+ /*--------------*/
+ mtr_commit(&mtr); /* This mtr commit makes the
+ transaction prepared in the file-based
+ world */
+ /*--------------*/
+ lsn = mtr.end_lsn;
+
+ mutex_enter(&kernel_mutex);
+ }
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ /*--------------------------------------*/
+ trx->conc_state = TRX_PREPARED;
+ /*--------------------------------------*/
+
+ if (lsn) {
+ ulint flush_log_at_trx_commit;
+
+ /* Depending on the my.cnf options, we may now write the log
+ buffer to the log files, making the prepared state of the
+ transaction durable if the OS does not crash. We may also
+ flush the log files to disk, making the prepared state of the
+ transaction durable also at an OS crash or a power outage.
+
+ The idea in InnoDB's group prepare is that a group of
+ transactions gather behind a trx doing a physical disk write
+ to log files, and when that physical write has been completed,
+ one of those transactions does a write which prepares the whole
+ group. Note that this group prepare will only bring benefit if
+ there are > 2 users in the database. Then at least 2 users can
+ gather behind one doing the physical log write to disk.
+
+ TODO: find out if MySQL holds some mutex when calling this.
+ That would spoil our group prepare algorithm. */
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx->flush_log_at_trx_commit_session == 3) {
+ flush_log_at_trx_commit = srv_flush_log_at_trx_commit;
+ } else {
+ flush_log_at_trx_commit = trx->flush_log_at_trx_commit_session;
+ }
+
+ if (flush_log_at_trx_commit == 0) {
+ /* Do nothing */
+ } else if (flush_log_at_trx_commit == 1) {
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+ FALSE);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+ }
+ } else if (flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+ } else {
+ ut_error;
+ }
+
+ mutex_enter(&kernel_mutex);
+ }
+}
+
+/**********************************************************************//**
+Does the transaction prepare for MySQL.
+@return 0 or error number */
+UNIV_INTERN
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+ trx_t* trx) /*!< in: trx handle */
+{
+ /* Because we do not do the prepare by sending an Innobase
+	sig to the transaction, we must make sure here that the trx has
+	been started. */
+
+ ut_a(trx);
+
+ trx->op_info = "preparing";
+
+ trx_start_if_not_started(trx);
+
+ mutex_enter(&kernel_mutex);
+
+ trx_prepare_off_kernel(trx);
+
+ mutex_exit(&kernel_mutex);
+
+ trx->op_info = "";
+
+ return(0);
+}
+
+/**********************************************************************//**
+This function is used to find the number of prepared transactions and
+their transaction objects for recovery.
+@return number of prepared transactions stored in xid_list */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+ XID* xid_list, /*!< in/out: prepared transactions */
+ ulint len) /*!< in: number of slots in xid_list */
+{
+ trx_t* trx;
+ ulint count = 0;
+
+ ut_ad(xid_list);
+ ut_ad(len);
+
+	/* We copy those transactions which are in the prepared state
+	into xid_list */
+
+ mutex_enter(&kernel_mutex);
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx) {
+ if (trx->conc_state == TRX_PREPARED) {
+ xid_list[count] = trx->xid;
+
+ if (count == 0) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Starting recovery for"
+ " XA transactions...\n");
+ }
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Transaction " TRX_ID_FMT " in"
+ " prepared state after recovery\n",
+ TRX_ID_PREP_PRINTF(trx->id));
+
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Transaction contains changes"
+ " to %lu rows\n",
+ (ulong) ut_conv_dulint_to_longlong(
+ trx->undo_no));
+
+ count++;
+
+ if (count == len) {
+ break;
+ }
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (count > 0){
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: %lu transactions in prepared state"
+ " after recovery\n",
+ (ulong) count);
+ }
+
+ return ((int) count);
+}
+
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return trx or NULL */
+UNIV_INTERN
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+ XID* xid) /*!< in: X/Open XA transaction identification */
+{
+ trx_t* trx;
+
+ if (xid == NULL) {
+
+ return (NULL);
+ }
+
+ mutex_enter(&kernel_mutex);
+
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+ while (trx) {
+		/* Compare two X/Open XA transaction ids: their
+		lengths should be the same, and a binary comparison
+		of gtrid_length + bqual_length bytes should show
+		them equal */
+
+ if (xid->gtrid_length == trx->xid.gtrid_length
+ && xid->bqual_length == trx->xid.bqual_length
+ && memcmp(xid->data, trx->xid.data,
+ xid->gtrid_length + xid->bqual_length) == 0) {
+ break;
+ }
+
+ trx = UT_LIST_GET_NEXT(trx_list, trx);
+ }
+
+ mutex_exit(&kernel_mutex);
+
+ if (trx) {
+ if (trx->conc_state != TRX_PREPARED) {
+
+ return(NULL);
+ }
+
+ return(trx);
+ } else {
+ return(NULL);
+ }
+}
diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c
new file mode 100644
index 00000000000..ec4beb5660a
--- /dev/null
+++ b/storage/xtradb/trx/trx0undo.c
@@ -0,0 +1,2032 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.c
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+#ifdef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#ifndef UNIV_HOTBACKUP
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+
+/* How should the old versions in the history list be managed?
+ ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+ However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+ A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+ When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may also have to
+purge old versions which might be needed by some consistent read.
+How do we trigger the start of a purge? When a transaction writes to an
+undo log, it may notice that the space is running out. When a read view
+is closed, it may make some history superfluous. The server can have a
+utility which periodically checks if it can purge some history.
+	In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+ We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
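+
+/* Sketch only, not part of the original patch: purge has to process undo
+logs in trx number order, so conceptually it scans all rollback segments
+and picks the history list whose oldest log has the smallest trx no. The
+helper name and the rseg fields last_page_no/last_trx_no used below are
+assumptions made for illustration. */
+#if 0
+static
+trx_rseg_t*
+purge_pick_rseg_with_smallest_trx_no(void)
+/*======================================*/
+{
+	trx_rseg_t*	rseg;
+	trx_rseg_t*	best = NULL;
+
+	for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+	     rseg != NULL;
+	     rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
+
+		if (rseg->last_page_no == FIL_NULL) {
+			continue;	/* empty history list */
+		}
+
+		if (best == NULL
+		    || ut_dulint_cmp(rseg->last_trx_no,
+				     best->last_trx_no) < 0) {
+			best = rseg;
+		}
+	}
+
+	return(best);
+}
+#endif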
+
+/* How to protect rollback segments, undo logs, and history lists with
+ -------------------------------------------------------------------
+latches?
+-------
+The contention of the kernel mutex should be minimized. When a transaction
+does its first insert or modify in an index, an undo log is assigned for it.
+Then we must have an x-latch to the rollback segment header.
+	When the transaction does more modifications or rolls back, the undo
+log is
+protected with undo_mutex in the transaction.
+ When the transaction commits, its insert undo log is either reset and
+cached for a fast reuse, or freed. In these cases we must have an x-latch on
+the rollback segment page. The update undo log is put to the history list. If
+it is not suitable for reuse, its slot in the rollback segment is reset. In
+both cases, an x-latch must be acquired on the rollback segment.
+ The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+ page_t* undo_page, /*!< in: undo log segment page */
+ ulint type, /*!< in: undo log segment type */
+ mtr_t* mtr); /*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset);/*!< in: undo log header byte offset on page */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ page_t* undo_page, /*!< in/out: insert undo log segment
+ header page, x-latched */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr); /*!< in: mtr */
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /*!< in: header page of an undo log of size 1 */
+ mtr_t* mtr); /*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Gets the previous record in an undo log from the previous page.
+@return undo log record, the page s-latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(
+/*=================================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ ulint prev_page_no;
+ page_t* prev_page;
+ page_t* undo_page;
+
+ undo_page = page_align(rec);
+
+ prev_page_no = flst_get_prev_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+
+ if (prev_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ space = page_get_space_id(undo_page);
+ zip_size = fil_space_get_zip_size(space);
+
+ prev_page = trx_undo_page_get_s_latched(space, zip_size,
+ prev_page_no, mtr);
+
+ return(trx_undo_page_get_last_rec(prev_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_rec_t* prev_rec;
+
+ prev_rec = trx_undo_page_get_prev_rec(rec, page_no, offset);
+
+ if (prev_rec) {
+
+ return(prev_rec);
+ }
+
+ /* We have to go to the previous undo log page to look for the
+ previous record */
+
+ return(trx_undo_get_prev_rec_from_prev_page(rec, page_no, offset,
+ mtr));
+}
+
+/***********************************************************************//**
+Gets the next record in an undo log from the next page.
+@return undo log record, the page latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(
+/*=================================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ page_t* undo_page, /*!< in: undo log page */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latch mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_ulogf_t* log_hdr;
+ ulint next_page_no;
+ page_t* next_page;
+ ulint next;
+
+ if (page_no == page_get_page_no(undo_page)) {
+
+ log_hdr = undo_page + offset;
+ next = mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG);
+
+ if (next != 0) {
+
+ return(NULL);
+ }
+ }
+
+ next_page_no = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_NODE, mtr)
+ .page;
+ if (next_page_no == FIL_NULL) {
+
+ return(NULL);
+ }
+
+ if (mode == RW_S_LATCH) {
+ next_page = trx_undo_page_get_s_latched(space, zip_size,
+ next_page_no, mtr);
+ } else {
+ ut_ad(mode == RW_X_LATCH);
+ next_page = trx_undo_page_get(space, zip_size,
+ next_page_no, mtr);
+ }
+
+ return(trx_undo_page_get_first_rec(next_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+ trx_undo_rec_t* rec, /*!< in: undo record */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint space;
+ ulint zip_size;
+ trx_undo_rec_t* next_rec;
+
+ next_rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+
+ if (next_rec) {
+ return(next_rec);
+ }
+
+ space = page_get_space_id(page_align(rec));
+ zip_size = fil_space_get_zip_size(space);
+
+ return(trx_undo_get_next_rec_from_next_page(space, zip_size,
+ page_align(rec),
+ page_no, offset,
+ RW_S_LATCH, mtr));
+}
+
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+ ulint space, /*!< in: undo log header space */
+ ulint zip_size,/*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset, /*!< in: undo log header offset on page */
+ ulint mode, /*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ trx_undo_rec_t* rec;
+
+ if (mode == RW_S_LATCH) {
+ undo_page = trx_undo_page_get_s_latched(space, zip_size,
+ page_no, mtr);
+ } else {
+ undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+ }
+
+ rec = trx_undo_page_get_first_rec(undo_page, page_no, offset);
+
+ if (rec) {
+ return(rec);
+ }
+
+ return(trx_undo_get_next_rec_from_next_page(space, zip_size,
+ undo_page, page_no, offset,
+ mode, mtr));
+}
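+
+/* Not part of the original patch: a sketch of how the record accessors
+above can be combined to walk one undo log. The helper name undo_log_scan
+is hypothetical. The pages stay s-latched by the mini-transaction for the
+whole walk; a real caller would normally commit and restart the mtr
+periodically to limit the number of latches held. */
+#if 0
+static
+void
+undo_log_scan(
+/*==========*/
+	ulint	space,	/* in: undo log header space */
+	ulint	zip_size,/* in: compressed page size, or 0 */
+	ulint	page_no,/* in: undo log header page number */
+	ulint	offset)	/* in: undo log header offset on page */
+{
+	mtr_t		mtr;
+	trx_undo_rec_t*	rec;
+
+	mtr_start(&mtr);
+
+	rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
+				     RW_S_LATCH, &mtr);
+	while (rec != NULL) {
+		/* ... inspect the record here ... */
+
+		rec = trx_undo_get_next_rec(rec, page_no, offset, &mtr);
+	}
+
+	mtr_commit(&mtr);
+}
+#endif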
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log page initialization. */
+UNIV_INLINE
+void
+trx_undo_page_init_log(
+/*===================*/
+ page_t* undo_page, /*!< in: undo log page */
+ ulint type, /*!< in: undo log type */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr);
+
+ mlog_catenate_ulint_compressed(mtr, type);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_page_init_log(undo_page,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ulint type;
+
+ ptr = mach_parse_compressed(ptr, end_ptr, &type);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ trx_undo_page_init(page, type, mtr);
+ }
+
+ return(ptr);
+}
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+ page_t* undo_page, /*!< in: undo log segment page */
+ ulint type, /*!< in: undo log segment type */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+
+ fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG);
+
+ trx_undo_page_init_log(undo_page, type, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Creates a new undo log segment in file.
+@return DB_SUCCESS if page creation was OK; possible error codes are
+DB_TOO_MANY_CONCURRENT_TRXS and DB_OUT_OF_FILE_SPACE */
+static
+ulint
+trx_undo_seg_create(
+/*================*/
+ trx_rseg_t* rseg __attribute__((unused)),/*!< in: rollback segment */
+ trx_rsegf_t* rseg_hdr,/*!< in: rollback segment header, page
+ x-latched */
+ ulint type, /*!< in: type of the segment: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ ulint* id, /*!< out: slot index within rseg header */
+ page_t** undo_page,
+ /*!< out: segment header page x-latched, NULL
+ if there was an error */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ ulint slot_no;
+ ulint space;
+ buf_block_t* block;
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ ulint n_reserved;
+ ibool success;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(mtr && id && rseg_hdr);
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ /* fputs(type == TRX_UNDO_INSERT
+ ? "Creating insert undo log segment\n"
+ : "Creating update undo log segment\n", stderr); */
+ slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr);
+
+ if (slot_no == ULINT_UNDEFINED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Warning: cannot find a free slot for"
+ " an undo log. Do you have too\n"
+ "InnoDB: many active transactions"
+ " running concurrently?\n");
+
+ return(DB_TOO_MANY_CONCURRENT_TRXS);
+ }
+
+ space = page_get_space_id(page_align(rseg_hdr));
+
+ success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+ mtr);
+ if (!success) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ /* Allocate a new file segment for the undo log */
+ block = fseg_create_general(space, 0,
+ TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER, TRUE, mtr);
+
+ fil_space_release_free_extents(space, n_reserved);
+
+ if (block == NULL) {
+ /* No space left */
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+ *undo_page = buf_block_get_frame(block);
+
+ page_hdr = *undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = *undo_page + TRX_UNDO_SEG_HDR;
+
+ trx_undo_page_init(*undo_page, type, mtr);
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE,
+ TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr);
+
+ flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr);
+
+ flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST,
+ page_hdr + TRX_UNDO_PAGE_NODE, mtr);
+
+ trx_rsegf_set_nth_undo(rseg_hdr, slot_no,
+ page_get_page_no(*undo_page), mtr);
+ *id = slot_no;
+
+ return(err);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header initialization. */
+UNIV_INLINE
+void
+trx_undo_header_create_log(
+/*=======================*/
+ const page_t* undo_page, /*!< in: undo log header page */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr);
+
+ mlog_catenate_dulint_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_header_create_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a new undo log header in file. NOTE that this function has its own
+log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of
+this function!
+@return header byte offset on page */
+static
+ulint
+trx_undo_header_create(
+/*===================*/
+ page_t* undo_page, /*!< in/out: undo log segment
+ header page, x-latched; it is
+ assumed that there is
+ TRX_UNDO_LOG_XA_HDR_SIZE bytes
+ free space on it */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint prev_log;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+
+ if (prev_log != 0) {
+ prev_log_hdr = undo_page + prev_log;
+
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free);
+ }
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free);
+
+ log_hdr = undo_page + free;
+
+ mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE);
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0);
+ mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log);
+
+ /* Write the log record about the header creation */
+ trx_undo_header_create_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
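+
+/* A worked sketch of the header chaining maintained above (illustrative
+only; the field offsets are defined in trx0undo.h): when a previous log
+header already exists on the page at offset P, creating a new header at
+offset `free` leaves the page in a state where
+
+	mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG)          == free
+	mach_read_from_2(undo_page + free + TRX_UNDO_PREV_LOG) == P
+	mach_read_from_2(undo_page + P + TRX_UNDO_NEXT_LOG)    == free
+
+so the undo log headers on a page form a doubly linked chain whose
+newest member is always named by TRX_UNDO_LAST_LOG. */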
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Write X/Open XA Transaction Identification (XID) to undo log header */
+static
+void
+trx_undo_write_xid(
+/*===============*/
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ const XID* xid, /*!< in: X/Open XA Transaction Identification */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
+ (ulint)xid->formatID, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
+ (ulint)xid->gtrid_length, MLOG_4BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+ (ulint)xid->bqual_length, MLOG_4BYTES, mtr);
+
+ mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data,
+ XIDDATASIZE, mtr);
+}
+
+/********************************************************************//**
+Read X/Open XA Transaction Identification (XID) from undo log header */
+static
+void
+trx_undo_read_xid(
+/*==============*/
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ XID* xid) /*!< out: X/Open XA Transaction Identification */
+{
+ xid->formatID = (long)mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+
+ xid->gtrid_length
+ = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+ xid->bqual_length
+ = (long) mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+
+ memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/***************************************************************//**
+Adds space for the XA XID after an undo log old-style header. */
+static
+void
+trx_undo_header_add_space_for_xid(
+/*==============================*/
+ page_t* undo_page,/*!< in: undo log segment header page */
+ trx_ulogf_t* log_hdr,/*!< in: undo log header */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ ulint free;
+ ulint new_free;
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+ /* free is now the end offset of the old style undo log header */
+
+ ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE
+ - TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+ /* Add space for a XID after the header, update the free offset
+ fields on the undo log page and in the undo log header */
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free,
+ MLOG_2BYTES, mtr);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free,
+ MLOG_2BYTES, mtr);
+}
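+
+/* A worked note on the arithmetic above (symbolic; the header sizes are
+defined in trx0undo.h): the page start, the page free pointer and the
+log start offset all advance by exactly
+
+	TRX_UNDO_LOG_XA_HDR_SIZE - TRX_UNDO_LOG_OLD_HDR_SIZE
+
+bytes, i.e. by the room needed for the XID fields that
+trx_undo_write_xid() fills in later. */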
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header reuse. */
+UNIV_INLINE
+void
+trx_undo_insert_header_reuse_log(
+/*=============================*/
+ const page_t* undo_page, /*!< in: undo log header page */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr);
+
+ mlog_catenate_dulint_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_insert_header_reuse_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+ ulint type, /*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr,/*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ trx_id_t trx_id;
+
+ ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id);
+
+ if (ptr == NULL) {
+
+ return(NULL);
+ }
+
+ if (page) {
+ if (type == MLOG_UNDO_HDR_CREATE) {
+ trx_undo_header_create(page, trx_id, mtr);
+ } else {
+ ut_ad(type == MLOG_UNDO_HDR_REUSE);
+ trx_undo_insert_header_reuse(page, trx_id, mtr);
+ }
+ }
+
+ return(ptr);
+}
+
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+ page_t* undo_page, /*!< in/out: insert undo log segment
+ header page, x-latched */
+ trx_id_t trx_id, /*!< in: transaction id */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_upagef_t* page_hdr;
+ trx_usegf_t* seg_hdr;
+ trx_ulogf_t* log_hdr;
+ ulint free;
+ ulint new_free;
+
+ ut_ad(mtr && undo_page);
+
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+ free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE;
+
+ ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+ log_hdr = undo_page + free;
+
+ new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+ /* Insert undo data is not needed after commit: we may free all
+ the space on the page */
+
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_INSERT);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+ log_hdr = undo_page + free;
+
+ mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+ mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+
+ mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+ mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+ /* Write the log record MLOG_UNDO_HDR_REUSE */
+ trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr);
+
+ return(free);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Writes the redo log entry of an update undo log header discard. */
+UNIV_INLINE
+void
+trx_undo_discard_latest_log(
+/*========================*/
+ page_t* undo_page, /*!< in: undo log header page */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_discard_latest_log(undo_page, mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+ byte* ptr, /*!< in: buffer */
+ byte* end_ptr __attribute__((unused)), /*!< in: buffer end */
+ page_t* page, /*!< in: page or NULL */
+ mtr_t* mtr) /*!< in: mtr or NULL */
+{
+ ut_ad(end_ptr);
+
+ if (page) {
+ trx_undo_discard_latest_update_undo(page, mtr);
+ }
+
+ return(ptr);
+}
+
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+ page_t* undo_page, /*!< in: header page of an undo log of size 1 */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ trx_ulogf_t* log_hdr;
+ trx_ulogf_t* prev_log_hdr;
+ ulint free;
+ ulint prev_hdr_offset;
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+ log_hdr = undo_page + free;
+
+ prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG);
+
+ if (prev_hdr_offset != 0) {
+ prev_log_hdr = undo_page + prev_hdr_offset;
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+ mach_read_from_2(prev_log_hdr
+ + TRX_UNDO_LOG_START));
+ mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0);
+ }
+
+ mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free);
+
+ mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED);
+ mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset);
+
+ trx_undo_discard_latest_log(undo_page, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return page number if success, else FIL_NULL */
+UNIV_INTERN
+ulint
+trx_undo_add_page(
+/*==============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory object */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ page_t* new_page;
+ trx_rseg_t* rseg;
+ ulint page_no;
+ ulint n_reserved;
+ ibool success;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(!mutex_own(&kernel_mutex));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+ rseg = trx->rseg;
+
+ if (rseg->curr_size == rseg->max_size) {
+
+ return(FIL_NULL);
+ }
+
+ header_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ success = fsp_reserve_free_extents(&n_reserved, undo->space, 1,
+ FSP_UNDO, mtr);
+ if (!success) {
+
+ return(FIL_NULL);
+ }
+
+ page_no = fseg_alloc_free_page_general(header_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_FSEG_HEADER,
+ undo->top_page_no + 1, FSP_UP,
+ TRUE, mtr);
+
+ fil_space_release_free_extents(undo->space, n_reserved);
+
+ if (page_no == FIL_NULL) {
+
+ /* No space left */
+
+ return(FIL_NULL);
+ }
+
+ undo->last_page_no = page_no;
+
+ new_page = trx_undo_page_get(undo->space, undo->zip_size,
+ page_no, mtr);
+
+ trx_undo_page_init(new_page, undo->type, mtr);
+
+ flst_add_last(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ new_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+ undo->size++;
+ rseg->curr_size++;
+
+ return(page_no);
+}
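+
+/* A minimal caller sketch (hypothetical; the real call sites are in the
+undo record writing code and are not part of this file). The caller must
+hold trx->undo_mutex and trx->rseg->mutex, and the mtr must not yet
+latch any undo log page:
+
+	page_no = trx_undo_add_page(trx, undo, &mtr);
+
+	if (page_no == FIL_NULL) {
+		(the tablespace or the rollback segment is full; the
+		caller should report DB_OUT_OF_FILE_SPACE)
+	}
+*/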
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return last page number in remaining log */
+static
+ulint
+trx_undo_free_page(
+/*===============*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ibool in_history, /*!< in: TRUE if the undo log is in the history
+ list */
+ ulint space, /*!< in: space */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint page_no, /*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ page_t* header_page;
+ page_t* undo_page;
+ fil_addr_t last_addr;
+ trx_rsegf_t* rseg_header;
+ ulint hist_size;
+ ulint zip_size;
+
+ ut_a(hdr_page_no != page_no);
+ ut_ad(!mutex_own(&kernel_mutex));
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ zip_size = rseg->zip_size;
+
+ undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+
+ header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+ flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+ undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+ fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+ space, page_no, mtr);
+
+ last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR
+ + TRX_UNDO_PAGE_LIST, mtr);
+ rseg->curr_size--;
+
+ if (in_history) {
+ rseg_header = trx_rsegf_get(space, zip_size,
+ rseg->page_no, mtr);
+
+ hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ MLOG_4BYTES, mtr);
+ ut_ad(hist_size > 0);
+ mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+ hist_size - 1, MLOG_4BYTES, mtr);
+ }
+
+ return(last_addr.page);
+}
+
+/********************************************************************//**
+Frees an undo log page when there is also the memory object for the undo
+log. */
+static
+void
+trx_undo_free_page_in_rollback(
+/*===========================*/
+ trx_t* trx __attribute__((unused)), /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ ulint page_no,/*!< in: page number to free: must not be the
+ header page */
+ mtr_t* mtr) /*!< in: mtr which does not have a latch to any
+ undo log page; the caller must have reserved
+ the rollback segment mutex */
+{
+ ulint last_page_no;
+
+ ut_ad(undo->hdr_page_no != page_no);
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+
+ last_page_no = trx_undo_free_page(undo->rseg, FALSE, undo->space,
+ undo->hdr_page_no, page_no, mtr);
+
+ undo->last_page_no = last_page_no;
+ undo->size--;
+}
+
+/********************************************************************//**
+Empties an undo log header page of undo records for that undo log. Other
+undo logs may still have records on that page, if it is an update undo log. */
+static
+void
+trx_undo_empty_header_page(
+/*=======================*/
+ ulint space, /*!< in: space */
+ ulint zip_size, /*!< in: compressed page size in bytes
+ or 0 for uncompressed pages */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* header_page;
+ trx_ulogf_t* log_hdr;
+ ulint end;
+
+ header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+ log_hdr = header_page + hdr_offset;
+
+ end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset);
+
+ mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end(
+/*==================*/
+ trx_t* trx, /*!< in: transaction whose undo log it is */
+ trx_undo_t* undo, /*!< in: undo log */
+ undo_no_t limit) /*!< in: all undo records with undo number
+ >= this value should be truncated */
+{
+ page_t* undo_page;
+ ulint last_page_no;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* trunc_here;
+ trx_rseg_t* rseg;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+ ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+ rseg = trx->rseg;
+
+ for (;;) {
+ mtr_start(&mtr);
+
+ trunc_here = NULL;
+
+ last_page_no = undo->last_page_no;
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ last_page_no, &mtr);
+
+ rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
+ undo->hdr_offset);
+ for (;;) {
+ if (rec == NULL) {
+ if (last_page_no == undo->hdr_page_no) {
+
+ goto function_exit;
+ }
+
+ trx_undo_free_page_in_rollback(
+ trx, undo, last_page_no, &mtr);
+ break;
+ }
+
+ if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit)
+ >= 0) {
+ /* Truncate at least this record off, maybe
+ more */
+ trunc_here = rec;
+ } else {
+ goto function_exit;
+ }
+
+ rec = trx_undo_page_get_prev_rec(rec,
+ undo->hdr_page_no,
+ undo->hdr_offset);
+ }
+
+ mtr_commit(&mtr);
+ }
+
+function_exit:
+ if (trunc_here) {
+ mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_FREE,
+ trunc_here - undo_page, MLOG_2BYTES, &mtr);
+ }
+
+ mtr_commit(&mtr);
+}
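+
+/* A worked example with hypothetical undo numbers: if the undo log
+holds records numbered 0..9 and a rollback to a savepoint calls this
+function with limit == 5, the records are walked backwards from the
+last page; records 9..5 are truncated, pages left without any remaining
+records are freed with trx_undo_free_page_in_rollback(), and on the
+final page TRX_UNDO_PAGE_FREE is moved back to the offset of the first
+record with undo number >= 5. */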
+
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment */
+ ulint space, /*!< in: space id of the log */
+ ulint hdr_page_no, /*!< in: header page number */
+ ulint hdr_offset, /*!< in: header offset on the page */
+ undo_no_t limit) /*!< in: all undo pages with
+ undo numbers < this value
+ should be truncated; NOTE that
+ the function only frees whole
+ pages; the header page is not
+ freed, but emptied, if all the
+ records there are < limit */
+{
+ page_t* undo_page;
+ trx_undo_rec_t* rec;
+ trx_undo_rec_t* last_rec;
+ ulint page_no;
+ mtr_t mtr;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (ut_dulint_is_zero(limit)) {
+
+ return;
+ }
+loop:
+ mtr_start(&mtr);
+
+ rec = trx_undo_get_first_rec(space, rseg->zip_size,
+ hdr_page_no, hdr_offset,
+ RW_X_LATCH, &mtr);
+ if (rec == NULL) {
+ /* Already empty */
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ undo_page = page_align(rec);
+
+ last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+ hdr_offset);
+ if (ut_dulint_cmp(trx_undo_rec_get_undo_no(last_rec), limit) >= 0) {
+
+ mtr_commit(&mtr);
+
+ return;
+ }
+
+ page_no = page_get_page_no(undo_page);
+
+ if (page_no == hdr_page_no) {
+ trx_undo_empty_header_page(space, rseg->zip_size,
+ hdr_page_no, hdr_offset,
+ &mtr);
+ } else {
+ trx_undo_free_page(rseg, TRUE, space, hdr_page_no,
+ page_no, &mtr);
+ }
+
+ mtr_commit(&mtr);
+
+ goto loop;
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is not in the history list. */
+static
+void
+trx_undo_seg_free(
+/*==============*/
+ trx_undo_t* undo) /*!< in: undo log */
+{
+ trx_rseg_t* rseg;
+ fseg_header_t* file_seg;
+ trx_rsegf_t* rseg_header;
+ trx_usegf_t* seg_header;
+ ibool finished;
+ mtr_t mtr;
+
+ rseg = undo->rseg;
+
+ do {
+
+ mtr_start(&mtr);
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ mutex_enter(&(rseg->mutex));
+
+ seg_header = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no,
+ &mtr) + TRX_UNDO_SEG_HDR;
+
+ file_seg = seg_header + TRX_UNDO_FSEG_HEADER;
+
+ finished = fseg_free_step(file_seg, &mtr);
+
+ if (finished) {
+ /* Update the rseg header */
+ rseg_header = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no,
+ &mtr);
+ trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
+ &mtr);
+ }
+
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+ } while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/********************************************************************//**
+Creates and initializes an undo log memory object according to the values
+in the header in file, when the database is started. The memory object is
+inserted in the appropriate list of rseg.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create_at_db_start(
+/*============================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint page_no,/*!< in: undo log segment page number */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* undo_page;
+ trx_upagef_t* page_header;
+ trx_usegf_t* seg_header;
+ trx_ulogf_t* undo_header;
+ trx_undo_t* undo;
+ ulint type;
+ ulint state;
+ trx_id_t trx_id;
+ ulint offset;
+ fil_addr_t last_addr;
+ page_t* last_page;
+ trx_undo_rec_t* rec;
+ XID xid;
+ ibool xid_exists = FALSE;
+
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ page_no, mtr);
+
+ page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+ type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES,
+ mtr);
+ seg_header = undo_page + TRX_UNDO_SEG_HDR;
+
+ state = mach_read_from_2(seg_header + TRX_UNDO_STATE);
+
+ offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG);
+
+ undo_header = undo_page + offset;
+
+ trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr);
+
+ xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ MLOG_1BYTE, mtr);
+
+	/* Read X/Open XA transaction identification if it exists, or
+	mark it as absent by setting formatID to -1. */
+
+ memset(&xid, 0, sizeof(xid));
+ xid.formatID = -1;
+
+ if (xid_exists == TRUE) {
+ trx_undo_read_xid(undo_header, &xid);
+ }
+
+ mutex_enter(&(rseg->mutex));
+
+ undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid,
+ page_no, offset);
+ mutex_exit(&(rseg->mutex));
+
+ undo->dict_operation = mtr_read_ulint(
+ undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr);
+
+ undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr);
+ undo->state = state;
+ undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ /* If the log segment is being freed, the page list is inconsistent! */
+ if (state == TRX_UNDO_TO_FREE) {
+
+ goto add_to_list;
+ }
+
+ last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+ undo->last_page_no = last_addr.page;
+ undo->top_page_no = last_addr.page;
+
+ last_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+ undo->last_page_no, mtr);
+
+ rec = trx_undo_page_get_last_rec(last_page, page_no, offset);
+
+ if (rec == NULL) {
+ undo->empty = TRUE;
+ } else {
+ undo->empty = FALSE;
+ undo->top_offset = rec - last_page;
+ undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+ }
+add_to_list:
+ if (type == TRX_UNDO_INSERT) {
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
+ undo);
+ }
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+ if (state != TRX_UNDO_CACHED) {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list,
+ undo);
+ } else {
+ UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
+ undo);
+ }
+ }
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy. This
+function is only called when the database is started or a new rollback
+segment is created.
+@return the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+ trx_rseg_t* rseg) /*!< in: rollback segment memory object */
+{
+ ulint page_no;
+ trx_undo_t* undo;
+ ulint size = 0;
+ trx_rsegf_t* rseg_header;
+ ulint i;
+ mtr_t mtr;
+
+ UT_LIST_INIT(rseg->update_undo_list);
+ UT_LIST_INIT(rseg->update_undo_cached);
+ UT_LIST_INIT(rseg->insert_undo_list);
+ UT_LIST_INIT(rseg->insert_undo_cached);
+
+ mtr_start(&mtr);
+
+ rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size,
+ rseg->page_no, &mtr);
+
+ if (!srv_extra_undoslots) {
+		/* read the slot directly to avoid the assertion
+		failure in trx_rsegf_get_nth_undo() */
+ //page_no = trx_rsegf_get_nth_undo(rseg_header, TRX_RSEG_N_EXTRA_SLOTS - 1, &mtr);
+ page_no = mtr_read_ulint(rseg_header + TRX_RSEG_UNDO_SLOTS
+ + (TRX_RSEG_N_EXTRA_SLOTS - 1) * TRX_RSEG_SLOT_SIZE,
+ MLOG_4BYTES, &mtr);
+ if (page_no != 0) {
+ /* check extended slots are not used */
+ for (i = TRX_RSEG_N_SLOTS; i < TRX_RSEG_N_EXTRA_SLOTS; i++) {
+				/* read the slot directly to avoid the
+				assertion failure in
+				trx_rsegf_get_nth_undo() */
+ page_no = mtr_read_ulint(rseg_header + TRX_RSEG_UNDO_SLOTS
+ + i * TRX_RSEG_SLOT_SIZE,
+ MLOG_4BYTES, &mtr);
+ if (page_no != FIL_NULL) {
+ srv_extra_undoslots = TRUE;
+ fprintf(stderr,
+"InnoDB: Error: innodb_extra_undoslots option is disabled, but it was enabled before.\n"
+"InnoDB: The datafile is not normal for mysqld and disabled innodb_extra_undoslots.\n"
+"InnoDB: Enable innodb_extra_undoslots if it was enabled before, and\n"
+"InnoDB: ### don't use this datafile with other mysqld or ibbackup! ###\n"
+"InnoDB: Cannot continue operation for the safety. Calling exit(1).\n");
+ exit(1);
+ }
+ }
+ fprintf(stderr,
+"InnoDB: Warning: innodb_extra_undoslots option is disabled, but it was enabled before.\n"
+"InnoDB: But extended undo slots seem not used, so continue operation.\n");
+ }
+ }
+
+ for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+ page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr);
+
+		/* If the slot was not initialized when the datafile was
+		created, page_no reads as 0 for the slots added after
+		that; normalize it to FIL_NULL */
+
+ if (page_no == 0) {
+ page_no = FIL_NULL;
+ trx_rsegf_set_nth_undo(rseg_header, i, page_no, &mtr);
+ }
+
+ /* In forced recovery: try to avoid operations which look
+ at database pages; undo logs are rapidly changing data, and
+ the probability that they are in an inconsistent state is
+ high */
+
+ if (page_no != FIL_NULL
+ && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+ undo = trx_undo_mem_create_at_db_start(rseg, i,
+ page_no, &mtr);
+ size += undo->size;
+
+ mtr_commit(&mtr);
+
+ mtr_start(&mtr);
+
+ rseg_header = trx_rsegf_get(
+ rseg->space, rseg->zip_size, rseg->page_no,
+ &mtr);
+ }
+ }
+
+ mtr_commit(&mtr);
+
+ return(size);
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint id, /*!< in: slot index within rseg */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification */
+ ulint page_no,/*!< in: undo log header page number */
+ ulint offset) /*!< in: undo log header byte offset on page */
+{
+ trx_undo_t* undo;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) id);
+ ut_error;
+ }
+
+ undo = mem_alloc(sizeof(trx_undo_t));
+
+ if (undo == NULL) {
+
+		return(NULL);
+ }
+
+ undo->id = id;
+ undo->type = type;
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->del_marks = FALSE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->rseg = rseg;
+
+ undo->space = rseg->space;
+ undo->zip_size = rseg->zip_size;
+ undo->hdr_page_no = page_no;
+ undo->hdr_offset = offset;
+ undo->last_page_no = page_no;
+ undo->size = 1;
+
+ undo->empty = TRUE;
+ undo->top_page_no = page_no;
+ undo->guess_block = NULL;
+
+ return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use. */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+ trx_undo_t* undo, /*!< in: undo log to init */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open XA transaction identification*/
+ ulint offset) /*!< in: undo log header byte offset on page */
+{
+ ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+ if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo->state = TRX_UNDO_ACTIVE;
+ undo->del_marks = FALSE;
+ undo->trx_id = trx_id;
+ undo->xid = *xid;
+
+ undo->dict_operation = FALSE;
+
+ undo->hdr_offset = offset;
+ undo->empty = TRUE;
+}
+
+/********************************************************************//**
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+ trx_undo_t* undo) /*!< in: the undo object to be freed */
+{
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr,
+ "InnoDB: Error: undo->id is %lu\n", (ulong) undo->id);
+ ut_error;
+ }
+
+ mem_free(undo);
+}
+
+/**********************************************************************//**
+Creates a new undo log.
+@return DB_SUCCESS if successful in creating the new undo log object;
+possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS,
+DB_OUT_OF_FILE_SPACE or DB_OUT_OF_MEMORY */
+static
+ulint
+trx_undo_create(
+/*============*/
+ trx_t* trx, /*!< in: transaction */
+ trx_rseg_t* rseg, /*!< in: rollback segment memory copy */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is created */
+ const XID* xid, /*!< in: X/Open transaction identification*/
+	trx_undo_t**	undo,	/*!< out: the new undo log object, undefined
+				if the operation did not succeed */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rsegf_t* rseg_header;
+ ulint page_no;
+ ulint offset;
+ ulint id;
+ page_t* undo_page;
+ ulint err;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (rseg->curr_size == rseg->max_size) {
+
+ return(DB_OUT_OF_FILE_SPACE);
+ }
+
+ rseg->curr_size++;
+
+ rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no,
+ mtr);
+
+ err = trx_undo_seg_create(rseg, rseg_header, type, &id,
+ &undo_page, mtr);
+
+ if (err != DB_SUCCESS) {
+ /* Did not succeed */
+
+ rseg->curr_size--;
+
+ return(err);
+ }
+
+ page_no = page_get_page_no(undo_page);
+
+ offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(undo_page,
+ undo_page + offset, mtr);
+ }
+
+ *undo = trx_undo_mem_create(rseg, id, type, trx_id, xid,
+ page_no, offset);
+ if (*undo == NULL) {
+
+ err = DB_OUT_OF_MEMORY;
+ }
+
+ return(err);
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/********************************************************************//**
+Reuses a cached undo log.
+@return the undo log memory object, NULL if none cached */
+static
+trx_undo_t*
+trx_undo_reuse_cached(
+/*==================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ ulint type, /*!< in: type of the log: TRX_UNDO_INSERT or
+ TRX_UNDO_UPDATE */
+ trx_id_t trx_id, /*!< in: id of the trx for which the undo log
+ is used */
+ const XID* xid, /*!< in: X/Open XA transaction identification */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_undo_t* undo;
+ page_t* undo_page;
+ ulint offset;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ if (type == TRX_UNDO_INSERT) {
+
+ undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+ } else {
+ ut_ad(type == TRX_UNDO_UPDATE);
+
+ undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+ if (undo == NULL) {
+
+ return(NULL);
+ }
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+ }
+
+ ut_ad(undo->size == 1);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ if (type == TRX_UNDO_INSERT) {
+ offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(
+ undo_page, undo_page + offset, mtr);
+ }
+ } else {
+ ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+ + TRX_UNDO_PAGE_TYPE)
+ == TRX_UNDO_UPDATE);
+
+ offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+ if (trx->support_xa) {
+ trx_undo_header_add_space_for_xid(
+ undo_page, undo_page + offset, mtr);
+ }
+ }
+
+ trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset);
+
+ return(undo);
+}
+
+/**********************************************************************//**
+Marks an undo log header as a header of a data dictionary operation
+transaction. */
+static
+void
+trx_undo_mark_as_dict_operation(
+/*============================*/
+ trx_t* trx, /*!< in: dict op transaction */
+ trx_undo_t* undo, /*!< in: assigned undo log */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ page_t* hdr_page;
+
+ hdr_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ switch (trx_get_dict_operation(trx)) {
+ case TRX_DICT_OP_NONE:
+ ut_error;
+ case TRX_DICT_OP_INDEX:
+ /* Do not discard the table on recovery. */
+ undo->table_id = ut_dulint_zero;
+ break;
+ case TRX_DICT_OP_TABLE:
+ undo->table_id = trx->table_id;
+ break;
+ }
+
+ mlog_write_ulint(hdr_page + undo->hdr_offset
+ + TRX_UNDO_DICT_TRANS,
+ TRUE, MLOG_1BYTE, mtr);
+
+ mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID,
+ undo->table_id, mtr);
+
+ undo->dict_operation = TRUE;
+}
+
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if the undo log was assigned successfully; possible
+error codes are: DB_TOO_MANY_CONCURRENT_TRXS, DB_OUT_OF_FILE_SPACE or
+DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ulint
+trx_undo_assign_undo(
+/*=================*/
+ trx_t* trx, /*!< in: transaction */
+ ulint type) /*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+ mtr_t mtr;
+ ulint err = DB_SUCCESS;
+
+ ut_ad(trx);
+ ut_ad(trx->rseg);
+
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&(trx->undo_mutex)));
+
+ mtr_start(&mtr);
+
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ mutex_enter(&(rseg->mutex));
+
+ undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
+ &mtr);
+ if (undo == NULL) {
+ err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid,
+ &undo, &mtr);
+ if (err != DB_SUCCESS) {
+
+ goto func_exit;
+ }
+ }
+
+ if (type == TRX_UNDO_INSERT) {
+ UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo);
+ ut_ad(trx->insert_undo == NULL);
+ trx->insert_undo = undo;
+ } else {
+ UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo);
+ ut_ad(trx->update_undo == NULL);
+ trx->update_undo = undo;
+ }
+
+ if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+ trx_undo_mark_as_dict_operation(trx, undo, &mtr);
+ }
+
+func_exit:
+ mutex_exit(&(rseg->mutex));
+ mtr_commit(&mtr);
+
+	return(err);
+}
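+
+/* A minimal caller sketch (hypothetical; the real call sites are in the
+undo record writing code). The caller is expected to hold
+trx->undo_mutex and to have trx->rseg assigned:
+
+	mutex_enter(&(trx->undo_mutex));
+
+	if (trx->insert_undo == NULL) {
+		ulint	err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
+
+		if (err != DB_SUCCESS) {
+			(report DB_TOO_MANY_CONCURRENT_TRXS,
+			DB_OUT_OF_FILE_SPACE or DB_OUT_OF_MEMORY upwards)
+		}
+	}
+
+	mutex_exit(&(trx->undo_mutex));
+*/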
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+ trx_rseg_t* rseg, /*!< in: rollback segment memory object */
+ trx_t* trx __attribute__((unused)), /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ page_t* undo_page;
+ ulint state;
+
+ ut_ad(trx);
+ ut_ad(undo);
+ ut_ad(mtr);
+ ut_ad(mutex_own(&rseg->mutex));
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ if (undo->size == 1
+ && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE)
+ < TRX_UNDO_PAGE_REUSE_LIMIT) {
+
+ /* This is a heuristic to avoid the problem of all UNDO
+ slots ending up in one of the UNDO lists. Previously if
+ the server crashed with all the slots in one of the lists,
+ transactions that required the slots of a different type
+ would fail for lack of slots. */
+
+ if (UT_LIST_GET_LEN(rseg->update_undo_list) < 500
+ && UT_LIST_GET_LEN(rseg->insert_undo_list) < 500) {
+
+ state = TRX_UNDO_CACHED;
+ } else {
+ state = TRX_UNDO_TO_FREE;
+ }
+
+ } else if (undo->type == TRX_UNDO_INSERT) {
+
+ state = TRX_UNDO_TO_FREE;
+ } else {
+ state = TRX_UNDO_TO_PURGE;
+ }
+
+ undo->state = state;
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr);
+
+ return(undo_page);
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+ trx_t* trx, /*!< in: transaction */
+ trx_undo_t* undo, /*!< in: undo log memory copy */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_usegf_t* seg_hdr;
+ trx_upagef_t* page_hdr;
+ trx_ulogf_t* undo_header;
+ page_t* undo_page;
+ ulint offset;
+
+ ut_ad(trx && undo && mtr);
+
+ if (undo->id >= TRX_RSEG_N_SLOTS) {
+ fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+ (ulong) undo->id);
+ mem_analyze_corruption(undo);
+ ut_error;
+ }
+
+ undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+ undo->hdr_page_no, mtr);
+
+ seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+ page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+ /*------------------------------*/
+ undo->state = TRX_UNDO_PREPARED;
+ undo->xid = trx->xid;
+ /*------------------------------*/
+
+ mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
+ MLOG_2BYTES, mtr);
+
+ offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+ undo_header = undo_page + offset;
+
+ mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+ TRUE, MLOG_1BYTE, mtr);
+
+ trx_undo_write_xid(undo_header, &undo->xid, mtr);
+
+ return(undo_page);
+}
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+ trx_t* trx, /*!< in: trx owning the update undo log */
+ page_t* undo_page, /*!< in: update undo log header page,
+ x-latched */
+ mtr_t* mtr) /*!< in: mtr */
+{
+ trx_rseg_t* rseg;
+ trx_undo_t* undo;
+
+ undo = trx->update_undo;
+ rseg = trx->rseg;
+
+ ut_ad(mutex_own(&(rseg->mutex)));
+
+ trx_purge_add_update_undo_to_history(trx, undo_page, mtr);
+
+ UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo);
+
+ trx->update_undo = NULL;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+
+ UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_PURGE
+ || undo->state == TRX_UNDO_TO_FREE);
+
+ trx_undo_mem_free(undo);
+ }
+}
+
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+ trx_t* trx) /*!< in: transaction handle */
+{
+ trx_undo_t* undo;
+ trx_rseg_t* rseg;
+
+ undo = trx->insert_undo;
+ ut_ad(undo);
+
+ rseg = trx->rseg;
+
+ mutex_enter(&(rseg->mutex));
+
+ UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo);
+ trx->insert_undo = NULL;
+
+ if (undo->state == TRX_UNDO_CACHED) {
+
+ UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo);
+ } else {
+ ut_ad(undo->state == TRX_UNDO_TO_FREE);
+
+ /* Delete first the undo log segment in the file */
+
+ mutex_exit(&(rseg->mutex));
+
+ trx_undo_seg_free(undo);
+
+ mutex_enter(&(rseg->mutex));
+
+ ut_ad(rseg->curr_size > undo->size);
+
+ rseg->curr_size -= undo->size;
+
+ trx_undo_mem_free(undo);
+ }
+
+ mutex_exit(&(rseg->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/usr/usr0sess.c b/storage/xtradb/usr/usr0sess.c
new file mode 100644
index 00000000000..8087dcb4170
--- /dev/null
+++ b/storage/xtradb/usr/usr0sess.c
@@ -0,0 +1,71 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file usr/usr0sess.c
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+#ifdef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#include "trx0trx.h"
+
+/*********************************************************************//**
+Opens a session.
+@return own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void)
+/*===========*/
+{
+ sess_t* sess;
+
+ ut_ad(mutex_own(&kernel_mutex));
+
+ sess = mem_alloc(sizeof(sess_t));
+
+ sess->state = SESS_ACTIVE;
+
+ sess->trx = trx_create(sess);
+
+ UT_LIST_INIT(sess->graphs);
+
+ return(sess);
+}
+
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+ sess_t* sess) /*!< in, own: session object */
+{
+ ut_ad(!mutex_own(&kernel_mutex));
+
+ ut_a(UT_LIST_GET_LEN(sess->graphs) == 0);
+
+ trx_free_for_background(sess->trx);
+ mem_free(sess);
+}
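+
+/* A minimal usage sketch (hypothetical caller). As the assertions above
+require, kernel_mutex must be held around sess_open() and must not be
+held when sess_close() is called:
+
+	mutex_enter(&kernel_mutex);
+	sess = sess_open();
+	mutex_exit(&kernel_mutex);
+
+	(... use sess->trx; all query graphs must be freed first ...)
+
+	sess_close(sess);
+*/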
diff --git a/storage/xtradb/ut/ut0byte.c b/storage/xtradb/ut/ut0byte.c
new file mode 100644
index 00000000000..4e093f72ce2
--- /dev/null
+++ b/storage/xtradb/ut/ut0byte.c
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0byte.c
+Byte utilities
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0byte.h"
+
+#ifdef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+/** Zero value for a dulint */
+UNIV_INTERN const dulint ut_dulint_zero = {0, 0};
+
+/** Maximum value for a dulint */
+UNIV_INTERN const dulint ut_dulint_max = {0xFFFFFFFFUL, 0xFFFFFFFFUL};
+
+#ifdef notdefined /* unused code */
+#include "ut0sort.h"
+
+/************************************************************//**
+Sort function for dulint arrays. */
+UNIV_INTERN
+void
+ut_dulint_sort(
+/*===========*/
+ dulint* arr, /*!< in/out: array to be sorted */
+ dulint* aux_arr,/*!< in/out: auxiliary array (same size as arr) */
+ ulint low, /*!< in: low bound of sort interval, inclusive */
+ ulint high) /*!< in: high bound of sort interval, noninclusive */
+{
+ UT_SORT_FUNCTION_BODY(ut_dulint_sort, arr, aux_arr, low, high,
+ ut_dulint_cmp);
+}
+#endif /* notdefined */
diff --git a/storage/xtradb/ut/ut0dbg.c b/storage/xtradb/ut/ut0dbg.c
new file mode 100644
index 00000000000..4484e6c36de
--- /dev/null
+++ b/storage/xtradb/ut/ut0dbg.c
@@ -0,0 +1,187 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file ut/ut0dbg.c
+Debug utilities for Innobase.
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#include "univ.i"
+#include "ut0dbg.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+#else
+/** This is used to eliminate compiler warnings */
+UNIV_INTERN ulint ut_dbg_zero = 0;
+#endif
+
+#if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/** If this is set to TRUE by ut_dbg_assertion_failed(), all threads
+will stop at the next ut_a() or ut_ad(). */
+UNIV_INTERN ibool ut_dbg_stop_threads = FALSE;
+#endif
+#ifdef __NETWARE__
+/** Flag for ignoring further assertion failures. This is set to TRUE
+when an InnoDB assertion failure or other fatal error condition
+happens on NetWare and requires an immediate shutdown. */
+UNIV_INTERN ibool panic_shutdown = FALSE;
+#elif !defined(UT_DBG_USE_ABORT)
+/** A null pointer that will be dereferenced to trigger a memory trap */
+UNIV_INTERN ulint* ut_dbg_null_ptr = NULL;
+#endif
+
+/*************************************************************//**
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+ const char* expr, /*!< in: the failed assertion (optional) */
+ const char* file, /*!< in: source file containing the assertion */
+ ulint line) /*!< in: line number of the assertion */
+{
+ ut_print_timestamp(stderr);
+#ifdef UNIV_HOTBACKUP
+ fprintf(stderr, " InnoDB: Assertion failure in file %s line %lu\n",
+ file, line);
+#else /* UNIV_HOTBACKUP */
+ fprintf(stderr,
+ " InnoDB: Assertion failure in thread %lu"
+ " in file %s line %lu\n",
+ os_thread_pf(os_thread_get_curr_id()), file, line);
+#endif /* UNIV_HOTBACKUP */
+ if (expr) {
+ fprintf(stderr,
+ "InnoDB: Failing assertion: %s\n", expr);
+ }
+
+ fputs("InnoDB: We intentionally generate a memory trap.\n"
+ "InnoDB: Submit a detailed bug report"
+ " to http://bugs.mysql.com.\n"
+ "InnoDB: If you get repeated assertion failures"
+ " or crashes, even\n"
+ "InnoDB: immediately after the mysqld startup, there may be\n"
+ "InnoDB: corruption in the InnoDB tablespace. Please refer to\n"
+ "InnoDB: " REFMAN "forcing-recovery.html\n"
+ "InnoDB: about forcing recovery.\n", stderr);
+#if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+ ut_dbg_stop_threads = TRUE;
+#endif
+}
+
+#ifdef __NETWARE__
+/*************************************************************//**
+Shut down MySQL/InnoDB after assertion failure. */
+UNIV_INTERN
+void
+ut_dbg_panic(void)
+/*==============*/
+{
+ if (!panic_shutdown) {
+ panic_shutdown = TRUE;
+ innobase_shutdown_for_mysql();
+ }
+ exit(1);
+}
+#else /* __NETWARE__ */
+# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/*************************************************************//**
+Stop a thread after assertion failure. */
+UNIV_INTERN
+void
+ut_dbg_stop_thread(
+/*===============*/
+ const char* file,
+ ulint line)
+{
+#ifndef UNIV_HOTBACKUP
+ fprintf(stderr, "InnoDB: Thread %lu stopped in file %s line %lu\n",
+ os_thread_pf(os_thread_get_curr_id()), file, line);
+ os_thread_sleep(1000000000);
+#endif /* !UNIV_HOTBACKUP */
+}
+# endif
+#endif /* __NETWARE__ */
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <unistd.h>
+
+#ifndef timersub
+#define timersub(a, b, r) \
+ do { \
+ (r)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+ (r)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+ if ((r)->tv_usec < 0) { \
+ (r)->tv_sec--; \
+ (r)->tv_usec += 1000000; \
+ } \
+ } while (0)
+#endif /* timersub */
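+
+/* A worked example of the borrow handling in timersub() above:
+subtracting b = {3 s, 700000 us} from a = {5 s, 200000 us} first gives
+{2 s, -500000 us}; the negative microsecond field triggers the borrow,
+so the stored result is r = {1 s, 500000 us}. */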
+
+/*******************************************************************//**
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+ speedo_t* speedo) /*!< out: speedo */
+{
+ gettimeofday(&speedo->tv, NULL);
+
+ getrusage(RUSAGE_SELF, &speedo->ru);
+}
+
+/*******************************************************************//**
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+ const speedo_t* speedo) /*!< in: speedo */
+{
+ struct rusage ru_now;
+ struct timeval tv_now;
+ struct timeval tv_diff;
+
+ getrusage(RUSAGE_SELF, &ru_now);
+
+ gettimeofday(&tv_now, NULL);
+
+#define PRINT_TIMEVAL(prefix, tvp) \
+ fprintf(stderr, "%s% 5ld.%06ld sec\n", \
+ prefix, (tvp)->tv_sec, (tvp)->tv_usec)
+
+ timersub(&tv_now, &speedo->tv, &tv_diff);
+ PRINT_TIMEVAL("real", &tv_diff);
+
+ timersub(&ru_now.ru_utime, &speedo->ru.ru_utime, &tv_diff);
+ PRINT_TIMEVAL("user", &tv_diff);
+
+ timersub(&ru_now.ru_stime, &speedo->ru.ru_stime, &tv_diff);
+ PRINT_TIMEVAL("sys ", &tv_diff);
+}
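+
+/* A minimal usage sketch (only meaningful when UNIV_COMPILE_TEST_FUNCS
+is defined, since the speedo functions are compiled only then):
+
+	speedo_t	speedo;
+
+	speedo_reset(&speedo);
+	(... code section to be measured ...)
+	speedo_show(&speedo);	prints real/user/sys time to stderr
+*/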
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/ut/ut0list.c b/storage/xtradb/ut/ut0list.c
new file mode 100644
index 00000000000..895a575c535
--- /dev/null
+++ b/storage/xtradb/ut/ut0list.c
@@ -0,0 +1,194 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0list.c
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0list.h"
+#ifdef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+/****************************************************************//**
+Create a new list.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void)
+/*=================*/
+{
+ ib_list_t* list = mem_alloc(sizeof(ib_list_t));
+
+ list->first = NULL;
+ list->last = NULL;
+ list->is_heap_list = FALSE;
+
+ return(list);
+}
+
+/****************************************************************//**
+Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for
+lists created with this function.
+@return list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ ib_list_t* list = mem_heap_alloc(heap, sizeof(ib_list_t));
+
+ list->first = NULL;
+ list->last = NULL;
+ list->is_heap_list = TRUE;
+
+ return(list);
+}
+
+/****************************************************************//**
+Free a list. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+ ib_list_t* list) /*!< in: list */
+{
+ ut_a(!list->is_heap_list);
+
+ /* We don't check that the list is empty because it's entirely valid
+ to e.g. have all the nodes allocated from a single heap that is then
+ freed after the list itself is freed. */
+
+ mem_free(list);
+}
+
+/****************************************************************//**
+Add the data to the start of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ return(ib_list_add_after(list, ib_list_get_first(list), data, heap));
+}
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+ ib_list_t* list, /*!< in: list */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ return(ib_list_add_after(list, ib_list_get_last(list), data, heap));
+}
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* prev_node, /*!< in: node preceding new node (can
+ be NULL) */
+ void* data, /*!< in: data */
+ mem_heap_t* heap) /*!< in: memory heap to use */
+{
+ ib_list_node_t* node = mem_heap_alloc(heap, sizeof(ib_list_node_t));
+
+ node->data = data;
+
+ if (!list->first) {
+ /* Empty list. */
+
+ ut_a(!prev_node);
+
+ node->prev = NULL;
+ node->next = NULL;
+
+ list->first = node;
+ list->last = node;
+ } else if (!prev_node) {
+ /* Start of list. */
+
+ node->prev = NULL;
+ node->next = list->first;
+
+ list->first->prev = node;
+
+ list->first = node;
+ } else {
+ /* Middle or end of list. */
+
+ node->prev = prev_node;
+ node->next = prev_node->next;
+
+ prev_node->next = node;
+
+ if (node->next) {
+ node->next->prev = node;
+ } else {
+ list->last = node;
+ }
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Remove the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+ ib_list_t* list, /*!< in: list */
+ ib_list_node_t* node) /*!< in: node to remove */
+{
+ if (node->prev) {
+ node->prev->next = node->next;
+ } else {
+ /* First item in list. */
+
+ ut_ad(list->first == node);
+
+ list->first = node->next;
+ }
+
+ if (node->next) {
+ node->next->prev = node->prev;
+ } else {
+ /* Last item in list. */
+
+ ut_ad(list->last == node);
+
+ list->last = node->prev;
+ }
+}
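+
+/* A minimal usage sketch (hypothetical caller) for the heap-based
+variant; as noted above, ib_list_free() must NOT be called for a list
+created with ib_list_create_heap():
+
+	mem_heap_t*	heap = mem_heap_create(256);
+	ib_list_t*	list = ib_list_create_heap(heap);
+	ib_list_node_t*	node;
+
+	ib_list_add_last(list, item_a, heap);
+	ib_list_add_last(list, item_b, heap);
+
+	for (node = ib_list_get_first(list); node; node = node->next) {
+		(... node->data is item_a, then item_b ...)
+	}
+
+	mem_heap_free(heap);	releases the list and all its nodes
+*/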
diff --git a/storage/xtradb/ut/ut0mem.c b/storage/xtradb/ut/ut0mem.c
new file mode 100644
index 00000000000..bf55e4273b6
--- /dev/null
+++ b/storage/xtradb/ut/ut0mem.c
@@ -0,0 +1,711 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0mem.c
+Memory primitives
+
+Created 5/11/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+
+#ifdef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+# include "os0thread.h"
+# include "srv0srv.h"
+
+#include <stdlib.h>
+
+/** This struct is placed first in every allocated memory block */
+typedef struct ut_mem_block_struct ut_mem_block_t;
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc(). Does not count malloc()
+if srv_use_sys_malloc is set. Protected by ut_list_mutex. */
+UNIV_INTERN ulint ut_total_allocated_memory = 0;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+UNIV_INTERN os_fast_mutex_t ut_list_mutex;
+
+/** Dynamically allocated memory block */
+struct ut_mem_block_struct{
+ UT_LIST_NODE_T(ut_mem_block_t) mem_block_list;
+ /*!< mem block list node */
+ ulint size; /*!< size of allocated memory */
+ ulint magic_n;/*!< magic number (UT_MEM_MAGIC_N) */
+};
+
+/** The value of ut_mem_block_struct::magic_n. Used in detecting
+memory corruption. */
+#define UT_MEM_MAGIC_N 1601650166
+
+/** List of all memory blocks allocated from the operating system
+with malloc. Protected by ut_list_mutex. */
+static UT_LIST_BASE_NODE_T(ut_mem_block_t) ut_mem_block_list;
+
+/** Flag: has ut_mem_block_list been initialized? */
+static ibool ut_mem_block_list_inited = FALSE;
+
+/** A dummy pointer for generating a null pointer exception in
+ut_malloc_low() */
+static ulint* ut_mem_null_ptr = NULL;
+
+/**********************************************************************//**
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void)
+/*=============*/
+{
+ ut_a(!ut_mem_block_list_inited);
+ os_fast_mutex_init(&ut_list_mutex);
+ UT_LIST_INIT(ut_mem_block_list);
+ ut_mem_block_list_inited = TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Allocates memory. Also sets it to zero if UNIV_SET_MEM_TO_ZERO is
+defined and set_to_zero is TRUE.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+ ulint n, /*!< in: number of bytes to allocate */
+ ibool set_to_zero, /*!< in: TRUE if allocated memory should be
+ set to zero if UNIV_SET_MEM_TO_ZERO is
+ defined */
+ ibool assert_on_error)/*!< in: if TRUE, we crash mysqld if the
+ memory cannot be allocated */
+{
+#ifndef UNIV_HOTBACKUP
+ ulint retry_count;
+ void* ret;
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ ret = malloc(n);
+ ut_a(ret || !assert_on_error);
+
+#ifdef UNIV_SET_MEM_TO_ZERO
+ if (set_to_zero) {
+ memset(ret, '\0', n);
+ UNIV_MEM_ALLOC(ret, n);
+ }
+#endif
+ return(ret);
+ }
+
+ ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */
+ ut_a(ut_mem_block_list_inited);
+
+ retry_count = 0;
+retry:
+ os_fast_mutex_lock(&ut_list_mutex);
+
+ ret = malloc(n + sizeof(ut_mem_block_t));
+
+ if (ret == NULL && retry_count < 60) {
+ if (retry_count == 0) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: Error: cannot allocate"
+ " %lu bytes of\n"
+ "InnoDB: memory with malloc!"
+ " Total allocated memory\n"
+ "InnoDB: by InnoDB %lu bytes."
+ " Operating system errno: %lu\n"
+ "InnoDB: Check if you should"
+ " increase the swap file or\n"
+ "InnoDB: ulimits of your operating system.\n"
+ "InnoDB: On FreeBSD check you"
+ " have compiled the OS with\n"
+ "InnoDB: a big enough maximum process size.\n"
+ "InnoDB: Note that in most 32-bit"
+ " computers the process\n"
+ "InnoDB: memory space is limited"
+ " to 2 GB or 4 GB.\n"
+ "InnoDB: We keep retrying"
+ " the allocation for 60 seconds...\n",
+ (ulong) n, (ulong) ut_total_allocated_memory,
+#ifdef __WIN__
+ (ulong) GetLastError()
+#else
+ (ulong) errno
+#endif
+ );
+ }
+
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+ /* Sleep for a second and retry the allocation; maybe this is
+ just a temporary shortage of memory */
+
+ os_thread_sleep(1000000);
+
+ retry_count++;
+
+ goto retry;
+ }
+
+ if (ret == NULL) {
+ /* Flush stderr to make it more probable that the error
+ message gets into the error file before we generate a seg
+ fault */
+
+ fflush(stderr);
+
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+ /* Make an intentional seg fault so that we get a stack
+ trace */
+ /* Intentional segfault on NetWare causes an abend. Avoid this
+ by graceful exit handling in ut_a(). */
+#if (!defined __NETWARE__)
+ if (assert_on_error) {
+ ut_print_timestamp(stderr);
+
+ fprintf(stderr,
+ " InnoDB: We now intentionally"
+ " generate a seg fault so that\n"
+ "InnoDB: on Linux we get a stack trace.\n");
+
+ if (*ut_mem_null_ptr) ut_mem_null_ptr = 0;
+ } else {
+ return(NULL);
+ }
+#else
+ ut_a(0);
+#endif
+ }
+
+ if (set_to_zero) {
+#ifdef UNIV_SET_MEM_TO_ZERO
+ memset(ret, '\0', n + sizeof(ut_mem_block_t));
+#endif
+ }
+
+ UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t));
+
+ ((ut_mem_block_t*)ret)->size = n + sizeof(ut_mem_block_t);
+ ((ut_mem_block_t*)ret)->magic_n = UT_MEM_MAGIC_N;
+
+ ut_total_allocated_memory += n + sizeof(ut_mem_block_t);
+
+ UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list,
+ ((ut_mem_block_t*)ret));
+ os_fast_mutex_unlock(&ut_list_mutex);
+
+ return((void*)((byte*)ret + sizeof(ut_mem_block_t)));
+#else /* !UNIV_HOTBACKUP */
+ void* ret = malloc(n);
+ ut_a(ret || !assert_on_error);
+
+# ifdef UNIV_SET_MEM_TO_ZERO
+ if (set_to_zero) {
+ memset(ret, '\0', n);
+ }
+# endif
+ return(ret);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/**********************************************************************//**
+Allocates memory. Also sets it to zero if UNIV_SET_MEM_TO_ZERO is
+defined.
+@return own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc(
+/*======*/
+ ulint n) /*!< in: number of bytes to allocate */
+{
+#ifndef UNIV_HOTBACKUP
+ return(ut_malloc_low(n, TRUE, TRUE));
+#else /* !UNIV_HOTBACKUP */
+ return(malloc(n));
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Tests if a malloc of n bytes would succeed. ut_malloc() asserts if memory
+runs out, so it cannot be used if we want to return an error message
+instead. Prints a message to stderr if the allocation fails.
+@return TRUE if succeeded */
+UNIV_INTERN
+ibool
+ut_test_malloc(
+/*===========*/
+ ulint n) /*!< in: try to allocate this many bytes */
+{
+ void* ret;
+
+ ret = malloc(n);
+
+ if (ret == NULL) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: cannot allocate"
+ " %lu bytes of memory for\n"
+ "InnoDB: a BLOB with malloc! Total allocated memory\n"
+ "InnoDB: by InnoDB %lu bytes."
+ " Operating system errno: %d\n"
+ "InnoDB: Check if you should increase"
+ " the swap file or\n"
+ "InnoDB: ulimits of your operating system.\n"
+ "InnoDB: On FreeBSD check you have"
+ " compiled the OS with\n"
+ "InnoDB: a big enough maximum process size.\n",
+ (ulong) n,
+ (ulong) ut_total_allocated_memory,
+ (int) errno);
+ return(FALSE);
+ }
+
+ free(ret);
+
+ return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is
+a nop. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+ void* ptr) /*!< in, own: memory block */
+{
+#ifndef UNIV_HOTBACKUP
+ ut_mem_block_t* block;
+
+ if (ptr == NULL) {
+ return;
+ } else if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ free(ptr);
+ return;
+ }
+
+ block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+
+ os_fast_mutex_lock(&ut_list_mutex);
+
+ ut_a(block->magic_n == UT_MEM_MAGIC_N);
+ ut_a(ut_total_allocated_memory >= block->size);
+
+ ut_total_allocated_memory -= block->size;
+
+ UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block);
+ free(block);
+
+ os_fast_mutex_unlock(&ut_list_mutex);
+#else /* !UNIV_HOTBACKUP */
+ free(ptr);
+#endif /* !UNIV_HOTBACKUP */
+}
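/* Editor's illustrative sketch, not part of this patch: a freestanding,
simplified model of the bookkeeping that ut_malloc_low()/ut_free() perform
above. A header carrying the block size and a magic number is prepended to
every allocation and a running total is kept; the demo_* names are invented
for this sketch, and the real code additionally links blocks into
ut_mem_block_list under ut_list_mutex. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_MAGIC	1601650166UL

typedef struct { size_t size; unsigned long magic; } demo_hdr_t;

static size_t	demo_total;	/* plays the role of ut_total_allocated_memory */

static void*
demo_malloc(size_t n)
{
	demo_hdr_t*	h = malloc(sizeof(demo_hdr_t) + n);

	assert(h != NULL);
	h->size = sizeof(demo_hdr_t) + n;
	h->magic = DEMO_MAGIC;
	demo_total += h->size;

	/* Hand out the memory that follows the header. */
	return((char*) h + sizeof(demo_hdr_t));
}

static void
demo_free(void* p)
{
	demo_hdr_t*	h = (demo_hdr_t*) ((char*) p - sizeof(demo_hdr_t));

	assert(h->magic == DEMO_MAGIC);	/* catches simple corruption */
	demo_total -= h->size;
	free(h);
}

int
main(void)
{
	void*	p = demo_malloc(100);

	printf("allocated: %zu bytes\n", demo_total);
	demo_free(p);
	printf("after free: %zu bytes\n", demo_total);

	return(0);
}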
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+ realloc() changes the size of the memory block pointed to
+ by ptr to size bytes. The contents will be unchanged to
+ the minimum of the old and new sizes; newly allocated mem-
+ ory will be uninitialized. If ptr is NULL, the call is
+ equivalent to malloc(size); if size is equal to zero, the
+ call is equivalent to free(ptr). Unless ptr is NULL, it
+ must have been returned by an earlier call to malloc(),
+ calloc() or realloc().
+
+RETURN VALUE
+ realloc() returns a pointer to the newly allocated memory,
+ which is suitably aligned for any kind of variable and may
+ be different from ptr, or NULL if the request fails. If
+ size was equal to 0, either NULL or a pointer suitable to
+ be passed to free() is returned. If realloc() fails the
+ original block is left untouched - it is not freed or
+ moved.
+@return own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+ void* ptr, /*!< in: pointer to old block or NULL */
+ ulint size) /*!< in: desired size */
+{
+ ut_mem_block_t* block;
+ ulint old_size;
+ ulint min_size;
+ void* new_ptr;
+
+ if (UNIV_LIKELY(srv_use_sys_malloc)) {
+ return(realloc(ptr, size));
+ }
+
+ if (ptr == NULL) {
+
+ return(ut_malloc(size));
+ }
+
+ if (size == 0) {
+ ut_free(ptr);
+
+ return(NULL);
+ }
+
+ block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+
+ ut_a(block->magic_n == UT_MEM_MAGIC_N);
+
+ old_size = block->size - sizeof(ut_mem_block_t);
+
+ if (size < old_size) {
+ min_size = size;
+ } else {
+ min_size = old_size;
+ }
+
+ new_ptr = ut_malloc(size);
+
+ if (new_ptr == NULL) {
+
+ return(NULL);
+ }
+
+ /* Copy the old data from ptr */
+ ut_memcpy(new_ptr, ptr, min_size);
+
+ ut_free(ptr);
+
+ return(new_ptr);
+}
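/* Editor's illustrative sketch, not part of this patch: the edge cases from
the man page excerpt above, as ut_realloc() implements them in the tracked
(non srv_use_sys_malloc) path. Assumes ut0mem.h; "buf" is a hypothetical
block. */
static void
ut_realloc_sketch(void)
{
	void*	buf = ut_realloc(NULL, 64);	/* same as ut_malloc(64) */

	buf = ut_realloc(buf, 128);	/* new block, first 64 bytes copied */

	buf = ut_realloc(buf, 0);	/* same as ut_free(buf); returns NULL
					in the tracked path */
}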
+
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void)
+/*=================*/
+{
+ ut_mem_block_t* block;
+
+ ut_a(ut_mem_block_list_inited);
+ ut_mem_block_list_inited = FALSE;
+ os_fast_mutex_free(&ut_list_mutex);
+
+ while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) {
+
+ ut_a(block->magic_n == UT_MEM_MAGIC_N);
+ ut_a(ut_total_allocated_memory >= block->size);
+
+ ut_total_allocated_memory -= block->size;
+
+ UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block);
+ free(block);
+ }
+
+ if (ut_total_allocated_memory != 0) {
+ fprintf(stderr,
+ "InnoDB: Warning: after shutdown"
+ " total allocated memory is %lu\n",
+ (ulong) ut_total_allocated_memory);
+ }
+
+ ut_mem_block_list_inited = FALSE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size) /*!< in: size of destination buffer */
+{
+ ulint src_size = strlen(src);
+
+ if (size != 0) {
+ ulint n = ut_min(src_size, size - 1);
+
+ memcpy(dst, src, n);
+ dst[n] = '\0';
+ }
+
+ return(src_size);
+}
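/* Editor's illustrative sketch, not part of this patch: the usual
truncation test that the strlcpy-style return value above enables. Assumes
ut0mem.h and the univ.i typedefs (ibool, ulint). */
static ibool
copy_name_sketch(
	char*		dst,		/* out: destination buffer */
	ulint		dst_size,	/* in: size of dst */
	const char*	name)		/* in: hypothetical source string */
{
	if (ut_strlcpy(dst, name, dst_size) >= dst_size) {

		return(FALSE);	/* truncated: strlen(name) did not fit */
	}

	return(TRUE);
}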
+
+/**********************************************************************//**
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first.
+@return strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+ char* dst, /*!< in: destination buffer */
+ const char* src, /*!< in: source buffer */
+ ulint size) /*!< in: size of destination buffer */
+{
+ ulint src_size = strlen(src);
+
+ if (size != 0) {
+ ulint n = ut_min(src_size, size - 1);
+
+ memcpy(dst, src + src_size - n, n + 1);
+ }
+
+ return(src_size);
+}
+
+/**********************************************************************//**
+Make a quoted copy of a NUL-terminated string. Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_memcpyq().
+@return pointer to end of dest */
+UNIV_INTERN
+char*
+ut_strcpyq(
+/*=======*/
+ char* dest, /*!< in: output buffer */
+ char q, /*!< in: the quote character */
+ const char* src) /*!< in: null-terminated string */
+{
+ while (*src) {
+ if ((*dest++ = *src++) == q) {
+ *dest++ = q;
+ }
+ }
+
+ return(dest);
+}
+
+/**********************************************************************//**
+Make a quoted copy of a fixed-length string. Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_strcpyq().
+@return pointer to end of dest */
+UNIV_INTERN
+char*
+ut_memcpyq(
+/*=======*/
+ char* dest, /*!< in: output buffer */
+ char q, /*!< in: the quote character */
+ const char* src, /*!< in: string to be quoted */
+ ulint len) /*!< in: length of src */
+{
+ const char* srcend = src + len;
+
+ while (src < srcend) {
+ if ((*dest++ = *src++) == q) {
+ *dest++ = q;
+ }
+ }
+
+ return(dest);
+}
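/* Editor's illustrative sketch, not part of this patch: what the quoting
helpers above produce. They neither add the surrounding quotes nor
NUL-terminate, so the caller does both, as here. Assumes ut0mem.h. */
static void
quote_sketch(void)
{
	char	buf[32];
	char*	end = buf;

	*end++ = '\'';
	end = ut_strcpyq(end, '\'', "d'Artagnan");
	*end++ = '\'';
	*end = '\0';

	/* buf now holds 'd''Artagnan': the embedded quote was doubled,
	the outer quotes were added by hand. */
}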
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Return the number of times s2 occurs in s1. Overlapping instances of s2
+are only counted once.
+@return the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+ const char* s1, /*!< in: string to search in */
+ const char* s2) /*!< in: string to search for */
+{
+ ulint count = 0;
+ ulint len = strlen(s2);
+
+ if (len == 0) {
+
+ return(0);
+ }
+
+ for (;;) {
+ s1 = strstr(s1, s2);
+
+ if (!s1) {
+
+ break;
+ }
+
+ count++;
+ s1 += len;
+ }
+
+ return(count);
+}
+
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once.
+@return own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+ const char* str, /*!< in: string to operate on */
+ const char* s1, /*!< in: string to replace */
+ const char* s2) /*!< in: string to replace s1 with */
+{
+ char* new_str;
+ char* ptr;
+ const char* str_end;
+ ulint str_len = strlen(str);
+ ulint s1_len = strlen(s1);
+ ulint s2_len = strlen(s2);
+ ulint count = 0;
+ int len_delta = (int)s2_len - (int)s1_len;
+
+ str_end = str + str_len;
+
+ if (len_delta <= 0) {
+ len_delta = 0;
+ } else {
+ count = ut_strcount(str, s1);
+ }
+
+ new_str = mem_alloc(str_len + count * len_delta + 1);
+ ptr = new_str;
+
+ while (str) {
+ const char* next = strstr(str, s1);
+
+ if (!next) {
+ next = str_end;
+ }
+
+ memcpy(ptr, str, next - str);
+ ptr += next - str;
+
+ if (next == str_end) {
+
+ break;
+ }
+
+ memcpy(ptr, s2, s2_len);
+ ptr += s2_len;
+
+ str = next + s1_len;
+ }
+
+ *ptr = '\0';
+
+ return(new_str);
+}
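/* Editor's illustrative sketch, not part of this patch: ut_strreplace()
returns a freshly allocated buffer that the caller must release with
mem_free(), as its comment states. Assumes ut0mem.h and mem0mem.h. */
static void
strreplace_sketch(void)
{
	char*	s = ut_strreplace("a/b/c", "/", "::");

	/* s == "a::b::c"; matches are non-overlapping and scanned
	left to right. */
	mem_free(s);
}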
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+void
+test_ut_str_sql_format()
+{
+ char buf[128];
+ ulint ret;
+
+#define CALL_AND_TEST(str, str_len, buf, buf_size, ret_expected, buf_expected)\
+ do {\
+ ibool ok = TRUE;\
+ memset(buf, 'x', 10);\
+ buf[10] = '\0';\
+ fprintf(stderr, "TESTING \"%s\", %lu, %lu\n",\
+ str, (ulint) str_len, (ulint) buf_size);\
+ ret = ut_str_sql_format(str, str_len, buf, buf_size);\
+ if (ret != ret_expected) {\
+ fprintf(stderr, "expected ret %lu, got %lu\n",\
+ (ulint) ret_expected, ret);\
+ ok = FALSE;\
+ }\
+ if (strcmp((char*) buf, buf_expected) != 0) {\
+ fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+ buf_expected, buf);\
+ ok = FALSE;\
+ }\
+ if (ok) {\
+ fprintf(stderr, "OK: %lu, \"%s\"\n\n",\
+ (ulint) ret, buf);\
+ } else {\
+ return;\
+ }\
+ } while (0)
+
+ CALL_AND_TEST("abcd", 4, buf, 0, 0, "xxxxxxxxxx");
+
+ CALL_AND_TEST("abcd", 4, buf, 1, 1, "");
+
+ CALL_AND_TEST("abcd", 4, buf, 2, 1, "");
+
+ CALL_AND_TEST("abcd", 0, buf, 3, 3, "''");
+ CALL_AND_TEST("abcd", 1, buf, 3, 1, "");
+ CALL_AND_TEST("abcd", 2, buf, 3, 1, "");
+ CALL_AND_TEST("abcd", 3, buf, 3, 1, "");
+ CALL_AND_TEST("abcd", 4, buf, 3, 1, "");
+
+ CALL_AND_TEST("abcd", 0, buf, 4, 3, "''");
+ CALL_AND_TEST("abcd", 1, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcd", 2, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcd", 3, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcd", 4, buf, 4, 4, "'a'");
+ CALL_AND_TEST("abcde", 5, buf, 4, 4, "'a'");
+ CALL_AND_TEST("'", 1, buf, 4, 3, "''");
+ CALL_AND_TEST("''", 2, buf, 4, 3, "''");
+ CALL_AND_TEST("a'", 2, buf, 4, 4, "'a'");
+ CALL_AND_TEST("'a", 2, buf, 4, 3, "''");
+ CALL_AND_TEST("ab", 2, buf, 4, 4, "'a'");
+
+ CALL_AND_TEST("abcdef", 0, buf, 5, 3, "''");
+ CALL_AND_TEST("abcdef", 1, buf, 5, 4, "'a'");
+ CALL_AND_TEST("abcdef", 2, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 3, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 4, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 5, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abcdef", 6, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("'", 1, buf, 5, 5, "''''");
+ CALL_AND_TEST("''", 2, buf, 5, 5, "''''");
+ CALL_AND_TEST("a'", 2, buf, 5, 4, "'a'");
+ CALL_AND_TEST("'a", 2, buf, 5, 5, "''''");
+ CALL_AND_TEST("ab", 2, buf, 5, 5, "'ab'");
+ CALL_AND_TEST("abc", 3, buf, 5, 5, "'ab'");
+
+ CALL_AND_TEST("ab", 2, buf, 6, 5, "'ab'");
+
+ CALL_AND_TEST("a'b'c", 5, buf, 32, 10, "'a''b''c'");
+ CALL_AND_TEST("a'b'c'", 6, buf, 32, 12, "'a''b''c'''");
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ut/ut0rbt.c b/storage/xtradb/ut/ut0rbt.c
new file mode 100644
index 00000000000..3d7bc91e714
--- /dev/null
+++ b/storage/xtradb/ut/ut0rbt.c
@@ -0,0 +1,1249 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0rbt.c
+Red-Black tree implementation
+
+Created 2007-03-20 Sunny Bains
+***********************************************************************/
+
+#include "ut0rbt.h"
+
+/************************************************************************
+Definition of a red-black tree
+==============================
+
+A red-black tree is a binary search tree which has the following
+red-black properties:
+
+ 1. Every node is either red or black.
+ 2. Every leaf (NULL - in our case tree->nil) is black.
+ 3. If a node is red, then both its children are black.
+ 4. Every simple path from a node to a descendant leaf contains the
+ same number of black nodes.
+
+ From (3) above, the implication is that on any path from the root
+ to a leaf, red nodes must not be adjacent.
+
+ However, any number of black nodes may appear in a sequence. */
+
+#if defined(IB_RBT_TESTING)
+#warning "Testing enabled!"
+#endif
+
+#define ROOT(t) (t->root->left)
+#define SIZEOF_NODE(t) ((sizeof(ib_rbt_node_t) + t->sizeof_value) - 1)
+
+/****************************************************************//**
+Print out the sub-tree recursively. */
+static
+void
+rbt_print_subtree(
+/*==============*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ const ib_rbt_node_t* node, /*!< in: node to print */
+ ib_rbt_print_node print) /*!< in: print key function */
+{
+ /* FIXME: Doesn't do anything yet */
+ if (node != tree->nil) {
+ print(node);
+ rbt_print_subtree(tree, node->left, print);
+ rbt_print_subtree(tree, node->right, print);
+ }
+}
+
+/****************************************************************//**
+Verify that the keys are in order.
+@return TRUE if OK, FALSE if not ordered */
+static
+ibool
+rbt_check_ordering(
+/*===============*/
+ const ib_rbt_t* tree) /*!< in: tree to verify */
+{
+ const ib_rbt_node_t* node;
+ const ib_rbt_node_t* prev = NULL;
+
+ /* Iterate over all the nodes, comparing each node with the prev */
+ for (node = rbt_first(tree); node; node = rbt_next(tree, prev)) {
+
+ if (prev && tree->compare(prev->value, node->value) >= 0) {
+ return(FALSE);
+ }
+
+ prev = node;
+ }
+
+ return(TRUE);
+}
+
+/****************************************************************//**
+Check that every path from the root to the leaves has the same count.
+Count is expressed in the number of black nodes.
+@return 0 on failure else black height of the subtree */
+static
+ibool
+rbt_count_black_nodes(
+/*==================*/
+ const ib_rbt_t* tree, /*!< in: tree to verify */
+ const ib_rbt_node_t* node) /*!< in: start of sub-tree */
+{
+ ulint result;
+
+ if (node != tree->nil) {
+ ulint left_height = rbt_count_black_nodes(tree, node->left);
+
+ ulint right_height = rbt_count_black_nodes(tree, node->right);
+
+ if (left_height == 0
+ || right_height == 0
+ || left_height != right_height) {
+
+ result = 0;
+ } else if (node->color == IB_RBT_RED) {
+
+ /* Case 3 */
+ if (node->left->color != IB_RBT_BLACK
+ || node->right->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+ result = left_height;
+ }
+ /* Check if it's anything other than RED or BLACK. */
+ } else if (node->color != IB_RBT_BLACK) {
+
+ result = 0;
+ } else {
+
+ result = right_height + 1;
+ }
+ } else {
+ result = 1;
+ }
+
+ return(result);
+}
+
+/****************************************************************//**
+Turn the node's right child's left sub-tree into node's right sub-tree.
+This will also make node's right child its parent. */
+static
+void
+rbt_rotate_left(
+/*============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of the tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* right = node->right;
+
+ node->right = right->left;
+
+ if (right->left != nil) {
+ right->left->parent = node;
+ }
+
+ /* Right's new parent was node's parent. */
+ right->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->left) {
+ /* Node was on the left of its parent. */
+ node->parent->left = right;
+ } else {
+ /* Node must have been on the right. */
+ node->parent->right = right;
+ }
+
+ /* Finally, put node on right's left. */
+ right->left = node;
+ node->parent = right;
+}
+
+/****************************************************************//**
+Turn the node's left child's right sub-tree into node's left sub-tree.
+This also makes node's left child its parent. */
+static
+void
+rbt_rotate_right(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: nil node of tree */
+ ib_rbt_node_t* node) /*!< in: node to rotate */
+{
+ ib_rbt_node_t* left = node->left;
+
+ node->left = left->right;
+
+ if (left->right != nil) {
+ left->right->parent = node;
+ }
+
+ /* Left's new parent was node's parent. */
+ left->parent = node->parent;
+
+ /* Since root's parent is tree->nil and root->parent->left points
+ back to root, we can avoid the check. */
+ if (node == node->parent->right) {
+ /* Node was on the right of its parent. */
+ node->parent->right = left;
+ } else {
+ /* Node must have been on the left. */
+ node->parent->left = left;
+ }
+
+ /* Finally, put node on left's right. */
+ left->right = node;
+ node->parent = left;
+}
+
+/****************************************************************//**
+Append a node to the tree.
+@return inserted node */
+static
+ib_rbt_node_t*
+rbt_tree_add_child(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rbt tree */
+ ib_rbt_bound_t* parent, /*!< in: node's parent */
+ ib_rbt_node_t* node) /*!< in: node to add */
+{
+ /* Cast away the const. */
+ ib_rbt_node_t* last = (ib_rbt_node_t*) parent->last;
+
+ if (last == tree->root || parent->result < 0) {
+ last->left = node;
+ } else {
+ /* FIXME: We don't handle duplicates (yet)! */
+ ut_a(parent->result != 0);
+
+ last->right = node;
+ }
+
+ node->parent = last;
+
+ return(node);
+}
+
+/****************************************************************//**
+Generic binary tree insert
+@return inserted node */
+static
+ib_rbt_node_t*
+rbt_tree_insert(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ ib_rbt_node_t* node) /*!< in: node hold the insert value */
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ parent.result = 0;
+ parent.last = tree->root;
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+
+ parent.last = current;
+ parent.result = tree->compare(key, current->value);
+
+ if (parent.result < 0) {
+ current = current->left;
+ } else {
+ current = current->right;
+ }
+ }
+
+ ut_a(current == tree->nil);
+
+ rbt_tree_add_child(tree, &parent, node);
+
+ return(node);
+}
+
+/****************************************************************//**
+Balance a tree after inserting a node. */
+static
+void
+rbt_balance_tree(
+/*=============*/
+ const ib_rbt_t* tree, /*!< in: tree to balance */
+ ib_rbt_node_t* node) /*!< in: node that was inserted */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* parent = node->parent;
+
+ /* Restore the red-black property. */
+ node->color = IB_RBT_RED;
+
+ while (node != ROOT(tree) && parent->color == IB_RBT_RED) {
+ ib_rbt_node_t* grand_parent = parent->parent;
+
+ if (parent == grand_parent->left) {
+ ib_rbt_node_t* uncle = grand_parent->right;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->right) {
+ /* Right is a black node and node is
+ to the right, case 2 - move node
+ up and rotate. */
+ node = parent;
+ rbt_rotate_left(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_right(nil, grand_parent);
+ }
+
+ } else {
+ ib_rbt_node_t* uncle = grand_parent->left;
+
+ if (uncle->color == IB_RBT_RED) {
+
+ /* Case 1 - change the colors. */
+ uncle->color = IB_RBT_BLACK;
+ parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ /* Move node up the tree. */
+ node = grand_parent;
+
+ } else {
+
+ if (node == parent->left) {
+ /* Left is a black node and node is to
+ the left, case 2 - move node up and
+ rotate. */
+ node = parent;
+ rbt_rotate_right(nil, node);
+ }
+
+ grand_parent = node->parent->parent;
+
+ /* Case 3. */
+ node->parent->color = IB_RBT_BLACK;
+ grand_parent->color = IB_RBT_RED;
+
+ rbt_rotate_left(nil, grand_parent);
+ }
+ }
+
+ parent = node->parent;
+ }
+
+ /* Color the root black. */
+ ROOT(tree)->color = IB_RBT_BLACK;
+}
+
+/****************************************************************//**
+Find the given node's successor.
+@return successor node or NULL if no successor */
+static
+ib_rbt_node_t*
+rbt_find_successor(
+/*===============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current)/*!< in: this is declared const
+ because it can be called via
+ rbt_next() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* next = current->right;
+
+ /* Is there a sub-tree to the right that we can follow. */
+ if (next != nil) {
+
+ /* Follow the left most links of the current right child. */
+ while (next->left != nil) {
+ next = next->left;
+ }
+
+ } else { /* We will have to go up the tree to find the successor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ next = (ib_rbt_node_t*) current;
+
+ while (parent != tree->root && next == parent->right) {
+ next = parent;
+ parent = next->parent;
+ }
+
+ next = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(next);
+}
+
+/****************************************************************//**
+Find the given node's predecessor.
+@return predecessor node or NULL if no predecessor */
+static
+ib_rbt_node_t*
+rbt_find_predecessor(
+/*=================*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current) /*!< in: this is declared const
+ because it can be called via
+ rbt_prev() */
+{
+ const ib_rbt_node_t* nil = tree->nil;
+ ib_rbt_node_t* prev = current->left;
+
+ /* Is there a sub-tree to the left that we can follow. */
+ if (prev != nil) {
+
+ /* Follow the right most links of the current left child. */
+ while (prev->right != nil) {
+ prev = prev->right;
+ }
+
+ } else { /* We will have to go up the tree to find the predecessor. */
+ ib_rbt_node_t* parent = current->parent;
+
+ /* Cast away the const. */
+ prev = (ib_rbt_node_t*)current;
+
+ while (parent != tree->root && prev == parent->left) {
+ prev = parent;
+ parent = prev->parent;
+ }
+
+ prev = (parent == tree->root) ? NULL : parent;
+ }
+
+ return(prev);
+}
+
+/****************************************************************//**
+Replace node with child. After applying the transformations, eject becomes
+an orphan. */
+static
+void
+rbt_eject_node(
+/*===========*/
+ ib_rbt_node_t* eject, /*!< in: node to eject */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ /* Update the to be ejected node's parent's child pointers. */
+ if (eject->parent->left == eject) {
+ eject->parent->left = node;
+ } else if (eject->parent->right == eject) {
+ eject->parent->right = node;
+ } else {
+ ut_a(0);
+ }
+ /* eject is now an orphan but otherwise its pointers
+ and color are left intact. */
+
+ node->parent = eject->parent;
+}
+
+/****************************************************************//**
+Replace a node with another node. */
+static
+void
+rbt_replace_node(
+/*=============*/
+ ib_rbt_node_t* replace, /*!< in: node to replace */
+ ib_rbt_node_t* node) /*!< in: node to replace with */
+{
+ ib_rbt_color_t color = node->color;
+
+ /* Update the node pointers. */
+ node->left = replace->left;
+ node->right = replace->right;
+
+ /* Update the child node pointers. */
+ node->left->parent = node;
+ node->right->parent = node;
+
+ /* Make the parent of replace point to node. */
+ rbt_eject_node(replace, node);
+
+ /* Swap the colors. */
+ node->color = replace->color;
+ replace->color = color;
+}
+
+/****************************************************************//**
+Detach node from the tree, replacing it with one of its children.
+@return the child node that now occupies the position of the detached node */
+static
+ib_rbt_node_t*
+rbt_detach_node(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to detach */
+{
+ ib_rbt_node_t* child;
+ const ib_rbt_node_t* nil = tree->nil;
+
+ if (node->left != nil && node->right != nil) {
+ /* Case where the node to be deleted has two children. */
+ ib_rbt_node_t* successor = rbt_find_successor(tree, node);
+
+ ut_a(successor != nil);
+ ut_a(successor->parent != nil);
+ ut_a(successor->left == nil);
+
+ child = successor->right;
+
+ /* Remove the successor node and replace with its child. */
+ rbt_eject_node(successor, child);
+
+ /* Replace the node to delete with its successor node. */
+ rbt_replace_node(node, successor);
+ } else {
+ ut_a(node->left == nil || node->right == nil);
+
+ child = (node->left != nil) ? node->left : node->right;
+
+ /* Replace the node to delete with one of its children. */
+ rbt_eject_node(node, child);
+ }
+
+ /* Reset the node links. */
+ node->parent = node->right = node->left = tree->nil;
+
+ return(child);
+}
+
+/****************************************************************//**
+Rebalance the right sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_right(
+/*==============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling)/*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+
+ sibling = parent->right;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Since this will violate case 3 because of the change above. */
+ if (sibling->left->color == IB_RBT_BLACK
+ && sibling->right->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->right->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->left->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->left->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, sibling);
+
+ sibling = parent->right;
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->right->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, parent);
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Rebalance the left sub-tree after deletion.
+@return node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_left(
+/*=============*/
+ const ib_rbt_node_t* nil, /*!< in: rb tree nil node */
+ ib_rbt_node_t* parent, /*!< in: parent node */
+ ib_rbt_node_t* sibling)/*!< in: sibling node */
+{
+ ib_rbt_node_t* node = NULL;
+
+ ut_a(sibling != nil);
+
+ /* Case 3. */
+ if (sibling->color == IB_RBT_RED) {
+
+ parent->color = IB_RBT_RED;
+ sibling->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ /* Since this will violate case 3 because of the change above. */
+ if (sibling->right->color == IB_RBT_BLACK
+ && sibling->left->color == IB_RBT_BLACK) {
+
+ node = parent; /* Parent needs to be rebalanced too. */
+ sibling->color = IB_RBT_RED;
+
+ } else {
+ if (sibling->left->color == IB_RBT_BLACK) {
+
+ ut_a(sibling->right->color == IB_RBT_RED);
+
+ sibling->color = IB_RBT_RED;
+ sibling->right->color = IB_RBT_BLACK;
+
+ rbt_rotate_left(nil, sibling);
+
+ sibling = parent->left;
+
+ ut_a(sibling != nil);
+ }
+
+ sibling->color = parent->color;
+ sibling->left->color = IB_RBT_BLACK;
+
+ parent->color = IB_RBT_BLACK;
+
+ rbt_rotate_right(nil, parent);
+ }
+
+ return(node);
+}
+
+/****************************************************************//**
+Delete the node and rebalance the tree if necessary */
+static
+void
+rbt_remove_node_and_rebalance(
+/*==========================*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_node_t* node) /*!< in: node to remove */
+{
+ /* Detach node and get the node that will be used
+ as rebalance start. */
+ ib_rbt_node_t* child = rbt_detach_node(tree, node);
+
+ if (node->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* last = child;
+
+ ROOT(tree)->color = IB_RBT_RED;
+
+ while (child && child->color == IB_RBT_BLACK) {
+ ib_rbt_node_t* parent = child->parent;
+
+ /* Did the deletion cause an imbalance in the
+ parent's left sub-tree? */
+ if (parent->left == child) {
+
+ child = rbt_balance_right(
+ tree->nil, parent, parent->right);
+
+ } else if (parent->right == child) {
+
+ child = rbt_balance_left(
+ tree->nil, parent, parent->left);
+
+ } else {
+ ut_error;
+ }
+
+ if (child) {
+ last = child;
+ }
+ }
+
+ ut_a(last);
+
+ last->color = IB_RBT_BLACK;
+ ROOT(tree)->color = IB_RBT_BLACK;
+ }
+
+ /* Note that we have removed a node from the tree. */
+ --tree->n_nodes;
+}
+
+/****************************************************************//**
+Recursively free the nodes. */
+static
+void
+rbt_free_node(
+/*==========*/
+ ib_rbt_node_t* node, /*!< in: node to free */
+ ib_rbt_node_t* nil) /*!< in: rb tree nil node */
+{
+ if (node != nil) {
+ rbt_free_node(node->left, nil);
+ rbt_free_node(node->right, nil);
+
+ ut_free(node);
+ }
+}
+
+/****************************************************************//**
+Free all the nodes and free the tree. */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+ ib_rbt_t* tree) /*!< in: rb tree to free */
+{
+ rbt_free_node(tree->root, tree->nil);
+ ut_free(tree->nil);
+ ut_free(tree);
+}
+
+/****************************************************************//**
+Create an instance of a red black tree.
+@return an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+ size_t sizeof_value, /*!< in: sizeof data item */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_t* tree;
+ ib_rbt_node_t* node;
+
+ tree = (ib_rbt_t*) ut_malloc(sizeof(*tree));
+ memset(tree, 0, sizeof(*tree));
+
+ tree->sizeof_value = sizeof_value;
+
+ /* Create the sentinel (NIL) node. */
+ node = tree->nil = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = node;
+
+ /* Create the "fake" root, the real root node will be the
+ left child of this node. */
+ node = tree->root = (ib_rbt_node_t*) ut_malloc(sizeof(*node));
+ memset(node, 0, sizeof(*node));
+
+ node->color = IB_RBT_BLACK;
+ node->parent = node->left = node->right = tree->nil;
+
+ tree->compare = compare;
+
+ return(tree);
+}
+
+/****************************************************************//**
+Generic insert of a value in the rb tree.
+@return inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key, /*!< in: key for ordering */
+ const void* value) /*!< in: value of key, this value
+ is copied to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data. */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* Insert in the tree in the usual way. */
+ rbt_tree_insert(tree, key, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+ return(node);
+}
+
+/****************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: bounds */
+ const void* value) /*!< in: this value is copied
+ to the node */
+{
+ ib_rbt_node_t* node;
+
+ /* Create the node that will hold the value data */
+ node = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+ memcpy(node->value, value, tree->sizeof_value);
+ node->parent = node->left = node->right = tree->nil;
+
+ /* If tree is empty */
+ if (parent->last == NULL) {
+ parent->last = tree->root;
+ }
+
+ /* Append the node, the hope here is that the caller knows
+ what s/he is doing. */
+ rbt_tree_add_child(tree, parent, node);
+ rbt_balance_tree(tree, node);
+
+ ++tree->n_nodes;
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(tree));
+#endif
+ return(node);
+}
+
+/****************************************************************//**
+Find a matching node in the rb tree.
+@return NULL if not found else the node where key was found */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to use for search */
+{
+ const ib_rbt_node_t* current = ROOT(tree);
+
+ /* Regular binary search. */
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result < 0) {
+ current = current->left;
+ } else if (result > 0) {
+ current = current->right;
+ } else {
+ break;
+ }
+ }
+
+ return(current != tree->nil ? current : NULL);
+}
+
+/****************************************************************//**
+Delete a node from the red black tree, identified by key.
+@return TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to delete */
+{
+ ibool deleted = FALSE;
+ ib_rbt_node_t* node = (ib_rbt_node_t*) rbt_lookup(tree, key);
+
+ if (node) {
+ rbt_remove_node_and_rebalance(tree, node);
+
+ ut_free(node);
+ deleted = TRUE;
+ }
+
+ return(deleted);
+}
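/* Editor's illustrative sketch, not part of this patch: a minimal lifecycle
of the tree defined in this file, keyed on plain ints. The comparator
signature is assumed to match ib_rbt_compare as it is invoked through
tree->compare(key, node->value); everything else uses only functions defined
above. Assumes ut0rbt.h. */
static int
cmp_int_sketch(const void* a, const void* b)
{
	int	lhs = *(const int*) a;
	int	rhs = *(const int*) b;

	return(lhs < rhs ? -1 : lhs > rhs ? 1 : 0);
}

static void
rbt_usage_sketch(void)
{
	int			key = 42;
	ib_rbt_t*		tree = rbt_create(sizeof(int), cmp_int_sketch);
	const ib_rbt_node_t*	node;

	rbt_insert(tree, &key, &key);	/* key and value are the same int */

	node = rbt_lookup(tree, &key);
	ut_a(node != NULL && *(const int*) node->value == 42);

	rbt_delete(tree, &key);		/* unlinks and frees the node */
	rbt_free(tree);			/* frees nil, fake root, any nodes */
}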
+
+/****************************************************************//**
+Remove a node from the rb tree; the node is not freed, that is the
+caller's responsibility.
+@return deleted node but without the const */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+ ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* const_node) /*!< in: node to delete, this
+ is a fudge and declared const
+ because the caller can access
+ only const nodes */
+{
+ /* Cast away the const. */
+ rbt_remove_node_and_rebalance(tree, (ib_rbt_node_t*) const_node);
+
+ /* This is to make it easier to do something like this:
+ ut_free(rbt_remove_node(node));
+ */
+
+ return((ib_rbt_node_t*) const_node);
+}
+
+/****************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return node satisfying the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* lb_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result > 0) {
+
+ current = current->right;
+
+ } else if (result < 0) {
+
+ lb_node = current;
+ current = current->left;
+
+ } else {
+ lb_node = current;
+ break;
+ }
+ }
+
+ return(lb_node);
+}
+
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return node satisfying the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* ub_node = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ int result = tree->compare(key, current->value);
+
+ if (result > 0) {
+
+ ub_node = current;
+ current = current->right;
+
+ } else if (result < 0) {
+
+ current = current->left;
+
+ } else {
+ ub_node = current;
+ break;
+ }
+ }
+
+ return(ub_node);
+}
+
+/****************************************************************//**
+Search for the key, recording the last node visited and the result of the
+last comparison in the given bound structure.
+@return result of the last comparison; 0 if an exact match was found */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key) /*!< in: key to search */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Everything is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+ parent->result = tree->compare(key, current->value);
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
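/* Editor's illustrative sketch, not part of this patch: the search-then-add
idiom that rbt_merge_uniq() below also relies on. rbt_search() records the
last visited node and comparison result in the bound; if no exact match was
found, rbt_add_node() attaches the value at exactly that position. Assumes
ut0rbt.h; "tree" and "key" are hypothetical. */
static void
insert_if_absent_sketch(ib_rbt_t* tree, const int* key)
{
	ib_rbt_bound_t	parent;

	if (rbt_search(tree, &parent, key) != 0) {

		rbt_add_node(tree, &parent, key);
	}
}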
+
+/****************************************************************//**
+Like rbt_search(), but uses the supplied comparison function instead of
+the tree's own compare function.
+@return result of the last comparison; 0 if an exact match was found */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ ib_rbt_bound_t* parent, /*!< in: search bounds */
+ const void* key, /*!< in: key to search */
+ ib_rbt_compare compare) /*!< in: fn to compare items */
+{
+ ib_rbt_node_t* current = ROOT(tree);
+
+ /* Everything is greater than the NULL root. */
+ parent->result = 1;
+ parent->last = NULL;
+
+ while (current != tree->nil) {
+
+ parent->last = current;
+ parent->result = compare(key, current->value);
+
+ if (parent->result > 0) {
+ current = current->right;
+ } else if (parent->result < 0) {
+ current = current->left;
+ } else {
+ break;
+ }
+ }
+
+ return(parent->result);
+}
+
+/****************************************************************//**
+Return the leftmost node in the tree.
+@return the leftmost node or NULL if the tree is empty */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* first = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ first = current;
+ current = current->left;
+ }
+
+ return(first);
+}
+
+/****************************************************************//**
+Return the right most node in the tree.
+@return the rightmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+ const ib_rbt_t* tree) /*!< in: rb tree */
+{
+ ib_rbt_node_t* last = NULL;
+ ib_rbt_node_t* current = ROOT(tree);
+
+ while (current != tree->nil) {
+ last = current;
+ current = current->right;
+ }
+
+ return(last);
+}
+
+/****************************************************************//**
+Return the next node.
+@return the node following current, or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current)/*!< in: current node */
+{
+ return(current ? rbt_find_successor(tree, current) : NULL);
+}
+
+/****************************************************************//**
+Return the previous node.
+@return the node preceding current, or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+ const ib_rbt_t* tree, /*!< in: rb tree */
+ const ib_rbt_node_t* current)/*!< in: current node */
+{
+ return(current ? rbt_find_predecessor(tree, current) : NULL);
+}
+
+/****************************************************************//**
+Reset the tree. Delete all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+ ib_rbt_t* tree) /*!< in: rb tree */
+{
+ rbt_free_node(ROOT(tree), tree->nil);
+
+ tree->n_nodes = 0;
+ tree->root->left = tree->root->right = tree->nil;
+}
+
+/****************************************************************//**
+Merge the nodes from src into dst, skipping keys that already exist in dst.
+@return no. of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ const ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ulint n_merged = 0;
+ const ib_rbt_node_t* src_node = rbt_first(src);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (/* No op */; src_node; src_node = rbt_next(src, src_node)) {
+
+ if (rbt_search(dst, &parent, src_node->value) != 0) {
+ rbt_add_node(dst, &parent, src_node->value);
+ ++n_merged;
+ }
+ }
+
+ return(n_merged);
+}
+
+/****************************************************************//**
+Merge the nodes from src into dst, removing each merged node from src.
+Nodes whose keys already exist in dst are left untouched in src.
+@return no. of nodes merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+ ib_rbt_t* dst, /*!< in: dst rb tree */
+ ib_rbt_t* src) /*!< in: src rb tree */
+{
+ ib_rbt_bound_t parent;
+ ib_rbt_node_t* src_node;
+ ulint old_size = rbt_size(dst);
+
+ if (rbt_empty(src) || dst == src) {
+ return(0);
+ }
+
+ for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
+ ib_rbt_node_t* prev = src_node;
+
+ src_node = (ib_rbt_node_t*)rbt_next(src, prev);
+
+ /* Skip duplicates. */
+ if (rbt_search(dst, &parent, prev->value) != 0) {
+
+ /* Remove and reset the node but preserve
+ the node (data) value. */
+ rbt_remove_node_and_rebalance(src, prev);
+
+ /* The nil should be taken from the dst tree. */
+ prev->parent = prev->left = prev->right = dst->nil;
+ rbt_tree_add_child(dst, &parent, prev);
+ rbt_balance_tree(dst, prev);
+
+ ++dst->n_nodes;
+ }
+ }
+
+#if defined(IB_RBT_TESTING)
+ ut_a(rbt_validate(dst));
+ ut_a(rbt_validate(src));
+#endif
+ return(rbt_size(dst) - old_size);
+}
+
+/****************************************************************//**
+Check that every path from the root to the leaves has the same count and
+the tree nodes are in order.
+@return TRUE if OK, FALSE otherwise */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+ const ib_rbt_t* tree) /*!< in: RB tree to validate */
+{
+ if (rbt_count_black_nodes(tree, ROOT(tree)) > 0) {
+ return(rbt_check_ordering(tree));
+ }
+
+ return(FALSE);
+}
+
+/****************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+ const ib_rbt_t* tree, /*!< in: tree to traverse */
+ ib_rbt_print_node print) /*!< in: print function */
+{
+ rbt_print_subtree(tree, ROOT(tree), print);
+}
diff --git a/storage/xtradb/ut/ut0rnd.c b/storage/xtradb/ut/ut0rnd.c
new file mode 100644
index 00000000000..cefd0990ecc
--- /dev/null
+++ b/storage/xtradb/ut/ut0rnd.c
@@ -0,0 +1,97 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0rnd.c
+Random numbers and hashing
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0rnd.h"
+
+#ifdef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+/** These random numbers are used in ut_find_prime */
+/*@{*/
+#define UT_RANDOM_1 1.0412321
+#define UT_RANDOM_2 1.1131347
+#define UT_RANDOM_3 1.0132677
+/*@}*/
+
+/** Seed value of ut_rnd_gen_ulint(). */
+UNIV_INTERN ulint ut_rnd_ulint_counter = 65654363;
+
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+ ulint n) /*!< in: positive number > 100 */
+{
+ ulint pow2;
+ ulint i;
+
+ n += 100;
+
+ pow2 = 1;
+ while (pow2 * 2 < n) {
+ pow2 = 2 * pow2;
+ }
+
+ if ((double)n < 1.05 * (double)pow2) {
+ n = (ulint) ((double)n * UT_RANDOM_1);
+ }
+
+ pow2 = 2 * pow2;
+
+ if ((double)n > 0.95 * (double)pow2) {
+ n = (ulint) ((double)n * UT_RANDOM_2);
+ }
+
+ if (n > pow2 - 20) {
+ n += 30;
+ }
+
+ /* Now we have n far enough from powers of 2. To make
+ n more random (especially if it was not near
+ a power of 2), we then multiply it by a random number. */
+
+ n = (ulint) ((double)n * UT_RANDOM_3);
+
+ for (;; n++) {
+ i = 2;
+ while (i * i <= n) {
+ if (n % i == 0) {
+ goto next_n;
+ }
+ i++;
+ }
+
+ /* Found a prime */
+ break;
+next_n: ;
+ }
+
+ return(n);
+}
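/* Editor's illustrative sketch, not part of this patch: ut_find_prime() is
typically used to size hash arrays. The sketch asserts the documented
post-conditions: the result is at least the requested size and has no
non-trivial divisor. Assumes ut0rnd.h and the ut_a() assertion macro. */
static void
find_prime_sketch(void)
{
	ulint	n = ut_find_prime(1000);
	ulint	i;

	ut_a(n >= 1000);

	for (i = 2; i * i <= n; i++) {
		ut_a(n % i != 0);	/* trial division finds no factor */
	}
}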
diff --git a/storage/xtradb/ut/ut0ut.c b/storage/xtradb/ut/ut0ut.c
new file mode 100644
index 00000000000..498873e290a
--- /dev/null
+++ b/storage/xtradb/ut/ut0ut.c
@@ -0,0 +1,625 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0ut.c
+Various utilities for Innobase.
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0ut.h"
+
+#ifdef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "ha_prototypes.h"
+# include "mysql_com.h" /* NAME_LEN */
+#endif /* UNIV_HOTBACKUP */
+
+/** A constant to prevent the compiler from optimizing ut_delay() away. */
+UNIV_INTERN ibool ut_always_false = FALSE;
+
+#ifdef __WIN__
+/*****************************************************************//**
+NOTE: The Windows epoch starts from 1601/01/01 whereas the Unix
+epoch starts from 1970/1/1. For selection of constant see:
+http://support.microsoft.com/kb/167296/ */
+#define WIN_TO_UNIX_DELTA_USEC ((ib_int64_t) 11644473600000000ULL)
+
+
+/*****************************************************************//**
+This is the Windows version of gettimeofday(2).
+@return 0 if all OK else -1 */
+static
+int
+ut_gettimeofday(
+/*============*/
+ struct timeval* tv, /*!< out: Values are relative to Unix epoch */
+ void* tz) /*!< in: not used */
+{
+ FILETIME ft;
+ ib_int64_t tm;
+
+ if (!tv) {
+ errno = EINVAL;
+ return(-1);
+ }
+
+ GetSystemTimeAsFileTime(&ft);
+
+ tm = (ib_int64_t) ft.dwHighDateTime << 32;
+ tm |= ft.dwLowDateTime;
+
+ ut_a(tm >= 0); /* If tm wraps over to negative, the quotient / 10
+ does not work */
+
+ tm /= 10; /* Convert from 100 nsec periods to usec */
+
+ /* If we don't convert to the Unix epoch the value for
+ struct timeval::tv_sec will overflow.*/
+ tm -= WIN_TO_UNIX_DELTA_USEC;
+
+ tv->tv_sec = (long) (tm / 1000000L);
+ tv->tv_usec = (long) (tm % 1000000L);
+
+ return(0);
+}
+#else
+/** An alias for gettimeofday(2). On Microsoft Windows, we have to
+reimplement this function. */
+#define ut_gettimeofday gettimeofday
+#endif
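/* Editor's illustrative sketch, not part of this patch: a freestanding
check of the epoch offset used by the Windows branch above. 1601-01-01 to
1970-01-01 spans 369 years containing 89 leap years (92 multiples of four
minus the non-leap centuries 1700, 1800 and 1900), i.e. 134774 days, which
matches WIN_TO_UNIX_DELTA_USEC when expressed in microseconds. */
#include <assert.h>

int
main(void)
{
	unsigned long long	days = 369ULL * 365 + 89;	/* 134774 */
	unsigned long long	usec = days * 86400ULL * 1000000ULL;

	assert(usec == 11644473600000000ULL);

	return(0);
}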
+
+/********************************************************//**
+Gets the high 32 bits of a ulint. That is, makes a shift >> 32,
+but since there seem to be compiler bugs in both gcc and Visual C++,
+we do this by a special conversion.
+@return a >> 32 */
+UNIV_INTERN
+ulint
+ut_get_high32(
+/*==========*/
+ ulint a) /*!< in: ulint */
+{
+ ib_int64_t i;
+
+ i = (ib_int64_t)a;
+
+ i = i >> 32;
+
+ return((ulint)i);
+}
+
+/**********************************************************//**
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime.
+@return system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void)
+/*=========*/
+{
+ return(time(NULL));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Returns the system time in seconds and microseconds through the out parameters.
+Upon successful completion, the value 0 is returned; otherwise the
+value -1 is returned and the global variable errno is set to indicate the
+error.
+@return 0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+ ulint* sec, /*!< out: seconds since the Epoch */
+ ulint* ms) /*!< out: microseconds since the Epoch+*sec */
+{
+ struct timeval tv;
+ int ret;
+ int errno_gettimeofday;
+ int i;
+
+ for (i = 0; i < 10; i++) {
+
+ ret = ut_gettimeofday(&tv, NULL);
+
+ if (ret == -1) {
+ errno_gettimeofday = errno;
+ ut_print_timestamp(stderr);
+ fprintf(stderr, " InnoDB: gettimeofday(): %s\n",
+ strerror(errno_gettimeofday));
+ os_thread_sleep(100000); /* 0.1 sec */
+ errno = errno_gettimeofday;
+ } else {
+ break;
+ }
+ }
+
+ if (ret != -1) {
+ *sec = (ulint) tv.tv_sec;
+ *ms = (ulint) tv.tv_usec;
+ }
+
+ return(ret);
+}
+
+/**********************************************************//**
+Returns the number of microseconds since epoch. Similar to
+time(3), the return value is also stored in *tloc, provided
+that tloc is non-NULL.
+@return us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+ ullint* tloc) /*!< out: us since epoch, if non-NULL */
+{
+ struct timeval tv;
+ ullint us;
+
+ ut_gettimeofday(&tv, NULL);
+
+ us = (ullint) tv.tv_sec * 1000000 + tv.tv_usec;
+
+ if (tloc != NULL) {
+ *tloc = us;
+ }
+
+ return(us);
+}
+
+/**********************************************************//**
+Returns the number of milliseconds since some epoch. The
+value may wrap around. It should only be used for heuristic
+purposes.
+@return ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void)
+/*============*/
+{
+ struct timeval tv;
+
+ ut_gettimeofday(&tv, NULL);
+
+ return((ulint) tv.tv_sec * 1000 + tv.tv_usec / 1000);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Returns the difference of two times in seconds.
+@return time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+ ib_time_t time2, /*!< in: time */
+ ib_time_t time1) /*!< in: time */
+{
+ return(difftime(time2, time1));
+}
+
+/**********************************************************//**
+Prints a timestamp to a file. */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+ FILE* file) /*!< in: file where to print */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
+ (int)cal_tm.wYear % 100,
+ (int)cal_tm.wMonth,
+ (int)cal_tm.wDay,
+ (int)cal_tm.wHour,
+ (int)cal_tm.wMinute,
+ (int)cal_tm.wSecond);
+#else
+ struct tm cal_tm;
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+ time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ cal_tm_ptr = localtime(&tm);
+#endif
+ fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
+ cal_tm_ptr->tm_year % 100,
+ cal_tm_ptr->tm_mon + 1,
+ cal_tm_ptr->tm_mday,
+ cal_tm_ptr->tm_hour,
+ cal_tm_ptr->tm_min,
+ cal_tm_ptr->tm_sec);
+#endif
+}
+
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+ char* buf) /*!< in: buffer where to sprintf */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+ (int)cal_tm.wYear % 100,
+ (int)cal_tm.wMonth,
+ (int)cal_tm.wDay,
+ (int)cal_tm.wHour,
+ (int)cal_tm.wMinute,
+ (int)cal_tm.wSecond);
+#else
+ struct tm cal_tm;
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+ time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ cal_tm_ptr = localtime(&tm);
+#endif
+ sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+ cal_tm_ptr->tm_year % 100,
+ cal_tm_ptr->tm_mon + 1,
+ cal_tm_ptr->tm_mday,
+ cal_tm_ptr->tm_hour,
+ cal_tm_ptr->tm_min,
+ cal_tm_ptr->tm_sec);
+#endif
+}
+
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+ char* buf) /*!< in: buffer where to sprintf */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ sprintf(buf, "%02d%02d%02d_%02d_%02d_%02d",
+ (int)cal_tm.wYear % 100,
+ (int)cal_tm.wMonth,
+ (int)cal_tm.wDay,
+ (int)cal_tm.wHour,
+ (int)cal_tm.wMinute,
+ (int)cal_tm.wSecond);
+#else
+ struct tm cal_tm;
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+ time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ cal_tm_ptr = localtime(&tm);
+#endif
+ sprintf(buf, "%02d%02d%02d_%02d_%02d_%02d",
+ cal_tm_ptr->tm_year % 100,
+ cal_tm_ptr->tm_mon + 1,
+ cal_tm_ptr->tm_mday,
+ cal_tm_ptr->tm_hour,
+ cal_tm_ptr->tm_min,
+ cal_tm_ptr->tm_sec);
+#endif
+}
+
+/**********************************************************//**
+Returns current year, month, day. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+ ulint* year, /*!< out: current year */
+ ulint* month, /*!< out: month */
+ ulint* day) /*!< out: day */
+{
+#ifdef __WIN__
+ SYSTEMTIME cal_tm;
+
+ GetLocalTime(&cal_tm);
+
+ *year = (ulint)cal_tm.wYear;
+ *month = (ulint)cal_tm.wMonth;
+ *day = (ulint)cal_tm.wDay;
+#else
+ struct tm cal_tm;
+ struct tm* cal_tm_ptr;
+ time_t tm;
+
+ time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+ localtime_r(&tm, &cal_tm);
+ cal_tm_ptr = &cal_tm;
+#else
+ cal_tm_ptr = localtime(&tm);
+#endif
+ *year = (ulint)cal_tm_ptr->tm_year + 1900;
+ *month = (ulint)cal_tm_ptr->tm_mon + 1;
+ *day = (ulint)cal_tm_ptr->tm_mday;
+#endif
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Runs an idle loop on the CPU. The argument gives the desired delay
+in microseconds on a 100 MHz Pentium + Visual C++.
+@return dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+ ulint delay) /*!< in: delay in microseconds on 100 MHz Pentium */
+{
+ ulint i, j;
+
+ j = 0;
+
+ for (i = 0; i < delay * 50; i++) {
+ j += i;
+ UT_RELAX_CPU();
+ }
+
+ if (ut_always_false) {
+ ut_always_false = (ibool) j;
+ }
+
+ return(j);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+ FILE* file, /*!< in: file where to print */
+ const void* buf, /*!< in: memory buffer */
+ ulint len) /*!< in: length of the buffer */
+{
+ const byte* data;
+ ulint i;
+
+ UNIV_MEM_ASSERT_RW(buf, len);
+
+ fprintf(file, " len %lu; hex ", len);
+
+ for (data = (const byte*)buf, i = 0; i < len; i++) {
+ fprintf(file, "%02lx", (ulong)*data++);
+ }
+
+ fputs("; asc ", file);
+
+ data = (const byte*)buf;
+
+ for (i = 0; i < len; i++) {
+ int c = (int) *data++;
+ putc(isprint(c) ? c : ' ', file);
+ }
+
+ putc(';', file);
+}
+
+/*************************************************************//**
+Quickly calculates the number rounded up to the nearest power of 2.
+@return first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+ ulint n) /*!< in: number != 0 */
+{
+ ulint res;
+
+ res = 1;
+
+ ut_ad(n > 0);
+
+ while (res < n) {
+ res = res * 2;
+ }
+
+ return(res);
+}
+
+/**********************************************************************//**
+Outputs a NUL-terminated file name, quoted with apostrophes. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+ FILE* f, /*!< in: output stream */
+ const char* name) /*!< in: name to print */
+{
+ putc('\'', f);
+ for (;;) {
+ int c = *name++;
+ switch (c) {
+ case 0:
+ goto done;
+ case '\'':
+ putc(c, f);
+ /* fall through */
+ default:
+ putc(c, f);
+ }
+ }
+done:
+ putc('\'', f);
+}
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Outputs a NUL-terminated string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+ FILE* f, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name) /*!< in: name to print */
+{
+ ut_print_namel(f, trx, table_id, name, strlen(name));
+}
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+ FILE* f, /*!< in: output stream */
+ trx_t* trx, /*!< in: transaction (NULL=no quotes) */
+ ibool table_id,/*!< in: TRUE=print a table name,
+ FALSE=print other identifier */
+ const char* name, /*!< in: name to print */
+ ulint namelen)/*!< in: length of name */
+{
+ /* 2 * NAME_LEN for database and table name,
+ and some slack for the #mysql50# prefix and quotes */
+ char buf[3 * NAME_LEN];
+ const char* bufend;
+
+ bufend = innobase_convert_name(buf, sizeof buf,
+ name, namelen,
+ trx ? trx->mysql_thd : NULL,
+ table_id);
+
+ fwrite(buf, 1, bufend - buf, f);
+}
+
+/**********************************************************************//**
+Catenates files: appends the contents of the src file to dest. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+ FILE* dest, /*!< in: output file */
+ FILE* src) /*!< in: input file to be appended to output */
+{
+ long len = ftell(src);
+ char buf[4096];
+
+ rewind(src);
+ do {
+ size_t maxs = len < (long) sizeof buf
+ ? (size_t) len
+ : sizeof buf;
+ size_t size = fread(buf, 1, maxs, src);
+ fwrite(buf, 1, size, dest);
+ len -= (long) size;
+ if (size < maxs) {
+ break;
+ }
+ } while (len > 0);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+# include <stdarg.h>
+/**********************************************************************//**
+A substitute for snprintf(3), formatted output conversion into
+a limited buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+ char* str, /*!< out: string */
+ size_t size, /*!< in: str size */
+ const char* fmt, /*!< in: format */
+ ...) /*!< in: format values */
+{
+ int res;
+ va_list ap1;
+ va_list ap2;
+
+ va_start(ap1, fmt);
+ va_start(ap2, fmt);
+
+ res = _vscprintf(fmt, ap1);
+ ut_a(res != -1);
+
+ if (size > 0) {
+ _vsnprintf(str, size, fmt, ap2);
+
+ if ((size_t) res >= size) {
+ str[size - 1] = '\0';
+ }
+ }
+
+ va_end(ap1);
+ va_end(ap2);
+
+ return(res);
+}
+#endif /* __WIN__ */
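A minimal caller-side sketch of how the timing helpers in ut0ut.c are meant to be combined; this is an editorial illustration only, not part of the patch, and report_elapsed() is a hypothetical name (it assumes the usual InnoDB build environment, where ut0ut.h pulls in the rest of the headers it needs):

/* Editorial sketch, not part of the patch. report_elapsed() is a
hypothetical caller; the InnoDB headers are assumed to be on the
include path. */
#include "ut0ut.h"

static void
report_elapsed(FILE* log)
{
	ulint	start_ms = ut_time_ms();	/* wrapping ms counter */
	ulint	sec;
	ulint	us;

	/* ... work whose duration should be logged ... */

	if (ut_usectime(&sec, &us) == 0) {
		ut_print_timestamp(log);
		fprintf(log, " InnoDB: wall clock %lu.%06lu,"
			" elapsed %lu ms\n",
			(ulong) sec, (ulong) us,
			(ulong) (ut_time_ms() - start_ms));
	}
}

ut_usectime() retries gettimeofday() up to ten times before giving up, so its 0/-1 return value is worth checking; ut_time_ms() may wrap and is only meant for heuristic use such as the elapsed-time estimate above.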
diff --git a/storage/xtradb/ut/ut0vec.c b/storage/xtradb/ut/ut0vec.c
new file mode 100644
index 00000000000..45f2bc9771f
--- /dev/null
+++ b/storage/xtradb/ut/ut0vec.c
@@ -0,0 +1,79 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0vec.c
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0vec.h"
+#ifdef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+#include <string.h>
+
+/****************************************************************//**
+Create a new vector with the given initial size.
+@return vector */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+ mem_heap_t* heap, /*!< in: heap */
+ ulint size) /*!< in: initial size */
+{
+ ib_vector_t* vec;
+
+ ut_a(size > 0);
+
+ vec = mem_heap_alloc(heap, sizeof(*vec));
+
+ vec->heap = heap;
+ vec->data = mem_heap_alloc(heap, sizeof(void*) * size);
+ vec->used = 0;
+ vec->total = size;
+
+ return(vec);
+}
+
+/****************************************************************//**
+Push a new element to the vector, increasing its size if necessary. */
+UNIV_INTERN
+void
+ib_vector_push(
+/*===========*/
+ ib_vector_t* vec, /*!< in: vector */
+ void* elem) /*!< in: data element */
+{
+ if (vec->used >= vec->total) {
+ void** new_data;
+ ulint new_total = vec->total * 2;
+
+ new_data = mem_heap_alloc(vec->heap,
+ sizeof(void*) * new_total);
+ memcpy(new_data, vec->data, sizeof(void*) * vec->total);
+
+ vec->data = new_data;
+ vec->total = new_total;
+ }
+
+ vec->data[vec->used] = elem;
+ vec->used++;
+}
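Because ib_vector_t takes all of its memory from the mem_heap_t passed to ib_vector_create() and only ever grows, there is no matching destroy call: freeing the heap releases the vector struct and every data array it ever allocated. A minimal usage sketch, editorial only and not part of the patch (collect_ids() and the heap size are illustrative):

/* Editorial sketch, not part of the patch. collect_ids() is a
hypothetical caller; mem0mem.h provides the heap API used here. */
#include "mem0mem.h"
#include "ut0vec.h"

static void
collect_ids(void)
{
	mem_heap_t*	heap = mem_heap_create(1024);	/* size is arbitrary */
	ib_vector_t*	vec = ib_vector_create(heap, 4);
	ulint		i;

	for (i = 1; i <= 100; i++) {
		/* ib_vector_push() doubles vec->total on the heap once
		vec->used catches up with it, copying the old pointers. */
		ib_vector_push(vec, (void*) i);
	}

	/* ... read the elements back with the accessors declared in
	ut0vec.h ... */

	mem_heap_free(heap);	/* frees the vector and its arrays */
}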
diff --git a/storage/xtradb/ut/ut0wqueue.c b/storage/xtradb/ut/ut0wqueue.c
new file mode 100644
index 00000000000..5220d1e17f4
--- /dev/null
+++ b/storage/xtradb/ut/ut0wqueue.c
@@ -0,0 +1,118 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+#include "ut0wqueue.h"
+
+/*******************************************************************//**
+@file ut/ut0wqueue.c
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Create a new work queue.
+@return work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void)
+/*===================*/
+{
+ ib_wqueue_t* wq = mem_alloc(sizeof(ib_wqueue_t));
+
+ mutex_create(&wq->mutex, SYNC_WORK_QUEUE);
+
+ wq->items = ib_list_create();
+ wq->event = os_event_create(NULL);
+
+ return(wq);
+}
+
+/****************************************************************//**
+Free a work queue. */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ut_a(!ib_list_get_first(wq->items));
+
+ mutex_free(&wq->mutex);
+ ib_list_free(wq->items);
+ os_event_free(wq->event);
+
+ mem_free(wq);
+}
+
+/****************************************************************//**
+Add a work item to the queue. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+ ib_wqueue_t* wq, /*!< in: work queue */
+ void* item, /*!< in: work item */
+ mem_heap_t* heap) /*!< in: memory heap to use for allocating the
+ list node */
+{
+ mutex_enter(&wq->mutex);
+
+ ib_list_add_last(wq->items, item, heap);
+ os_event_set(wq->event);
+
+ mutex_exit(&wq->mutex);
+}
+
+/****************************************************************//**
+Wait for a work item to appear in the queue.
+@return work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+ ib_wqueue_t* wq) /*!< in: work queue */
+{
+ ib_list_node_t* node;
+
+ for (;;) {
+ os_event_wait(wq->event);
+
+ mutex_enter(&wq->mutex);
+
+ node = ib_list_get_first(wq->items);
+
+ if (node) {
+ ib_list_remove(wq->items, node);
+
+ if (!ib_list_get_first(wq->items)) {
+ /* We must reset the event when the list
+ gets emptied. */
+ os_event_reset(wq->event);
+ }
+
+ break;
+ }
+
+ mutex_exit(&wq->mutex);
+ }
+
+ mutex_exit(&wq->mutex);
+
+ return(node->data);
+}
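ib_wqueue_add() and ib_wqueue_wait() form a simple producer/consumer channel: producers append under the mutex and set the event, while a consumer that drains the list resets the event before leaving. A minimal sketch of the intended usage, editorial only and not part of the patch (submit_job() and worker_loop() are hypothetical names):

/* Editorial sketch, not part of the patch. submit_job() and
worker_loop() are hypothetical; the queue is created once at startup. */
#include "mem0mem.h"
#include "ut0wqueue.h"

static ib_wqueue_t*	work_queue;	/* = ib_wqueue_create() at startup */

/* Producer: enqueue one job. The list node for the item is
allocated from the caller-supplied heap. */
static void
submit_job(void* job, mem_heap_t* heap)
{
	ib_wqueue_add(work_queue, job, heap);
}

/* Consumer: block until a job arrives. ib_wqueue_wait() returns the
item already unlinked from the list, and it resets the event when the
list becomes empty, so several workers can share one queue. */
static void
worker_loop(void)
{
	for (;;) {
		void*	job = ib_wqueue_wait(work_queue);

		/* ... process 'job' ... */
	}
}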